### 1.3 - Reading and exploring the .csv dataset using pandas.read_csv()


In [39]:
# Compared to other solution:
# This file uses the pandas.get_dummies() function for "One Hot Encoding" as well having a different feauture selection.
# Also an example of scaling numerical columns

import pandas as pd
import numpy as np

# Reading the csv dataset into a pandas dataframe

customer_data = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

# customer_data is now a pandas Dataframe object

In [40]:
customer_data.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [41]:
# Inspection - Printing all column names in dataset

list(customer_data.columns)

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

In [42]:
# Inspection - How many rows, columns?
customer_data.shape

(7043, 21)

In [43]:
# Inspection - Do we have imbalanced classes?

customer_data["Churn"].value_counts()

# Yes, a bit. 

No     5174
Yes    1869
Name: Churn, dtype: int64

### 1.4 - Dropping columns that might be uncessary.


In [44]:
# removing columns (commented out)
# customer_data = customer_data.drop(['customerID', 'Gender'], axis=1, inplace=True)

# Here you can make your own selection of feature data if you want!
# Maybe it will positively affect the accuracy!
# But remember we need at least the target column, "Churn" to remain.


# We can also "remove" columns by select a subset of the columns instead.

customer_data = customer_data[["SeniorCitizen", "Dependents", 'OnlineSecurity',"tenure", 
                               "InternetService", "Contract","PaymentMethod", "TechSupport", 
                               'PaperlessBilling',"MonthlyCharges", "Churn"]]


In [45]:
customer_data.shape

(7043, 11)

### 1.5 - Removing missing values


In [46]:
# Remove rows with missing values  (NaN)

customer_data = customer_data.dropna()

In [47]:
# Lets check if we removed any rows

customer_data.shape

# No, same amount of rows as before!

(7043, 11)

In [48]:
# We didnt remove any rows, so it probably means there are no missing values.
# Lets double check by counting missing values using isnull().sum()

customer_data.isnull().sum()

# Again, no

SeniorCitizen       0
Dependents          0
OnlineSecurity      0
tenure              0
InternetService     0
Contract            0
PaymentMethod       0
TechSupport         0
PaperlessBilling    0
MonthlyCharges      0
Churn               0
dtype: int64

### 1.6 - Convert data to a format usable for scikit-learn

In [49]:
# Inspection - 
# Display dataframe information like column datatypes

# We can see that we have 5 string (a.k.a object or category) columns.

# We can also see that TotalCharges are strings for some reason.

# We can also see that SeniorCitizen are int type, even though we would typically think of this as categorical.

customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SeniorCitizen     7043 non-null   int64  
 1   Dependents        7043 non-null   object 
 2   OnlineSecurity    7043 non-null   object 
 3   tenure            7043 non-null   int64  
 4   InternetService   7043 non-null   object 
 5   Contract          7043 non-null   object 
 6   PaymentMethod     7043 non-null   object 
 7   TechSupport       7043 non-null   object 
 8   PaperlessBilling  7043 non-null   object 
 9   MonthlyCharges    7043 non-null   float64
 10  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 605.4+ KB


In [50]:
# Inspection - Printing category ((a.ka string/ or objects)) columns only. 

object_types = customer_data.select_dtypes(include=[np.object])
object_types.head(5)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  object_types = customer_data.select_dtypes(include=[np.object])


Unnamed: 0,Dependents,OnlineSecurity,InternetService,Contract,PaymentMethod,TechSupport,PaperlessBilling,Churn
0,No,No,DSL,Month-to-month,Electronic check,No,Yes,No
1,No,Yes,DSL,One year,Mailed check,No,No,No
2,No,Yes,DSL,Month-to-month,Mailed check,No,Yes,Yes
3,No,Yes,DSL,One year,Bank transfer (automatic),Yes,No,No
4,No,No,Fiber optic,Month-to-month,Electronic check,No,Yes,Yes


In [51]:
# Let's use the OneHotEncoding strategy to change categories to integers.
# we can accomplish this with pd.get_dummies()

customer_data = pd.get_dummies(data=customer_data, columns=["SeniorCitizen", "InternetService", 'OnlineSecurity',"Dependents", 
                                                            "TechSupport", "Contract","PaymentMethod", "PaperlessBilling"])
customer_data.head(5)

# Looks good below! Column Churn is still string, but this is fine since it is the target of our model.
# The Target won't be used to fit the model

Unnamed: 0,tenure,MonthlyCharges,Churn,SeniorCitizen_0,SeniorCitizen_1,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,...,TechSupport_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,PaperlessBilling_No,PaperlessBilling_Yes
0,1,29.85,No,1,0,1,0,0,1,0,...,0,1,0,0,0,0,1,0,0,1
1,34,56.95,No,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
2,2,53.85,Yes,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
3,45,42.3,No,1,0,1,0,0,0,0,...,1,0,1,0,1,0,0,0,1,0
4,2,70.7,Yes,1,0,0,1,0,1,0,...,0,1,0,0,0,0,1,0,0,1


In [52]:
# We might want to scale continious numerical columns

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
customer_data[["tenure", "MonthlyCharges"]] = scaler.fit_transform(customer_data[["tenure", "MonthlyCharges"]])

### 1.7 - Run a kNN algorithm on the data


In [53]:
import sklearn as sk
from sklearn.model_selection import train_test_split

# Seperating data to features and target.

# Target is a single column here: Churn
target = customer_data["Churn"] 

# Features columns we can retrieve by removing the target from the dataset.
features = customer_data.drop("Churn", axis=1)

# Splitting the dataset into training and test data.
f_train, f_test, t_train, t_test = train_test_split(features, target, random_state=1)

features.head()

Unnamed: 0,tenure,MonthlyCharges,SeniorCitizen_0,SeniorCitizen_1,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,...,TechSupport_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,PaperlessBilling_No,PaperlessBilling_Yes
0,0.013889,0.115423,1,0,1,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,1
1,0.472222,0.385075,1,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,1,0
2,0.027778,0.354229,1,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,1,0,1
3,0.625,0.239303,1,0,1,0,0,0,0,1,...,1,0,1,0,1,0,0,0,1,0
4,0.027778,0.521891,1,0,0,1,0,1,0,0,...,0,1,0,0,0,0,1,0,0,1


In [54]:
from sklearn.neighbors import KNeighborsClassifier

# Training the data on the KNN classifier. 
# Remember to try to tweak the n_neighbors parameter for maybe a better accuracy

knn = KNeighborsClassifier(n_neighbors=14)
knn.fit(f_train, t_train)

KNeighborsClassifier(n_neighbors=14)

### 1.8 - Find the performance of your model in terms of accuracy


In [55]:
# Accuracy Score
knn.score(f_test, t_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.8188529244747302

In [56]:
# Optional Confusion Matrix for the interested

from sklearn.metrics import confusion_matrix

t_pred = knn.predict(f_test)
c_matrix = confusion_matrix(t_test, t_pred)
c_matrix


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


array([[1219,  108],
       [ 211,  223]], dtype=int64)