Use the teleco-customer-churn dataset for the following:

1. Apply all the classification algorithms (KNN, Logisitc Regression, Naive Bayes, Decision Trees, SVM) on this dataset and print the accuracies.


2. Find out the different tunable parameters for each algorithms mentioned above.


3. Apply gridsearchCV and randomizedsearchCV for all the above classification algorithms and get the best parameters.

<h2>PreProcessing</h2>

In [1]:
import pandas as pd

data = pd.read_csv('Telco-Customer-Churn.csv')

In [2]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
data = data.drop('customerID', axis=1)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [5]:
data = data[data['TotalCharges'] != ' ']

In [6]:
# Convert TotalCharges to 'float64' datatype
data['TotalCharges'] = data['TotalCharges'].astype('float64')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   object 
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   object 
 3   Dependents        7032 non-null   object 
 4   tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Contract          7032 non-null   object 
 15  PaperlessBilling  7032 non-null   object 
 16  PaymentMethod     7032 non-null   object 


In [8]:
data.to_csv('cleaned_teleco_customer_churn.csv')

In [9]:
# 1-hot encode object Datatypes
obj_columns = data.select_dtypes(exclude=['int64', 'float64']).columns
codes = pd.get_dummies(data[obj_columns], drop_first=True, dtype='int64')
encoded_data = data.join(codes)
encoded_data = encoded_data.drop(obj_columns, axis=1)

x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

encoded_x = encoded_data.iloc[:, :-1].values
encoded_y = encoded_data.iloc[:, -1].values

In [10]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

scaled_x = sc.fit_transform(encoded_x)

<h2>Classification Algorithms</h2>
<h3>1. kNN</h3>

In [11]:
from sklearn.model_selection import train_test_split
knn_x_train, knn_x_test, knn_y_train, knn_y_test = train_test_split(scaled_x, encoded_y, test_size=0.2, random_state=0)

In [12]:
#knn_data = data.select_dtypes(include=['int64', 'float64'])

from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(knn_x_train, knn_y_train)

knn_y_pred = knn_model.predict(knn_x_test)

In [13]:
# Accuracy
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

accuracy_score(knn_y_test, knn_y_pred)

0.7405828002842928

In [14]:
confusion_matrix(knn_y_test, knn_y_pred)

# TP FP
# FN TN

array([[859, 179],
       [186, 183]])

In [15]:
print(classification_report(knn_y_test, knn_y_pred))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82      1038
           1       0.51      0.50      0.50       369

    accuracy                           0.74      1407
   macro avg       0.66      0.66      0.66      1407
weighted avg       0.74      0.74      0.74      1407



<h3>2. Logistic Regression</h3>

In [16]:
from sklearn.model_selection import train_test_split
logreg_x_train, logreg_x_test, logreg_y_train, logreg_y_test = train_test_split(scaled_x, encoded_y, test_size=0.25, random_state=0)

In [17]:
from sklearn.linear_model import LogisticRegression
logreg_model = LogisticRegression()

logreg_model.fit(logreg_x_train, logreg_y_train)

In [18]:
logreg_y_pred = logreg_model.predict(logreg_x_test)

In [19]:
accuracy_score(logreg_y_test, logreg_y_pred)

0.8128555176336746

In [20]:
confusion_matrix(logreg_y_test, logreg_y_pred)

array([[1181,  130],
       [ 199,  248]])

In [21]:
print(classification_report(logreg_y_test, logreg_y_pred))

              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1311
           1       0.66      0.55      0.60       447

    accuracy                           0.81      1758
   macro avg       0.76      0.73      0.74      1758
weighted avg       0.81      0.81      0.81      1758



<h3>3. Naive Bayes</h3>

Source: https://towardsdatascience.com/learning-by-implementing-gaussian-naive-bayes-3f0e3d2c01b2

If your features are 0 and 1 only, you could use a Bernoulli distribution. 

If they are integers, a Multinomial distribution. 

However, we have real feature values and decide for a Gaussian distribution, hence the name Gaussian naive Bayes.

Also, for Continuous NB Classification refer to: https://remykarem.github.io/blog/naive-bayes

In [22]:
nb_x_train, nb_x_test, nb_y_train, nb_y_test = train_test_split(scaled_x, encoded_y, test_size=0.25, random_state=0)

In [23]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(nb_x_train, nb_y_train)

In [24]:
nb_y_pred = nb_model.predict(nb_x_test)

In [25]:
accuracy_score(nb_y_test, nb_y_pred)

0.6712172923777019

In [26]:
confusion_matrix(nb_y_test, nb_y_pred)

array([[793, 518],
       [ 60, 387]])

In [27]:
print(classification_report(nb_y_test, nb_y_pred))

              precision    recall  f1-score   support

           0       0.93      0.60      0.73      1311
           1       0.43      0.87      0.57       447

    accuracy                           0.67      1758
   macro avg       0.68      0.74      0.65      1758
weighted avg       0.80      0.67      0.69      1758



<h3>4. Decision Trees</h3>

In [28]:
dtree_x_train, dtree_x_test, dtree_y_train, dtree_y_test = train_test_split(scaled_x, encoded_y, test_size=0.25, random_state=0)

In [29]:
from sklearn.tree import DecisionTreeClassifier
dtree_model = DecisionTreeClassifier(criterion='entropy') #or criterion = 'gini'
dtree_model.fit(dtree_x_train, dtree_y_train)

In [30]:
dtree_y_pred = dtree_model.predict(dtree_x_test)

In [31]:
accuracy_score(dtree_y_test, dtree_y_pred)

0.7468714448236633

In [32]:
confusion_matrix(dtree_y_test, dtree_y_pred)

array([[1084,  227],
       [ 218,  229]])

In [33]:
print(classification_report(dtree_y_test, dtree_y_pred))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83      1311
           1       0.50      0.51      0.51       447

    accuracy                           0.75      1758
   macro avg       0.67      0.67      0.67      1758
weighted avg       0.75      0.75      0.75      1758



<h3>5. SVM</h3>

In [34]:
from sklearn.svm import SVC

svc_x_train, svc_x_test, svc_y_train, svc_y_test = train_test_split(scaled_x, encoded_y, test_size=0.25, random_state=0)

In [35]:
svc_model = SVC()

svc_model.fit(svc_x_train, svc_y_train)

In [36]:
svc_y_pred = svc_model.predict(svc_x_test)

In [37]:
accuracy_score(svc_y_test, svc_y_pred)

0.8071672354948806

In [38]:
confusion_matrix(svc_y_test, svc_y_pred)

array([[1202,  109],
       [ 230,  217]])

In [39]:
print(classification_report(svc_y_test, svc_y_pred))

              precision    recall  f1-score   support

           0       0.84      0.92      0.88      1311
           1       0.67      0.49      0.56       447

    accuracy                           0.81      1758
   macro avg       0.75      0.70      0.72      1758
weighted avg       0.80      0.81      0.80      1758



<h2> Tunable Parameters </h2>


In [40]:
import timeit
import warnings
warnings.filterwarnings('ignore')

<h3>1. kNN</h3>

In [41]:
?KNeighborsClassifier

In [42]:
# ALL PARAMETERS

knn_parameters = {
    'n_neighbors': [1, 5, 10, 15, 20, 25, 30, 50, 100],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50, 100],
    'p': [1, 2, 3],
    'metric' : ['cityblock', 'euclidean','l1', 'l2', 'manhattan'],
    'n_jobs': [-1]
}

# Metric - 'cosine', 'nan_euclidean', & 'haversine' throw errors.

In [43]:
# TUNED PARAMETER SET

knn_parameters = {
    'n_neighbors': [27, 28, 29, 30],
    'weights': ['uniform'],
    'algorithm': ['auto', 'kd_tree', 'brute'],
    'leaf_size': [6],
    'p': [2, 3],
    'metric' : ['cityblock', 'l1', 'manhattan'],
    'n_jobs': [-1]
}

# Metric - 'cosine', 'nan_euclidean', & 'haversine' throw errors.

In [44]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
knn_grid = GridSearchCV(knn_model, knn_parameters, scoring='accuracy', cv=7, n_jobs=-1, verbose=1)
knn_randomized = RandomizedSearchCV(knn_model, knn_parameters, scoring='accuracy', cv=7, n_jobs=-1, verbose=1)

In [45]:
%%timeit -n 1 -r 1
# Source: https://docs.python.org/3/library/timeit.html#timeit-command-line-interface

# 'n' = how many times to execute ‘statement’
# 'r' = how many times to repeat the timer, and calculate average (default 5)

knn_grid.fit(scaled_x, encoded_y)

print()
print(knn_grid.best_params_)
print("Accuracy Score: ", knn_grid.best_score_)
print()

Fitting 7 folds for each of 72 candidates, totalling 504 fits

{'algorithm': 'kd_tree', 'leaf_size': 6, 'metric': 'cityblock', 'n_jobs': -1, 'n_neighbors': 28, 'p': 2, 'weights': 'uniform'}
Accuracy Score:  0.79422593917153

26.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [46]:
%%timeit -n 1 -r 1
# Source: https://docs.python.org/3/library/timeit.html#timeit-command-line-interface

# 'n' = how many times to execute ‘statement’
# 'r' = how many times to repeat the timer, and calculate average (default 5)

knn_randomized.fit(scaled_x, encoded_y)

print()
print(knn_randomized.best_params_)
print("Accuracy Score: ", knn_randomized.best_score_)
print()

Fitting 7 folds for each of 10 candidates, totalling 70 fits

{'weights': 'uniform', 'p': 2, 'n_neighbors': 28, 'n_jobs': -1, 'metric': 'cityblock', 'leaf_size': 6, 'algorithm': 'kd_tree'}
Accuracy Score:  0.79422593917153

3.92 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


<h3>2. Logistic Regression</h3>

In [47]:
?LogisticRegression

In [48]:
# ALL PARAMETERS

logreg_parameters = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C' : [1e-5, 1, 10, 100, 800, 900, 1000, 1100, 1500],
    'fit_intercept' : [True, False],
    'intercept_scaling': [True, False],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 800, 900, 1000, 1100, 1200],
    'multi_class': ['auto', 'ovr', 'multinomial'], 
    'n_jobs': [-1]
}

In [49]:
# TUNED PARAMETER SET

logreg_parameters = {
    'penalty': ['none'],
    'C' : [1e-175],
    'fit_intercept' : [True],
    'intercept_scaling': [True, False],
    'solver': ['lbfgs', 'liblinear'],
    'max_iter': [1000, 1050, 1100, 1500, 2000],
    'multi_class': ['auto', 'ovr', 'multinomial'], 
    'n_jobs': [-1]
}

In [50]:
logreg_randomized = RandomizedSearchCV(logreg_model, logreg_parameters, scoring='accuracy', cv=7, n_jobs=-1, verbose=1)
logreg_grid = GridSearchCV(logreg_model, logreg_parameters, scoring='accuracy', cv=7, n_jobs=-1, verbose=1)

In [51]:
%%timeit -n 1 -r 1

logreg_randomized.fit(scaled_x, encoded_y)

print()
print(logreg_randomized.best_params_)
print("Accuracy Score: ", logreg_randomized.best_score_)
print()

Fitting 7 folds for each of 10 candidates, totalling 70 fits

{'solver': 'lbfgs', 'penalty': 'none', 'n_jobs': -1, 'multi_class': 'auto', 'max_iter': 1100, 'intercept_scaling': False, 'fit_intercept': True, 'C': 1e-175}
Accuracy Score:  0.8043255549231646

899 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [52]:
%%timeit -n 1 -r 1

logreg_grid.fit(scaled_x, encoded_y)

print()
print(logreg_grid.best_params_)
print("Accuracy Score: ", logreg_grid.best_score_)
print()

Fitting 7 folds for each of 60 candidates, totalling 420 fits

{'C': 1e-175, 'fit_intercept': True, 'intercept_scaling': True, 'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': -1, 'penalty': 'none', 'solver': 'lbfgs'}
Accuracy Score:  0.8043255549231646

1.93 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [53]:
from sklearn.metrics import get_scorer_names
get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_

In [54]:
?GridSearchCV

<h3>3. Naive Bayes</h3>

In [55]:
?GaussianNB

In [56]:
nb_parameters = {'var_smoothing': [5, 6, 7, 8, 9, 10]}

In [57]:
nb_model = GaussianNB()

In [58]:
nb_randomized = RandomizedSearchCV(nb_model, nb_parameters, scoring='accuracy', cv=7, n_jobs=-1, verbose=1)
nb_grid = GridSearchCV(nb_model, nb_parameters, scoring='accuracy', cv=7, n_jobs=-1, verbose=1)

In [59]:
%%timeit -n 1 -r 1

nb_randomized.fit(scaled_x, encoded_y)

print()
print(nb_randomized.best_params_)
print("Accuracy Score: ", nb_randomized.best_score_)
print()

Fitting 7 folds for each of 6 candidates, totalling 42 fits

{'var_smoothing': 7}
Accuracy Score:  0.7929469046344827

183 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [60]:
%%timeit -n 1 -r 1

nb_grid.fit(scaled_x, encoded_y)

print()
print(nb_grid.best_params_)
print("Accuracy Score: ", nb_grid.best_score_)
print()

Fitting 7 folds for each of 6 candidates, totalling 42 fits

{'var_smoothing': 7}
Accuracy Score:  0.7929469046344827

187 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


<h3>4. Decision Trees</h3>

In [61]:
?DecisionTreeClassifier

In [62]:
# ALL PARAMETERS

dtree_parameters = {
    'criterion': ["gini", "entropy", "log_loss"],
    'splitter': ["best", "random"],
    'max_depth': [0, 10, 20, 50, 10, 1000, 100000],
    'max_features': [15, 10, 25, 50, 100, 1000, "sqrt", "log2"] # 'auto' is deprecated
}

In [63]:
# TUNED PARAMETER SET

dtree_parameters = {
    'criterion': ["gini", "entropy", "log_loss"],
    'splitter': ["best", "random"],
    'max_depth': [7, 8, 9],
    'max_features': [11, 12, 13, ] # 'auto' is deprecated
}

In [64]:
dtree_model = DecisionTreeClassifier() #or criterion = 'gini'

In [65]:
dtree_randomized = RandomizedSearchCV(dtree_model, dtree_parameters, scoring='accuracy', cv=7, n_jobs=-1, verbose=1)
dtree_grid = GridSearchCV(dtree_model, dtree_parameters, scoring='accuracy', cv=7, n_jobs=-1, verbose=1)

In [66]:
%%timeit -n 1 -r 1

dtree_randomized.fit(scaled_x, encoded_y)

print()
print(dtree_randomized.best_params_)
print("Accuracy Score: ", dtree_randomized.best_score_)
print()

Fitting 7 folds for each of 10 candidates, totalling 70 fits

{'splitter': 'best', 'max_features': 11, 'max_depth': 9, 'criterion': 'entropy'}
Accuracy Score:  0.7885402243194953

296 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [67]:
%%timeit -n 1 -r 1

dtree_grid.fit(scaled_x, encoded_y)

print()
print(dtree_grid.best_params_)
print("Accuracy Score: ", dtree_grid.best_score_)
print()

Fitting 7 folds for each of 54 candidates, totalling 378 fits

{'criterion': 'gini', 'max_depth': 7, 'max_features': 13, 'splitter': 'random'}
Accuracy Score:  0.7926620454925146

852 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


<h3>5. SVM</h3>

In [68]:
?SVC

In [69]:
# ALL PARAMETERS

svm_parameters = {
    'kernel': ['linear', 'rbf', 'sigmoid', 'precomputed'],
    'gamma': ['scale', 'auto'],
    'break_ties': [True, False]
}

In [70]:
# TUNED PARAMETER SET

svm_parameters = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'break_ties': [True, False]
}

In [71]:
'''
Instead of needlessly computing a lot of SVC fits with different 
degree parameter values, where that will be ignored (all the 
kernels but poly). I suggest splitting the runs for poly and the 
other kernels, you will save a lot of time.

Source:
https://stackoverflow.com/questions/72101295/python-gridsearchcv-taking-too-long-to-finish-running
'''

svm_parameters = {
    'kernel': ['poly'],
    'degree': [1, 2, 3],
    'gamma': ['scale', 'auto'],
    'break_ties': [True, False]
}

In [72]:
svm_model = SVC()

In [73]:
svm_randomized = RandomizedSearchCV(svm_model, svm_parameters, scoring='accuracy', cv=7, n_jobs=-1, verbose=1)
svm_grid = GridSearchCV(svm_model, svm_parameters, scoring='accuracy', cv=7, n_jobs=-1, verbose=1)

In [74]:
%%timeit -n 1 -r 1

svm_randomized.fit(scaled_x, encoded_y)

print()
print(svm_randomized.best_params_)
print("Accuracy Score: ", svm_randomized.best_score_)
print()

Fitting 7 folds for each of 10 candidates, totalling 70 fits

{'kernel': 'poly', 'gamma': 'auto', 'degree': 2, 'break_ties': False}
Accuracy Score:  0.8016220830961868

16.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [75]:
%%timeit -n 1 -r 1

svm_grid.fit(scaled_x, encoded_y)

print()
print(svm_grid.best_params_)
print("Accuracy Score: ", svm_grid.best_score_)
print()

Fitting 7 folds for each of 12 candidates, totalling 84 fits

{'break_ties': True, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}
Accuracy Score:  0.8016220830961868

19.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)






