# Tag-Along Project 3
# Telco Customer Churn

In [3]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')


# Data processing and analysis
import numpy as np
import pandas as pd
import math
import re


# Data visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go



# Configure visualisations
%matplotlib inline
plt.style.use('fivethirtyeight')
sns.set(context="notebook", palette="dark", style = 'whitegrid' , color_codes=True)


# Classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
import xgboost as xgb

In [4]:
# Location of the Telco customer-churn export; it is parsed as CSV by the
# pd.read_csv call in the following cell.
# NOTE(review): absolute '/content/...' path — presumably a Colab upload
# location; confirm before running in another environment.
path = '/content/WA_Fn-UseC_-Telco-Customer-Churn.csv.xls'


In [35]:
# Load the dataset from `path` into a DataFrame and preview its first rows.
df = pd.read_csv(path)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [36]:
# Summarise the columns: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [37]:
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [38]:
# List the dtype of every column (TotalCharges is reported as object).
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

## Preprocessing

## Perform initial data preparation by converting the 'TotalCharges' column to numeric values and filling missing values with 0.

In [39]:
# TotalCharges is expected to be numeric but has dtype object; inspect the raw values.
df["TotalCharges"].values

array(['29.85', '1889.5', '108.15', ..., '346.45', '306.6', '6844.5'],
      dtype=object)

In [40]:
# Convert TotalCharges to numeric; entries that cannot be parsed become NaN.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

In [41]:
# Count the TotalCharges entries that are missing after the conversion.
df["TotalCharges"].isna().sum()



11

In [42]:
# Show the rows whose TotalCharges value is missing.
df.loc[df["TotalCharges"].isna()]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3826,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
4380,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5218,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
6670,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,...,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [43]:
# Replace the missing TotalCharges values with 0.
df["TotalCharges"] = df["TotalCharges"].fillna(0)

In [44]:
# Confirm that no column has missing values left.
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

## Convert the 'Churn' column to binary values, where 'No' is mapped to 0 and 'Yes' is mapped to 1.



In [45]:
# Class balance of the target before it is encoded.
df["Churn"].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [47]:
# Encode the target: 'No' -> 0 and 'Yes' -> 1.
# The explicit integer cast keeps the column numeric even when `replace`
# returns an object-dtype result (its implicit downcasting is deprecated in
# recent pandas releases). Re-running the cell is still safe: values that are
# already 0/1 pass through `replace` unchanged.
df['Churn'] = df['Churn'].replace({'Yes': 1, 'No': 0}).astype('int64')
df.Churn.value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

## Feature engineering:

### Scale the numerical features using StandardScaler, then convert the output back to a DataFrame and restore the column names.




In [50]:
# Continuous columns that will be standardised.
numerical = ["tenure", "MonthlyCharges", "TotalCharges"]

In [60]:
# Standardise the numerical columns with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[numerical])

# Carry over df's index so the scaled frame stays row-aligned with df (and with
# the target taken from df) even if df does not have a default 0..n-1 index.
scaled_df = pd.DataFrame(scaled_data, columns=numerical, index=df.index)
scaled_df.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.277445,-1.160323,-0.992611
1,0.066327,-0.259629,-0.172165
2,-1.236724,-0.36266,-0.958066
3,0.514251,-0.746535,-0.193672
4,-1.236724,0.197365,-0.938874


### One-hot encode the categorical features using OneHotEncoder (with `sparse_output=False`), then convert the output back to a DataFrame and restore the column names.


In [49]:
# Categorical columns that will be one-hot encoded.
categorical = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
]

In [62]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns; sparse_output=False makes the encoder
# return a dense array that can be wrapped in a DataFrame directly.
encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder on the categorical features and transform them in one step.
encoded_data = encoder.fit_transform(df[categorical])

# Column names produced by the encoder for the encoded features.
encoded_columns = encoder.get_feature_names_out(categorical)

# Carry over df's index so the encoded frame stays row-aligned with df (and
# with the target taken from df) even if df does not have a default index.
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns, index=df.index)
encoded_df

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7039,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7040,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7041,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


 ## Combine scaled numerical and one-hot encoded categorical features into train and test set dataframes (use pd.concat)



In [63]:
# Place the scaled numerical columns and the one-hot columns side by side.
combined_df = pd.concat(objs=[scaled_df, encoded_df], axis="columns")
combined_df.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.277445,-1.160323,-0.992611,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.066327,-0.259629,-0.172165,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-1.236724,-0.36266,-0.958066,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.514251,-0.746535,-0.193672,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,-1.236724,0.197365,-0.938874,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Split the df into an 80-20 train-test split with a random state of “1”.


In [64]:
# Feature matrix (all engineered columns) and target vector (encoded Churn).
X, y = combined_df, df["Churn"]

In [65]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)



## Use scikit-learn to train a random forest and an extra trees classifier, and use xgboost and lightgbm to train an extreme gradient boosting model and a light gradient boosting model. Use random_state = 1 for training all models and evaluate on the test set. The answers start at question 14.


In [None]:
# Use scikit learn to train a random forest and extra trees classifier, and use xgboost and lightgbm to train an extreme boosting model and a light gradient boosting model. Use random_state = 1 for training all models and evaluate on the test set.


In [67]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb

# Baseline random forest with default hyperparameters and a fixed seed.
rf = RandomForestClassifier(random_state=1)
rf.fit(X=X_train, y=y_train)



In [68]:

# Baseline extra-trees model with default hyperparameters and a fixed seed.
et = ExtraTreesClassifier(random_state=1)
et.fit(X=X_train, y=y_train)



In [69]:
# Baseline XGBoost classifier with default hyperparameters and a fixed seed.
xgb_model = xgb.XGBClassifier(random_state=1)
xgb_model.fit(X=X_train, y=y_train)


In [70]:

# Baseline LightGBM classifier with default hyperparameters and a fixed seed.
lgb_model = lgb.LGBMClassifier(random_state=1)
lgb_model.fit(X=X_train, y=y_train)


[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [71]:
# Score each fitted model on the held-out test set, in the order
# random forest, extra trees, XGBoost, LightGBM.
rf_score, et_score, xgb_score, lgb_score = (
    fitted.score(X_test, y_test) for fitted in (rf, et, xgb_model, lgb_model)
)


In [82]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the baseline extra-trees model.
et_accuracy = accuracy_score(y_true=y_test, y_pred=et.predict(X_test))
et_accuracy



0.7700496806245565

## 14. What is the accuracy on the test set using the random forest classifier?


In [75]:

# Test-set accuracy of the random forest model.
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf.predict(X_test))
rf_accuracy

0.7906316536550745


## 15. What is the accuracy on the test set using the xgboost classifier?


In [77]:
# Test-set accuracy of the XGBoost model.
xgb_accuracy = accuracy_score(y_true=y_test, y_pred=xgb_model.predict(X_test))
xgb_accuracy

0.7934705464868701

## 16. What is the accuracy on the test set using the LGBM classifier?


In [79]:
# Test-set accuracy of the LightGBM model.
lgb_accuracy = accuracy_score(y_true=y_test, y_pred=lgb_model.predict(X_test))
lgb_accuracy

0.8133427963094393

## 17. To improve the Extra Trees Classifier,
 you will use the following parameters (number of estimators, minimum number of samples, minimum number of samples for leaf node and the number of features to consider when looking for the best split) for the hyperparameter grid needed to run a Randomized Cross Validation Search (RandomizedSearchCV).

n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}

Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

In [80]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyperparameter, as given in the task statement.
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
# NOTE(review): 'auto' is kept because the task prescribes it, but newer
# scikit-learn releases may reject it for max_features — confirm against the
# installed version.
max_features = ['auto', 'sqrt', 'log2', None]

# Search space handed to RandomizedSearchCV.
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

# Seed the estimator as well: an unseeded ExtraTreesClassifier makes every
# cross-validation fit random, so the selected hyperparameters could differ
# from one run to the next.
extra_trees = ExtraTreesClassifier(random_state=1)


In [81]:
# Randomised search over the grid: 10 sampled candidates, 5-fold
# cross-validation, accuracy as the selection metric, all cores in use.
randomized_search = RandomizedSearchCV(
    extra_trees,
    hyperparameter_grid,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=1,
)

# Run the search on the training split.
randomized_search.fit(X=X_train, y=y_train)

# Report the winning hyperparameter combination.
best_params = randomized_search.best_params_
print("Best hyperparameters:", best_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters: {'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}


## 18. Train a new ExtraTreesClassifier Model
with the new Hyperparameters from the RandomizedSearchCV (with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?


In [85]:
# Reuse the hyperparameters selected by the randomised search instead of a
# hand-copied dictionary, so this cell cannot drift from the search result.
best_params = dict(randomized_search.best_params_)

# Build the tuned ExtraTreesClassifier with a fixed seed.
optimal_extra_trees = ExtraTreesClassifier(**best_params, random_state=1)

# Fit on the training split.
optimal_extra_trees.fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred_optimal = optimal_extra_trees.predict(X_test)



In [86]:
# Test-set accuracy of the tuned extra-trees model.
accuracy_optimal = accuracy_score(y_true=y_test, y_pred=y_pred_optimal)
accuracy_optimal

0.8041163946061036

The untuned ExtraTreesClassifier reached a test accuracy of 0.770 (`et_accuracy`); after hyperparameter tuning the accuracy is **higher**: 0.804.

## 19. What other hyperparameters can be tuned for ExtraTreesClassifier?


In [88]:
# List every hyperparameter of the baseline extra-trees model with its current value.
et.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

Of the options given in the task, **max_leaf_nodes** and **min_weight_fraction_leaf** can also be tuned.


## 20. Find the feature importances using the optimal ExtraTreesClassifier model. Which two features are the most important, in order?

In [89]:
# Per-feature importance scores taken from the tuned extra-trees model.
feature_importances = optimal_extra_trees.feature_importances_
feature_importances

array([0.09280047, 0.01492623, 0.04771397, 0.01065307, 0.01033262,
       0.00686424, 0.00714407, 0.00855604, 0.00896891, 0.00504508,
       0.00501079, 0.00317177, 0.00310987, 0.00829134, 0.00310918,
       0.00828334, 0.03268689, 0.06528743, 0.00681595, 0.07499757,
       0.00793063, 0.02169984, 0.03007741, 0.00899164, 0.01259782,
       0.01685725, 0.00618396, 0.00745443, 0.0641409 , 0.00683073,
       0.01455862, 0.00817886, 0.00743675, 0.00828469, 0.00887974,
       0.0056501 , 0.00929173, 0.15223652, 0.0285427 , 0.05442309,
       0.01188317, 0.01177935, 0.00698283, 0.00860032, 0.05166596,
       0.00507212])

In [91]:
# Pair every training column with its importance score and order the
# (feature_name, importance) pairs from most to least important.
feature_importance_list = sorted(
    zip(X_train.columns, feature_importances),
    key=lambda pair: pair[1],
    reverse=True,
)
feature_importance_list

[('Contract_Month-to-month', 0.1522365223825757),
 ('tenure', 0.09280046728070009),
 ('OnlineSecurity_No', 0.07499756668925914),
 ('InternetService_Fiber optic', 0.06528743070518972),
 ('TechSupport_No', 0.06414089705963495),
 ('Contract_Two year', 0.05442309306017465),
 ('PaymentMethod_Electronic check', 0.051665955261978744),
 ('TotalCharges', 0.047713974468078756),
 ('InternetService_DSL', 0.03268688697354864),
 ('OnlineBackup_No', 0.030077412045788524),
 ('Contract_One year', 0.028542700191358017),
 ('OnlineSecurity_Yes', 0.021699839701421534),
 ('DeviceProtection_No', 0.01685724934907098),
 ('MonthlyCharges', 0.01492623419583065),
 ('TechSupport_Yes', 0.0145586189112325),
 ('OnlineBackup_Yes', 0.012597816332646977),
 ('PaperlessBilling_No', 0.011883172396864736),
 ('PaperlessBilling_Yes', 0.01177934523732429),
 ('gender_Female', 0.010653066633429322),
 ('gender_Male', 0.010332617797728823),
 ('StreamingMovies_Yes', 0.009291725680038769),
 ('OnlineBackup_No internet service', 0.008

The most important features are **Contract_Month-to-month** and **tenure**.