In [101]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [3]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [7]:
pd.set_option('display.max_columns', None)

In [8]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


### Data Cleaning and Preprocessing

In [15]:
# Replace empty strings with 0
df['TotalCharges'] = df['TotalCharges'].replace(' ', 0)

In [19]:
# Convert the column to float
df['TotalCharges'] = df['TotalCharges'].astype(float)

In [23]:
# Map the target column to binary
df['Churn'] = df['Churn'].map({
    'Yes': 1,
    'No': 0
})

In [28]:
# Split the data into features and target
X = df.drop(columns=['customerID', 'Churn'])
y = df['Churn']

In [29]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [30]:
# Split into numeric and categorical columns
cat_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [36]:
# Normalise the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[num_cols])
X_test_scaled = scaler.transform(X_test[num_cols])

# Convert to dataframe
train_num_df = pd.DataFrame(X_train_scaled, columns=X_train[num_cols].columns)
test_num_df = pd.DataFrame(X_test_scaled, columns=X_test[num_cols].columns)

In [46]:
# Initialise the encoder
one_enc = OneHotEncoder(sparse_output=False)

# Encode the categorical features
X_train_enc = one_enc.fit_transform(X_train[cat_cols])
X_test_enc = one_enc.transform(X_test[cat_cols])

# Convert to dataframe
train_cat_df = pd.DataFrame(X_train_enc, columns=one_enc.get_feature_names_out(cat_cols))
test_cat_df = pd.DataFrame(X_test_enc, columns=one_enc.get_feature_names_out(cat_cols))

In [53]:
# Concatenate the numerical and categorical features
train = pd.concat([train_num_df, train_cat_df], axis=1)
test = pd.concat([test_num_df, test_cat_df], axis=1)

### Model Selection and Evaluation

In [80]:
# Function to fit the model
def fit_model(model):
  model = model(random_state=1)
  return model.fit(train, y_train)

In [98]:
# Fit the various models
rfc = fit_model(RandomForestClassifier)
dtc = fit_model(ExtraTreesClassifier)
xgb = fit_model(XGBClassifier)
lgbm = fit_model(LGBMClassifier)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009639 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [99]:
# Obtain the accuracy of each model
scores = {}

for model in [rfc, dtc, xgb, lgbm]:
  y_pred = model.predict(test)
  scores[type(model).__name__] = accuracy_score(y_test, y_pred)

In [100]:
scores

{'RandomForestClassifier': 0.7913413768630234,
 'ExtraTreesClassifier': 0.7672107877927609,
 'XGBClassifier': 0.7934705464868701,
 'LGBMClassifier': 0.8034066713981547}

In [103]:
# Parameters for tuning
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]
hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

In [109]:
# Use RandomizedSearchCV to obtain best parameters
etc = ExtraTreesClassifier()
rcv = RandomizedSearchCV(estimator=etc, param_distributions=hyperparameter_grid, cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state = 1)

In [112]:
# Fit the model with best params to train data
model = rcv.fit(train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [128]:
model.best_params_

{'n_estimators': 50,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'log2'}

In [131]:
# Use the best model for prediction
best_etc = ExtraTreesClassifier(**model.best_params_, random_state=1)
best_etc.fit(train, y_train)
y_pred = best_etc.predict(test)

In [132]:
accuracy_score(y_test, y_pred)

0.8019872249822569

In [140]:
# Obtain the two most important features
pd.Series(best_etc.feature_importances_, index=train.columns).sort_values(ascending=False).head(2)

Contract_Month-to-month    0.127887
tenure                     0.085185
dtype: float64