In [1]:
import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

In [2]:
warnings.filterwarnings("ignore")

In [3]:
df=pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes



# Preprocessing:
Perform initial data preparation by converting the 'TotalCharges' column to numeric values and filling missing values with 0.
Convert the 'Churn' column to binary values, where 'No' is mapped to 0 and 'Yes' is mapped to 1.<br>
Split the data into an 80-20 train-test split with a random state of “1”.<br>
Select these features:  
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']<br>

In [4]:
df["TotalCharges"]=df["TotalCharges"].replace(" ", np.nan).astype(float)
df["Churn"] =df["Churn"].replace(['No',"Yes"],[0,1]).astype(int)

In [5]:
# Subsetting X and Y
X=df.drop(columns=["Churn","customerID"])
y=df["Churn"]

X_numeric =X[['tenure', 'MonthlyCharges', 'TotalCharges']]
X_categorical=X.drop(columns=['tenure', 'MonthlyCharges', 'TotalCharges'])

In [6]:
df=df.fillna(0)

In [7]:
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
               'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 
               'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod'] 
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=1)


In [9]:
print(X_train.shape)
print(X_test.shape)

(5634, 19)
(1409, 19)


# Feature engineering:
The numerical features should be scaled using StandardScaler, convert the output back to a dataframe and put back the column names.
The categorical features are one-hot encoded using OneHotEncoder(set sparse_output to false), convert the output back to a dataframe and put back the column names.<br>
Combine scaled numerical and one-hot encoded categorical features into train and test set dataframes (use pd.concat)<br>
Use scikit learn to train a random forest and extra trees classifier, and use xgboost and lightgbm to train an extreme boosting model and a light gradient boosting model. Use random_state = 1 for training all models and evaluate on the test set. Answer the following questions:



In [10]:
# Standardize the numerical feature for training set
scaler=StandardScaler()
X_numerical=X_train[numerical]
scaler.fit(X_numerical)
X_train_scaled=scaler.transform(X_numerical)

In [11]:
# encode categorical feature for training dataset
encoder=OneHotEncoder(sparse_output=False)
X_categorical=X_train[categorical]
encoder.fit(X_categorical)
X_train_encoded=encoder.transform(X_categorical)

In [12]:
#convert the encoded and scaled dataset into a dataframe and concatenate them into one dataframe
numeric_train=pd.DataFrame(X_train_scaled,columns=X_numerical.columns)
encoded_train_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(input_features=X_categorical.columns))
final_X_train = pd.concat([numeric_train, encoded_train_df], axis=1)

In [13]:
#fill missing values
final_X_train=final_X_train.fillna(0)

In [14]:
# Standardize the numerical feature for test set
X_test_numerical=X_test[numerical]
X_test_scaled=scaler.transform(X_test_numerical)

In [15]:
# encode categorical feature for test dataset
X_test_categorical=X_test[categorical]
X_test_encoded=encoder.transform(X_test_categorical)

In [16]:
#convert the encoded and scaled dataset into a dataframe and concatenate them into one dataframe
numeric_test=pd.DataFrame(X_test_scaled,columns=X_test_numerical.columns)
encoded_test_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(input_features=X_test_categorical.columns))
final_X_test = pd.concat([numeric_test, encoded_test_df], axis=1)

In [17]:
final_X_test=final_X_test.fillna(0)

In [18]:
# RandomForestClassifier
rfc = RandomForestClassifier(random_state=1)
rfc.fit(final_X_train, y_train)

In [19]:
# Accuracy of random forest classifier model for the train set
rfc.score(final_X_train, y_train)

0.9980475683351083

In [20]:
# Accuracy of random forest classifier model for the test set
rfc.score(final_X_test, y_test)

0.7927608232789212

In [21]:
# Extra Trees Classifier
etc= ExtraTreesClassifier(random_state=1)
etc.fit(final_X_train, y_train)

In [22]:
# Accuracy of ExtraTreesClassifier model for the train set
etc.score(final_X_train, y_train)

0.9980475683351083

In [23]:
# Accuracy of ExtraTreesClassifier model for the test set
etc.score(final_X_test, y_test)

0.7721788502484032

In [24]:
# fit the XGBoost model
xg = XGBClassifier(random_state=1)
xg.fit(final_X_train, y_train)

In [25]:
# Accuracy of XGBoost model for the train set
xg.score(final_X_train, y_train)

0.9403620873269436

In [26]:
# Accuracy of XGBoost model for the test set
xg.score(final_X_test, y_test)

0.801277501774308

In [27]:
# fit the LightGm model
lg = lgb.LGBMClassifier(random_state=1)
lg.fit(final_X_train, y_train)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001848 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [28]:
# Accuracy of lightGM model for the train set
lg.score(final_X_train, y_train)

0.8768193113241036

In [29]:
# Accuracy of lightGB model for the test set
lg.score(final_X_test, y_test)

0.8069552874378992

To improve the Extra Trees Classifier, you will use the following parameters (number of estimators, minimum number of samples, minimum number of samples for leaf node and the number of features to consider when looking for the best split) for the hyperparameter grid needed to run a Randomized Cross Validation Search (RandomizedSearchCV).

n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {'n_estimators': n_estimators,

                   'min_samples_leaf': min_samples_leaf,

                   'min_samples_split': min_samples_split,

                   'max_features': max_features}

Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

Options N_estimators = 1000 , min_samples_split = 2 , min_samples_leaf = 8, max_features = None

N_estimators = 500 , min_samples_split = 2 , min_samples_leaf = 8, max_features = ‘log2‘

N_estimators = 300 , min_samples_split = 5 , min_samples_leaf = 6, max_features = ‘auto’

N_estimators = 1000 , min_samples_split = 9 , min_samples_leaf = 8, max_features = None

In [30]:
# The hyperparameter grid
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}
# Create an Extra Trees Classifier
extc = ExtraTreesClassifier(random_state=1)

# Create a RandomizedSearchCV object
rand_search = RandomizedSearchCV(
    extc,  # Estimator
    hyperparameter_grid, 
    cv=5,  # Cross-validation with 5 folds
    n_iter=10, 
    scoring='accuracy', 
    n_jobs=-1, 
    verbose=1,  
    random_state=1  
)
# Fit the RandomizedSearchCV 
rand_search.fit(final_X_train, y_train)

# Get the best hyperparameters
best_hyperparameters = rand_search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [31]:
print(best_hyperparameters)

{'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}


Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV (with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning

In [32]:
e_tree=ExtraTreesClassifier(random_state=1,
                               n_estimators=100,
                               min_samples_split=9,
                               min_samples_leaf=8,
                               max_features='sqrt')

In [33]:
e_tree.fit(final_X_train,y_train)

In [34]:
# accuracy on the training dataset
e_tree.score(final_X_train,y_train)

0.8384806531771388

In [35]:
# accuracy on the test dataset
e_tree.score(final_X_test,y_test)

0.8041163946061036


Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the two most important respectively?

In [36]:
pd.Series(e_tree.feature_importances_,index=final_X_train.columns).sort_values(ascending=False).head(5)

Contract_Month-to-month        0.146798
OnlineSecurity_No              0.087070
tenure                         0.085165
Contract_Two year              0.064490
InternetService_Fiber optic    0.060689
dtype: float64