In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix

In [2]:
#train-test generation
train_df= pd.read_csv('customer_churn_dataset-training-master.csv') #Used for training, splits and overall model building
test_df= pd.read_csv('customer_churn_dataset-testing-master.csv')   #Used for model evaluation against dummy dataset

#training_df head
train_df.head(10)

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0
5,8.0,51.0,Male,33.0,25.0,9.0,26.0,Premium,Annual,129.0,8.0,1.0
6,9.0,58.0,Female,49.0,12.0,3.0,16.0,Standard,Quarterly,821.0,24.0,1.0
7,10.0,55.0,Female,37.0,8.0,4.0,15.0,Premium,Annual,445.0,30.0,1.0
8,11.0,39.0,Male,12.0,5.0,7.0,4.0,Standard,Quarterly,969.0,13.0,1.0
9,12.0,64.0,Female,3.0,25.0,2.0,11.0,Standard,Quarterly,415.0,29.0,1.0


In [None]:
#Label Encoding
#Categories into numerals

label_encoders = {}   #Training DataFrame
for column in train_df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    train_df[column] = le.fit_transform(train_df[column])
    label_encoders[column] = le

label_encoders = {}   #Testing DataFrame
for column in test_df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    test_df[column] = le.fit_transform(test_df[column])
    label_encoders[column] = le
for column in test_df.columns:
  test_df[column]= test_df[column].astype(float)

#Checking if all the data have the same datatype(dtypes)
test_df.info()
test_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64374 entries, 0 to 64373
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         64374 non-null  float64
 1   Age                64374 non-null  float64
 2   Gender             64374 non-null  float64
 3   Tenure             64374 non-null  float64
 4   Usage Frequency    64374 non-null  float64
 5   Support Calls      64374 non-null  float64
 6   Payment Delay      64374 non-null  float64
 7   Subscription Type  64374 non-null  float64
 8   Contract Length    64374 non-null  float64
 9   Total Spend        64374 non-null  float64
 10  Last Interaction   64374 non-null  float64
 11  Churn              64374 non-null  float64
dtypes: float64(12)
memory usage: 5.9 MB


Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,1.0,22.0,0.0,25.0,14.0,4.0,27.0,0.0,1.0,598.0,9.0,1.0
1,2.0,41.0,0.0,28.0,28.0,7.0,13.0,2.0,1.0,584.0,20.0,0.0
2,3.0,47.0,1.0,27.0,10.0,2.0,29.0,1.0,0.0,757.0,21.0,0.0
3,4.0,35.0,1.0,9.0,12.0,5.0,17.0,1.0,2.0,232.0,18.0,0.0
4,5.0,53.0,0.0,58.0,24.0,9.0,2.0,2.0,0.0,533.0,18.0,0.0


In [None]:
#correlation filter
corr_matrix= train_df.corr()
print(corr_matrix)
f= corr_matrix.index[abs(corr_matrix['Churn'])>0.4].to_list()
print(f)

                   CustomerID       Age    Gender    Tenure  Usage Frequency  \
CustomerID           1.000000 -0.181977  0.146922  0.044129         0.038455   
Age                 -0.181977  1.000000 -0.031419 -0.011630        -0.007190   
Gender               0.146922 -0.031419  1.000000  0.007978         0.007978   
Tenure               0.044129 -0.011630  0.007978  1.000000        -0.026800   
Usage Frequency      0.038455 -0.007190  0.007978 -0.026800         1.000000   
Support Calls       -0.482894  0.158451 -0.091212 -0.027640        -0.022013   
Payment Delay       -0.262029  0.061738 -0.048449 -0.016588        -0.014470   
Subscription Type    0.012572 -0.003816 -0.001002 -0.022416        -0.000197   
Contract Length      0.001050 -0.000334 -0.002106 -0.000702         0.000321   
Total Spend          0.359289 -0.084684  0.066138  0.019006         0.018631   
Last Interaction    -0.125356  0.028980  0.134786 -0.006903        -0.004662   
Churn               -0.839365  0.218394 

In [None]:
#filtering the sets
train_df= train_df[f]
test_df= test_df[f]
#Dropping irrelevent columns
train_df= train_df.dropna().drop('CustomerID', axis=1)
test_df= test_df.dropna().drop('CustomerID', axis=1)
#DF to numpy
train_df= train_df.to_numpy()
test_df= test_df.to_numpy()

#Splitting X and y
X_train= train_df[:, :-1]
y_train= train_df[:, -1]
X_test= test_df[:, :-1]
y_test= test_df[:, -1]
print(X_train.shape)
print(y_train.shape)

#Split train-CrossValidation
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print("X_train:", X_train.shape)
print("y_train:",y_train.shape)
print("X_cv:", X_cv.shape)
print("y_cv:", y_cv.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

(440832, 2)
(440832,)
(352665, 2)
(352665,)
(88167, 2)
(88167,)
(64374, 2)
(64374,)


In [None]:
#Feature manipulation using selection polynomials and feature combinaion
#It seems the combination of the two most important features with prioritization generalizes best
X_train= ((X_train[:, 0]**10)*(X_train[:,1]**0.25)).reshape(-1,1)
X_cv= ((X_cv[:, 0]**10)*(X_cv[:,1]**0.25)).reshape(-1,1)
X_test= ((X_test[:, 0]**10)*(X_test[:,1]**0.25)).reshape(-1,1)

In [None]:
#Normalizing the features
scalar = StandardScaler()
X_train= scalar.fit_transform(X_train)
X_cv= scalar.transform(X_cv)
X_test= scalar.transform(X_test)

In [None]:
#Random Forest Classifier
model= RandomForestClassifier(n_estimators=100, max_depth= 3)
model.fit(X_train, y_train)

In [None]:
#RF Classifier Results
y_pred= model.predict(X_test)
print("Head y:")
print(y_pred[:10])
train_pred= model.predict(X_train)
cv_pred= model.predict(X_cv)
print("\n\nCLASSIFICATION REPORT:\n",classification_report(y_test, y_pred))
print("\nCONFUSION MATRIX:\n",confusion_matrix(y_test, y_pred))
print("\nACCURACIES:")
print("Train accuracy", accuracy_score(y_train, train_pred))
print("CV accuracy", accuracy_score(y_cv, cv_pred))
print("Test accuracy", accuracy_score(y_test, y_pred))


Head y:
[0. 1. 0. 1. 1. 1. 1. 0. 1. 0.]


CLASSIFICATION REPORT:
               precision    recall  f1-score   support

         0.0       0.76      0.50      0.60     33881
         1.0       0.60      0.83      0.69     30493

    accuracy                           0.65     64374
   macro avg       0.68      0.66      0.65     64374
weighted avg       0.68      0.65      0.65     64374


CONFUSION MATRIX:
 [[16881 17000]
 [ 5257 25236]]

ACCURACIES:
Train accuracy 0.7759233266697858
CV accuracy 0.7772295756915852
Test accuracy 0.6542548233758971


In [None]:
#XGBoost Classifier
modelx= xgb.XGBClassifier(n_estimators=10000, max_depth=2, reg_lambda=50, reg_alpha=50,
                          objective='binary:logistic', random_state=42, learning_rate=0.01,
                          eval_set=[(X_cv, y_cv)],
                          early_stopping_rounds=50, verbose= 0)

modelx.fit(X_train, y_train, eval_set=[(X_cv, y_cv)])

[0]	validation_0-logloss:0.68010
[1]	validation_0-logloss:0.67647
[2]	validation_0-logloss:0.67291
[3]	validation_0-logloss:0.66941
[4]	validation_0-logloss:0.66598
[5]	validation_0-logloss:0.66261
[6]	validation_0-logloss:0.65930
[7]	validation_0-logloss:0.65607
[8]	validation_0-logloss:0.65288


Parameters: { "eval_set", "verbose" } are not used.



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[4478]	validation_0-logloss:0.37525
[4479]	validation_0-logloss:0.37525
[4480]	validation_0-logloss:0.37525
[4481]	validation_0-logloss:0.37525
[4482]	validation_0-logloss:0.37525
[4483]	validation_0-logloss:0.37525
[4484]	validation_0-logloss:0.37525
[4485]	validation_0-logloss:0.37525
[4486]	validation_0-logloss:0.37525
[4487]	validation_0-logloss:0.37525
[4488]	validation_0-logloss:0.37525
[4489]	validation_0-logloss:0.37525
[4490]	validation_0-logloss:0.37524
[4491]	validation_0-logloss:0.37524
[4492]	validation_0-logloss:0.37524
[4493]	validation_0-logloss:0.37524
[4494]	validation_0-logloss:0.37524
[4495]	validation_0-logloss:0.37524
[4496]	validation_0-logloss:0.37524
[4497]	validation_0-logloss:0.37524
[4498]	validation_0-logloss:0.37524
[4499]	validation_0-logloss:0.37524
[4500]	validation_0-logloss:0.37524
[4501]	validation_0-logloss:0.37524
[4502]	validation_0-logloss:0.37524
[4503]	validation_0-logloss:0.37524

In [None]:
#XGBoost Results
y_predx= modelx.predict(X_test)
train_predx= modelx.predict(X_train)
cv_predx= modelx.predict(X_cv)
print("Head y:")
print(y_predx[:10])
print("\n\nCLASSIFICATION REPORT:\n",classification_report(y_test, y_predx))
print("\nCONFUSION MATRIX:\n",confusion_matrix(y_test, y_predx))
print("\nACCURACIES:")
print("Train accuracy", accuracy_score(y_train, train_predx))
print("CV accuracy", accuracy_score(y_cv, cv_predx))
print("Test accuracy", accuracy_score(y_test, y_predx))

Head y:
[0 1 0 1 1 1 1 0 1 0]


CLASSIFICATION REPORT:
               precision    recall  f1-score   support

         0.0       0.79      0.42      0.54     33881
         1.0       0.57      0.88      0.69     30493

    accuracy                           0.63     64374
   macro avg       0.68      0.65      0.62     64374
weighted avg       0.69      0.63      0.62     64374


CONFUSION MATRIX:
 [[14099 19782]
 [ 3794 26699]]

ACCURACIES:
Train accuracy 0.8166078289594941
CV accuracy 0.8175281000827974
Test accuracy 0.6337651847018982


#### Our Baseline source: [Link](https://www.kaggle.com/datasets/muhammadshahidazeem/customer-churn-dataset/code?datasetId=3404076&sortBy=voteCount)

In [None]:
rfacc= accuracy_score(y_test, y_pred)
xgbacc= accuracy_score(y_test, y_predx)
print("RF Classifier accuracy:", rfacc)
print("XGBoost Classifier accuracy:", xgbacc)
base= 0.54
print("\nBaseline Performace:", base, "\n")
acc= [rfacc, xgbacc]
models= ["Random Forest", "XGBoost"]
for i in range(len(acc)):
  if model > base:
    print(f"Our {models[i]} model performs better than industrial baseline.")
  else:
    print(f"Our {models[i]} model performs worse than industrial baseline.")

RF Classifier accuracy: 0.6542548233758971
XGBoost Classifier accuracy: 0.6337651847018982

Baseline Performace: 0.54 

Our Random Forest model performs better than industrial baseline.
Our XGBoost model performs better than industrial baseline.


In [437]:
rf_model= model
xgb_model= modelx