In [129]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score, mean_squared_error, accuracy_score

In [130]:
#importing Dataset
data = pd.read_csv("asdsd.csv")
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [131]:
data.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [132]:
data['Tenure'] = pd.cut(data['tenure'],
                      bins=[0, 12, 24, 36, 48, 60, np.inf],
                      labels=[0, 1, 2, 3, 4, 5])

In [133]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Tenure
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,No,One year,No,Mailed check,56.95,1889.5,No,2
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,0
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,3
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,0


In [134]:
#'Female - 0', 'Male - 1'
data.gender.unique()

array(['Female', 'Male'], dtype=object)

In [135]:
data.SeniorCitizen.unique()

array([0, 1], dtype=int64)

In [136]:
#'Yes - 0', 'No - 1'
data.Partner.unique()

array(['Yes', 'No'], dtype=object)

In [137]:
#'No - 0', 'Yes - 1'
data.Dependents.unique()

array(['No', 'Yes'], dtype=object)

In [138]:
#'No - 0', 'Yes - 1'
data.PhoneService.unique()

array(['No', 'Yes'], dtype=object)

In [139]:
#'No phone service - 0', 'No - 1', 'Yes - 2'
data.MultipleLines.unique()

array(['No phone service', 'No', 'Yes'], dtype=object)

In [140]:
#'DSL - 0', 'Fiber optic - 1', 'No - 2'
data.InternetService.unique()

array(['DSL', 'Fiber optic', 'No'], dtype=object)

In [141]:
#'No - 0', 'Yes - 1', 'No internet service - 2'
data.OnlineSecurity.unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [142]:
#'Yes - 0', 'No - 1', 'No internet service - 2'
data.OnlineBackup.unique()

array(['Yes', 'No', 'No internet service'], dtype=object)

In [143]:
#'No - 0', 'Yes - 1', 'No internet service - 2'
data.DeviceProtection.unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [144]:
#'No - 0', 'Yes - 1', 'No internet service - 2'
data.TechSupport.unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [145]:
#'No - 0', 'Yes - 1', 'No internet service - 2'
data.StreamingTV.unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [146]:
#'No - 0', 'Yes - 1', 'No internet service - 2'
data.StreamingMovies.unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [147]:
#'Month-to-month - 0', 'One year - 1', 'Two year - 2'
data.Contract.unique()

array(['Month-to-month', 'One year', 'Two year'], dtype=object)

In [148]:
#'Yes - 0', 'No - 1'
data.PaperlessBilling.unique()

array(['Yes', 'No'], dtype=object)

In [149]:
#'Electronic check - 0 ', 'Mailed check - 1', 'Bank transfer (automatic) - 2','Credit card (automatic) - 3'
data.PaymentMethod.unique()

array(['Electronic check', 'Mailed check', 'Bank transfer (automatic)',
       'Credit card (automatic)'], dtype=object)

In [150]:
#No - 0; Yes - 1
data.Churn.unique()

array(['No', 'Yes'], dtype=object)

In [151]:
le = LabelEncoder()
data['gender'] = le.fit_transform(data.gender.values)
data['SeniorCitizen'] = le.fit_transform(data.SeniorCitizen.values)
data['Partner'] = le.fit_transform(data.Partner.values)
data['Dependents'] = le.fit_transform(data.Dependents.values)
data['PhoneService'] = le.fit_transform(data.PhoneService.values)
data['MultipleLines'] = le.fit_transform(data.MultipleLines.values)
data['InternetService'] = le.fit_transform(data.InternetService.values)
data['OnlineSecurity'] = le.fit_transform(data.OnlineSecurity.values)
data['OnlineBackup'] = le.fit_transform(data.OnlineBackup.values)
data['DeviceProtection'] = le.fit_transform(data.DeviceProtection.values)
data['TechSupport'] = le.fit_transform(data.TechSupport.values)
data['StreamingTV'] = le.fit_transform(data.StreamingTV.values)
data['StreamingMovies'] = le.fit_transform(data.StreamingMovies.values)
data['Contract'] = le.fit_transform(data.Contract.values)
data['PaperlessBilling'] = le.fit_transform(data.PaperlessBilling.values)
data['PaymentMethod'] = le.fit_transform(data.PaymentMethod.values)

In [152]:
data['Churn'] = le.fit_transform(data.Churn.values)

In [153]:
data.Churn.unique()

array([0, 1])

In [154]:
data.Contract.unique()

array([0, 1, 2])

In [155]:
data = data.drop(['customerID','tenure'], axis=1)

In [156]:
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Tenure
0,0,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0,0
1,1,0,0,0,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.5,0,2
2,1,0,0,0,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1,0
3,1,0,0,0,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1840.75,0,3
4,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65,1,0


In [157]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   gender            7043 non-null   int32   
 1   SeniorCitizen     7043 non-null   int64   
 2   Partner           7043 non-null   int32   
 3   Dependents        7043 non-null   int32   
 4   PhoneService      7043 non-null   int32   
 5   MultipleLines     7043 non-null   int32   
 6   InternetService   7043 non-null   int32   
 7   OnlineSecurity    7043 non-null   int32   
 8   OnlineBackup      7043 non-null   int32   
 9   DeviceProtection  7043 non-null   int32   
 10  TechSupport       7043 non-null   int32   
 11  StreamingTV       7043 non-null   int32   
 12  StreamingMovies   7043 non-null   int32   
 13  Contract          7043 non-null   int32   
 14  PaperlessBilling  7043 non-null   int32   
 15  PaymentMethod     7043 non-null   int32   
 16  MonthlyCharges    7043 n

In [158]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

In [159]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   gender            7043 non-null   int32   
 1   SeniorCitizen     7043 non-null   int64   
 2   Partner           7043 non-null   int32   
 3   Dependents        7043 non-null   int32   
 4   PhoneService      7043 non-null   int32   
 5   MultipleLines     7043 non-null   int32   
 6   InternetService   7043 non-null   int32   
 7   OnlineSecurity    7043 non-null   int32   
 8   OnlineBackup      7043 non-null   int32   
 9   DeviceProtection  7043 non-null   int32   
 10  TechSupport       7043 non-null   int32   
 11  StreamingTV       7043 non-null   int32   
 12  StreamingMovies   7043 non-null   int32   
 13  Contract          7043 non-null   int32   
 14  PaperlessBilling  7043 non-null   int32   
 15  PaymentMethod     7043 non-null   int32   
 16  MonthlyCharges    7043 n

In [160]:
# Check the descriptive statistics of numeric variables
data.describe()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7032.0,7043.0
mean,0.504756,0.162147,0.483033,0.299588,0.903166,0.940508,0.872923,0.790004,0.906432,0.904444,0.797104,0.985376,0.992475,0.690473,0.592219,1.574329,64.761692,2283.300441,0.26537
std,0.500013,0.368612,0.499748,0.45811,0.295752,0.948554,0.737796,0.859848,0.880162,0.879949,0.861551,0.885002,0.885091,0.833755,0.491457,1.068104,30.090047,2266.771362,0.441561
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.25,18.8,0.0
25%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,35.5,401.45,0.0
50%,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,2.0,70.35,1397.475,0.0
75%,1.0,0.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,89.85,3794.7375,1.0
max,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,118.75,8684.8,1.0


SeniorCitizen is actually a categorical hence the 25%-50%-75% distribution is not propoer

75% customers have tenure less than 55 months

Average Monthly charges are USD 64.76 whereas 25% customers pay more than USD 89.85 per month

In [161]:
predict = "Churn"
data1 = data[["gender","SeniorCitizen","Partner","Dependents","PhoneService","MultipleLines","InternetService","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies","Contract","PaperlessBilling","PaymentMethod","MonthlyCharges","TotalCharges","Churn","Tenure"]]

In [162]:
x = np.array(data1.drop([predict], axis=1))
y = np.array(data1[predict])

In [163]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [164]:
nan_rows = np.isnan(x_train).any(axis=1)
# Remove rows with missing values from both x_train and y_train
x_train = x_train[~nan_rows]
y_train = y_train[~nan_rows]

In [165]:
nan_rows = np.isnan(x_test).any(axis=1)

# Remove rows with missing values from both x_test and y_test
x_test = x_test[~nan_rows]
y_test = y_test[~nan_rows]

In [166]:
#Training Different Classification models
xgb_model = XGBClassifier().fit(x_train,y_train)
lgbm_model = LGBMClassifier().fit(x_train,y_train)
dt_model = DecisionTreeClassifier().fit(x_train,y_train)
hgb_model = HistGradientBoostingClassifier().fit(x_train,y_train)
RF_model = RandomForestClassifier().fit(x_train,y_train)
gb_model = GradientBoostingClassifier().fit(x_train,y_train)
svc_model = svm.SVC().fit(x_train,y_train)

[LightGBM] [Info] Number of positive: 1478, number of negative: 4146
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000435 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 559
[LightGBM] [Info] Number of data points in the train set: 5624, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262802 -> initscore=-1.031454
[LightGBM] [Info] Start training from score -1.031454


In [167]:
#creating a function to test the model and display its metrics
def evaluate_model(model,x_test,y_test):
    prediction = model.predict(x_test)
    print("Mean Absolute error", mean_absolute_error(y_test,prediction))
    print("Mean Absolute Percentage error", mean_absolute_percentage_error(y_test,prediction))
    print("Mean Squared error", mean_squared_error(y_test,prediction))
    print("Root Mean Squared error", np.sqrt(mean_squared_error(y_test,prediction)))
    print("R2 Score", r2_score(y_test,prediction))
    print("accuracy score:",accuracy_score(y_test,prediction))

In [168]:
#printing the values of the metrics of all the models
print("model: XG Boost Classifier")
print(evaluate_model(xgb_model,x_test,y_test))
print()
print("model: LGBM Classifier")
print(evaluate_model(lgbm_model,x_test,y_test))
print()
print("model: Decision Tree Classifier")
print(evaluate_model(dt_model,x_test,y_test))
print()
print("model: Hist Gradient Boosting Classifier")
print(evaluate_model(hgb_model,x_test,y_test))
print()
print("model: Random Forest Classifier")
print(evaluate_model(RF_model,x_test,y_test))
print()
print("model: Gradient Boosting Classifier")
print(evaluate_model(gb_model,x_test,y_test))
print()
print("model: SVM Classifier")
print(evaluate_model(svc_model,x_test,y_test))

model: XG Boost Classifier
Mean Absolute error 0.21875
Mean Absolute Percentage error 396623830819560.94
Mean Squared error 0.21875
Root Mean Squared error 0.46770717334674267
R2 Score -0.09057530925670254
accuracy score: 0.78125
None

model: LGBM Classifier
Mean Absolute error 0.18607954545454544
Mean Absolute Percentage error 319857928080291.0
Mean Squared error 0.18607954545454544
Root Mean Squared error 0.43136938400232516
R2 Score 0.07230282134657118
accuracy score: 0.8139204545454546
None

model: Decision Tree Classifier
Mean Absolute error 0.26704545454545453
Mean Absolute Percentage error 578942849825326.6
Mean Squared error 0.26704545454545453
Root Mean Squared error 0.5167644091319124
R2 Score -0.33135167623545514
accuracy score: 0.7329545454545454
None

model: Hist Gradient Boosting Classifier
Mean Absolute error 0.1981534090909091
Mean Absolute Percentage error 335850824484305.56
Mean Squared error 0.1981534090909091
Root Mean Squared error 0.44514425649547484
R2 Score 0.01

In [169]:
#saving the model using pickle and choosing the Gradient Boosting classifier algorithm
import pickle
pickle.dump(gb_model,open("model.pkl","wb"))