## Different models with RandomizedSearchCV

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df= pd.read_csv("churn.csv")
df.head(3)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [3]:
# customerID won't provide any information for ML.
df.drop('customerID', axis=1, inplace= True)

In [4]:
df[df.TotalCharges == ' ']

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,Male,0,No,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,Yes,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3826,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
4380,Female,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5218,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
6670,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,Yes,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [5]:
df1= df[df.TotalCharges != ' ']
df1.TotalCharges= pd.to_numeric(df1.TotalCharges)
df1.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [6]:
# Some of the columns have no internet service or no phone service, that can be replaced with a simple No

df1.replace({'No phone service': 'No'}, inplace= True)
df1.replace({'No internet service': 'No'}, inplace= True)

In [7]:
cols= ['InternetService','Contract','PaymentMethod']
df2= pd.get_dummies(data= df1, columns= cols)

df2['gender'].replace({'Female':1, 'Male':0}, inplace= True)

yes_no_columns= ['Partner','Dependents','PhoneService','MultipleLines','OnlineSecurity','OnlineBackup',
                  'DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling','Churn']

for col in yes_no_columns:
    df2[col].replace({'Yes':1, 'No':0}, inplace= True)

In [8]:
df3= df2.drop(['TotalCharges','Partner'],axis='columns')
df3.shape

(7032, 25)

In [9]:
X = df3.drop('Churn',axis='columns')
X.shape

(7032, 24)

In [10]:
y = df3['Churn']

### Split Dataset into train and test

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5,stratify=y)

In [12]:
# Let's scale the values

from sklearn.preprocessing import StandardScaler
scalar= StandardScaler()

X_train= scalar.fit_transform(X_train)
X_test= scalar.transform(X_test)

In [16]:
from sklearn.metrics import accuracy_score

### Customize function to try different models.

In [17]:
models= list()
models.append(('Logistic', LogisticRegression()))
models.append(('AdaBoost', AdaBoostClassifier()))
models.append(('GBoost', GradientBoostingClassifier()))
models.append(('XGBoost', XGBClassifier()))

for name, model in models:
    
    model.fit(X_train,y_train)
    
    pred= model.predict(X_test)
    
    accuracy= accuracy_score(y_test, pred)
    
    print(f'Accuracy of {name} model is : {accuracy*100}')
    

Accuracy of Logistic model is : 78.3226723525231
Accuracy of AdaBoost model is : 79.60199004975125
Accuracy of GBoost model is : 79.1044776119403
Accuracy of XGBoost model is : 79.317697228145


## RandomizedSearchCV

In [18]:
from sklearn.model_selection import RandomizedSearchCV

### AdaBoostClassifier using RandomizedSearchCV

In [19]:
params_AD= {
    'n_estimators' : [50,60,70,80,90,100],
    'learning_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],   
}

In [20]:
random_search_AD= RandomizedSearchCV(AdaBoostClassifier(), params_AD, n_iter= 5, scoring= 'accuracy', n_jobs= -1, cv=5, verbose= 3)

In [21]:
random_search_AD.fit(X,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


RandomizedSearchCV(cv=5, estimator=AdaBoostClassifier(), n_iter=5, n_jobs=-1,
                   param_distributions={'learning_rate': [0.05, 0.1, 0.15, 0.2,
                                                          0.25, 0.3],
                                        'n_estimators': [50, 60, 70, 80, 90,
                                                         100]},
                   scoring='accuracy', verbose=3)

In [22]:
random_search_AD.best_params_

{'n_estimators': 70, 'learning_rate': 0.25}

In [23]:
random_search_AD.best_score_

0.8028988364416486

In [24]:
random_search_AD.best_estimator_

AdaBoostClassifier(learning_rate=0.25, n_estimators=70)

In [32]:
cls_AD= AdaBoostClassifier(learning_rate=0.25, n_estimators=70)

### XGBClassifier using RandomizedSearchCV

In [26]:
params_XG= {
    'learning_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],
    'max_depth' : [3,4,5,6,8,10,12],
    'min_child_weight' : [1,3,5,7],
    'gamma' : [0,0.1,0.2,0.3,0.4],
    'colsample_bytree' : [0.3,0.4,0.5,0.6]
}

In [27]:
random_search_XG= RandomizedSearchCV(XGBClassifier(), params_XG, n_iter= 5, scoring= 'accuracy', n_jobs= -1, cv=5, verbose= 3)

In [28]:
random_search_XG.fit(X,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


RandomizedSearchCV(cv=5, estimator=XGBClassifier(), n_iter=5, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.3, 0.4, 0.5,
                                                             0.6],
                                        'gamma': [0, 0.1, 0.2, 0.3, 0.4],
                                        'learning_rate': [0.05, 0.1, 0.15, 0.2,
                                                          0.25, 0.3],
                                        'max_depth': [3, 4, 5, 6, 8, 10, 12],
                                        'min_child_weight': [1, 3, 5, 7]},
                   scoring='accuracy', verbose=3)

In [29]:
random_search_XG.best_params_

{'min_child_weight': 3,
 'max_depth': 3,
 'learning_rate': 0.05,
 'gamma': 0,
 'colsample_bytree': 0.4}

In [30]:
random_search_XG.best_score_

0.8038943668165978

In [31]:
random_search_XG.best_estimator_

XGBClassifier(colsample_bytree=0.4, learning_rate=0.05, min_child_weight=3)

In [33]:
cls_XG= XGBClassifier(colsample_bytree=0.4, learning_rate=0.05, min_child_weight=3)

In [34]:
models= list()
models.append(('AdaBoost', cls_AD))
models.append(('XGBoost', cls_XG))

for name, model in models:
    
    model.fit(X_train,y_train)
    
    pred= model.predict(X_test)
    
    accuracy= accuracy_score(y_test, pred)
    
    print(f'Accuracy of {name} model is : {accuracy*100}')
    

Accuracy of AdaBoost model is : 79.03340440653874
Accuracy of XGBoost model is : 79.60199004975125
