In [167]:
import pandas as pd
# Read the churn dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='telco_churn.csv')
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [168]:
# Let pandas render every column of `df` instead of eliding the middle ones.
pd.set_option('display.max_columns', len(df.columns))

In [169]:
# Print a per-column summary (non-null counts and dtypes) of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [170]:
# Remove the customerID column in place so it is not one-hot encoded later on.
df.drop(columns=['customerID'], inplace=True)

In [171]:
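# First attempt at the numeric conversion, left disabled.
# NOTE(review): presumably it failed on whitespace-only TotalCharges entries, which are
# replaced with 0 before the conversion is retried below - confirm, or delete this cell.
#df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])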

In [172]:
# Whitespace-only strings (in any column) are replaced by the integer 0.
df = df.replace(to_replace=r'^\s+$', value=0, regex=True)

In [173]:
# Parse TotalCharges into a numeric dtype.
df['TotalCharges'] = df['TotalCharges'].pipe(pd.to_numeric)

In [174]:
# One-hot encode the remaining string-valued columns; numeric columns pass through unchanged.
df = pd.get_dummies(data=df)

In [176]:
# Drop the Churn_No indicator in place, leaving Churn_Yes as the single binary target column.
df.drop(columns=['Churn_No'], inplace=True)

In [177]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [178]:
# Split data into X and y.
# The target is selected by name rather than by position, so the split does not
# silently depend on the column order that get_dummies happens to produce.
y = df['Churn_Yes']
X = df.drop(columns=['Churn_Yes'])

# Split data into train and test sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [179]:
# Baseline classifier with the booster and objective spelled out explicitly.
xgb = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=2, n_jobs=-1)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
# accuracy_score takes (y_true, y_pred); pass the ground truth first, as the
# rest of the notebook does.
score = accuracy_score(y_test, y_pred)
print('Score: ' + str(score))

Score: 0.8091993185689949


In [180]:
# Same baseline, this time leaving booster and objective unspecified.
xgb = XGBClassifier(random_state=2, n_jobs=-1)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
# accuracy_score takes (y_true, y_pred); pass the ground truth first, as the
# rest of the notebook does.
score = accuracy_score(y_test, y_pred)
print('Score: ' + str(score))

Score: 0.8091993185689949


In [181]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [215]:
def grid_search(params, random=False):
    """Search XGBClassifier hyperparameters with 5-fold CV and print a short report.

    Uses the notebook-level X_train / y_train for fitting and X_test / y_test
    for the final accuracy figure.

    Parameters
    ----------
    params : dict
        Candidate values per hyperparameter, passed straight to the search object.
    random : bool, default False
        If True, use RandomizedSearchCV (seeded with random_state=2);
        otherwise use an exhaustive GridSearchCV.

    Prints the best parameter combination, the search's ``best_score_``
    (reported under the label "Training score") and the accuracy of the
    search's predictions on the test split. Returns None.
    """
    estimator = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=2)

    # Pick the search strategy; both use 5 folds and all available cores.
    search = (
        RandomizedSearchCV(estimator, params, cv=5, n_jobs=-1, random_state=2)
        if random
        else GridSearchCV(estimator, params, cv=5, n_jobs=-1)
    )
    search.fit(X_train, y_train)

    print("Best params:", search.best_params_)
    print("Training score: {:.5f}".format(search.best_score_))

    # Accuracy on the test split, predicted through the fitted search object.
    test_accuracy = accuracy_score(y_test, search.predict(X_test))
    print('Test score: {:.5f}'.format(test_accuracy))

In [216]:
# Vary only the number of boosting rounds.
grid_search(params=dict(n_estimators=[100, 200, 400, 800]))

Best params: {'n_estimators': 100}
Training score: 0.79800
Test score: 0.80920


In [217]:
# Vary only the learning rate.
grid_search(params=dict(learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3]))

Best params: {'learning_rate': 0.05}
Training score: 0.79875
Test score: 0.80693


In [218]:
# Vary only the maximum tree depth.
grid_search(params=dict(max_depth=[2, 3, 5, 6, 8]))

Best params: {'max_depth': 2}
Training score: 0.80083
Test score: 0.80863


In [219]:
# Vary only gamma.
grid_search(params=dict(gamma=[0, 0.01, 0.1, 0.5, 1, 2]))

Best params: {'gamma': 0.5}
Training score: 0.80102
Test score: 0.80693


In [221]:
# Vary only min_child_weight.
grid_search(params=dict(min_child_weight=[0.5, 1, 2, 3, 5]))

Best params: {'min_child_weight': 3}
Training score: 0.80197
Test score: 0.80352


In [222]:
# Vary only the row subsampling ratio.
grid_search(params=dict(subsample=[0.5, 0.7, 0.8, 0.9, 1]))

Best params: {'subsample': 0.7}
Training score: 0.80083
Test score: 0.80522


In [223]:
# Vary only the per-tree column subsampling ratio.
grid_search(params=dict(colsample_bytree=[0.5, 0.7, 0.8, 0.9, 1]))

Best params: {'colsample_bytree': 0.9}
Training score: 0.80027
Test score: 0.80806


In [230]:
# Default model; the classification error on the test split is logged after
# every boosting round via eval_set.
# NOTE(review): newer XGBoost releases take eval_metric on the constructor
# rather than in fit() - confirm against the installed version.
model = XGBClassifier(random_state=2)
eval_metric = 'error'
eval_set = [(X_test, y_test)]
model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric=eval_metric,
)

# Predict the test split and report accuracy as a percentage.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

[0]	validation_0-error:0.212947
[1]	validation_0-error:0.210108
[2]	validation_0-error:0.204997
[3]	validation_0-error:0.210108
[4]	validation_0-error:0.202726
[5]	validation_0-error:0.20159
[6]	validation_0-error:0.203861
[7]	validation_0-error:0.200454
[8]	validation_0-error:0.201022
[9]	validation_0-error:0.201022
[10]	validation_0-error:0.20159
[11]	validation_0-error:0.202158
[12]	validation_0-error:0.202158
[13]	validation_0-error:0.203861
[14]	validation_0-error:0.204997
[15]	validation_0-error:0.202726
[16]	validation_0-error:0.20159
[17]	validation_0-error:0.201022
[18]	validation_0-error:0.201022
[19]	validation_0-error:0.200454
[20]	validation_0-error:0.197615
[21]	validation_0-error:0.197615
[22]	validation_0-error:0.197615
[23]	validation_0-error:0.198183
[24]	validation_0-error:0.197047
[25]	validation_0-error:0.196479
[26]	validation_0-error:0.196479
[27]	validation_0-error:0.197047
[28]	validation_0-error:0.196479
[29]	validation_0-error:0.195344
[30]	validation_0-error

In [226]:
# Default model with early stopping after 10 rounds without improvement in the
# monitored error.
# NOTE(review): the early-stopping signal comes from the same split the accuracy
# below is computed on, so that figure is likely optimistic - consider a
# separate validation split.
# NOTE(review): newer XGBoost releases take eval_metric / early_stopping_rounds
# on the constructor rather than in fit() - confirm against the installed version.
model = XGBClassifier(random_state=2)
eval_set = [(X_test, y_test)]
model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric="error",
    early_stopping_rounds=10,
    verbose=True,
)

# Predict the test split and report accuracy as a percentage.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

[0]	validation_0-error:0.212947
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.210108
[2]	validation_0-error:0.204997
[3]	validation_0-error:0.210108
[4]	validation_0-error:0.202726
[5]	validation_0-error:0.20159
[6]	validation_0-error:0.203861
[7]	validation_0-error:0.200454
[8]	validation_0-error:0.201022
[9]	validation_0-error:0.201022
[10]	validation_0-error:0.20159
[11]	validation_0-error:0.202158
[12]	validation_0-error:0.202158
[13]	validation_0-error:0.203861
[14]	validation_0-error:0.204997
[15]	validation_0-error:0.202726
[16]	validation_0-error:0.20159
[17]	validation_0-error:0.201022
Stopping. Best iteration:
[7]	validation_0-error:0.200454

Accuracy: 79.95%


In [234]:
# Up to 5000 boosting rounds, stopped once the monitored error has not improved
# for 50 consecutive rounds.
# NOTE(review): the early-stopping signal comes from the same split the accuracy
# below is computed on, so that figure is likely optimistic - consider a
# separate validation split.
# NOTE(review): newer XGBoost releases take eval_metric / early_stopping_rounds
# on the constructor rather than in fit() - confirm against the installed version.
model = XGBClassifier(n_estimators=5000, random_state=2)
eval_set = [(X_test, y_test)]
model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric="error",
    early_stopping_rounds=50,
)

# Predict the test split and report accuracy as a percentage.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

[0]	validation_0-error:0.212947
Will train until validation_0-error hasn't improved in 50 rounds.
[1]	validation_0-error:0.210108
[2]	validation_0-error:0.204997
[3]	validation_0-error:0.210108
[4]	validation_0-error:0.202726
[5]	validation_0-error:0.20159
[6]	validation_0-error:0.203861
[7]	validation_0-error:0.200454
[8]	validation_0-error:0.201022
[9]	validation_0-error:0.201022
[10]	validation_0-error:0.20159
[11]	validation_0-error:0.202158
[12]	validation_0-error:0.202158
[13]	validation_0-error:0.203861
[14]	validation_0-error:0.204997
[15]	validation_0-error:0.202726
[16]	validation_0-error:0.20159
[17]	validation_0-error:0.201022
[18]	validation_0-error:0.201022
[19]	validation_0-error:0.200454
[20]	validation_0-error:0.197615
[21]	validation_0-error:0.197615
[22]	validation_0-error:0.197615
[23]	validation_0-error:0.198183
[24]	validation_0-error:0.197047
[25]	validation_0-error:0.196479
[26]	validation_0-error:0.196479
[27]	validation_0-error:0.197047
[28]	validation_0-error

In [237]:
# Randomized search over depth, gamma and learning rate with the number of
# trees fixed at 48.
grid_search(
    params={
        'max_depth': [2, 3, 5, 6, 8],
        'gamma': [0, 0.1, 0.5, 1, 2, 5],
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'n_estimators': [48],
    },
    random=True,
)

Best params: {'n_estimators': 48, 'max_depth': 2, 'learning_rate': 0.3, 'gamma': 0.1}
Training score: 0.79913
Test score: 0.80352


In [239]:
# Exhaustive search over tree depth with the number of trees fixed at 48.
grid_search(
    params={
        'max_depth': [2, 3, 4, 5, 6, 7, 8],
        'n_estimators': [48],
    }
)

Best params: {'max_depth': 3, 'n_estimators': 48}
Training score: 0.80008
Test score: 0.80806


In [241]:
# Learning-rate search with max_depth=3 and 48 trees.
# The candidate list previously contained 0.3 twice, which only caused the same
# configuration to be cross-validated a second time; each value is now listed once.
grid_search(params={'learning_rate':[0.01, 0.05, 0.1, 0.2, 0.3], 'max_depth':[3], 'n_estimators':[48]})

Best params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 48}
Training score: 0.80008
Test score: 0.80806


In [243]:
# Finer learning-rate search around 0.1, with max_depth=3 and 48 trees.
grid_search(
    params={
        'learning_rate': [0.08, 0.09, 0.1, 0.11, 0.12],
        'max_depth': [3],
        'n_estimators': [48],
    }
)

Best params: {'learning_rate': 0.09, 'max_depth': 3, 'n_estimators': 48}
Training score: 0.80027
Test score: 0.80295


In [244]:
# min_child_weight search with max_depth=3 and 48 trees.
grid_search(
    params={
        'min_child_weight': [0.5, 1, 2, 3, 4, 5],
        'max_depth': [3],
        'n_estimators': [48],
    }
)

Best params: {'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 48}
Training score: 0.80121
Test score: 0.80522


In [248]:
# Row-subsampling search with min_child_weight=5, max_depth=3 and 48 trees.
# The candidate list previously contained 0.5 twice, which only caused the same
# configuration to be cross-validated a second time; each value is now listed once.
grid_search(params={'subsample':[0.5, 0.7, 0.8, 0.9, 1], 'min_child_weight':[5], 'max_depth':[3], 'n_estimators':[48]})

Best params: {'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 48, 'subsample': 0.7}
Training score: 0.80159
Test score: 0.81090


In [249]:
# Column-subsampling search with subsample=0.7, min_child_weight=5, max_depth=3
# and 48 trees.
# The candidate list previously contained 0.5 twice, which only caused the same
# configuration to be cross-validated a second time; each value is now listed once.
grid_search(params={'colsample_bytree':[0.5, 0.7, 0.8, 0.9, 1], 'subsample':[0.7],
                    'min_child_weight':[5], 'max_depth':[3], 'n_estimators':[48]})

Best params: {'colsample_bytree': 0.7, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 48, 'subsample': 0.7}
Training score: 0.80273
Test score: 0.80920


In [250]:
# Model using the max_depth / min_child_weight / subsample / colsample_bytree
# values reported by the preceding searches, with up to 5000 rounds and early
# stopping after 50 rounds without improvement in the monitored error.
# NOTE(review): the early-stopping signal comes from the same split the accuracy
# below is computed on, so that figure is likely optimistic - consider a
# separate validation split.
# NOTE(review): newer XGBoost releases take eval_metric / early_stopping_rounds
# on the constructor rather than in fit() - confirm against the installed version.
model = XGBClassifier(
    max_depth=3,
    subsample=0.7,
    min_child_weight=5,
    colsample_bytree=0.7,
    random_state=2,
    n_estimators=5000,
)
eval_set = [(X_test, y_test)]
model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric="error",
    early_stopping_rounds=50,
)

# Predict the test split and report accuracy as a percentage.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

[0]	validation_0-error:0.215219
Will train until validation_0-error hasn't improved in 50 rounds.
[1]	validation_0-error:0.206701
[2]	validation_0-error:0.206701
[3]	validation_0-error:0.204429
[4]	validation_0-error:0.201022
[5]	validation_0-error:0.200454
[6]	validation_0-error:0.198183
[7]	validation_0-error:0.199319
[8]	validation_0-error:0.203861
[9]	validation_0-error:0.198183
[10]	validation_0-error:0.198751
[11]	validation_0-error:0.199319
[12]	validation_0-error:0.200454
[13]	validation_0-error:0.200454
[14]	validation_0-error:0.202726
[15]	validation_0-error:0.199319
[16]	validation_0-error:0.195344
[17]	validation_0-error:0.195911
[18]	validation_0-error:0.196479
[19]	validation_0-error:0.198751
[20]	validation_0-error:0.197047
[21]	validation_0-error:0.195344
[22]	validation_0-error:0.197047
[23]	validation_0-error:0.197047
[24]	validation_0-error:0.197615
[25]	validation_0-error:0.196479
[26]	validation_0-error:0.196479
[27]	validation_0-error:0.195344
[28]	validation_0-er

In [251]:
# Learning-rate search with the other tuned values held fixed and 81 trees.
grid_search(
    params={
        'learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.2],
        'colsample_bytree': [0.7],
        'subsample': [0.7],
        'min_child_weight': [5],
        'max_depth': [3],
        'n_estimators': [81],
    }
)

Best params: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 81, 'subsample': 0.7}
Training score: 0.80026
Test score: 0.80750


In [252]:
# Single-candidate "search": cross-validates and test-scores one fixed
# configuration with 81 trees.
grid_search(
    params={
        'colsample_bytree': [0.7],
        'subsample': [0.7],
        'min_child_weight': [5],
        'max_depth': [3],
        'n_estimators': [81],
    }
)

Best params: {'colsample_bytree': 0.7, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 81, 'subsample': 0.7}
Training score: 0.79837
Test score: 0.80977
