In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
def tenure_bucket(tenure):
    if tenure <= 10:
        return '0-10'
    elif tenure <= 30:
        return '10-30'
    elif tenure <= 50:
        return '30-50'
    elif tenure <= 70:
        return '50-70'
    else:
        return '70+'

In [4]:
df['tenure_bucket'] = df['tenure'].map(tenure_bucket)

In [5]:
columns_to_encode = ['gender', 'Partner', 'Dependents'
                     , 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod' , 'tenure_bucket']

In [6]:
x1 = df.drop(columns=['Churn' , 'customerID', 'tenure'])
y = df['Churn'].replace({'Yes': 1, 'No': 0 ,' ': 0})

  y = df['Churn'].replace({'Yes': 1, 'No': 0 ,' ': 0})


In [7]:
encoder = OneHotEncoder(sparse_output= False)
dummy_df = encoder.fit_transform(x1[columns_to_encode])
one_hot_df = pd.DataFrame(dummy_df, columns=encoder.get_feature_names_out(columns_to_encode))
df_encoded = pd.concat([x1, one_hot_df], axis=1)
x = df_encoded.drop(columns_to_encode, axis=1).replace({' ': 0}).astype(float)

# dummi_df  = dummi_df 

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [9]:
rf_p_list = {'n_estimators' : [10 , 30 , 100 , 400 , 1000],
            'max_depth' : [3 , 5 , 10 , None],
             'max_features' : randint(1,3),
             'criterion' : ['gini' , 'entropy'],
             'min_samples_leaf' : randint(1,4),
             }

In [10]:
est = RandomForestClassifier()
rmdsearch = RandomizedSearchCV(est , param_distributions= rf_p_list , n_iter= 40,
                               n_jobs= -1 , cv = 10)
rmdsearch.fit(x , y)



In [11]:
rmdsearch.best_estimator_

In [12]:
from sklearn.model_selection import cross_val_score
est = RandomForestClassifier(criterion='entropy', max_depth=10, max_features=2,
                             min_samples_leaf=3 , n_estimators=400)
est.fit(x,y)
score = cross_val_score(est, x, y , cv= 5)
print(score)
print(score.mean())

[0.79843861 0.80766501 0.78140525 0.80752841 0.80539773]
0.8000870015807472


In [15]:
from sklearn.model_selection import train_test_split

In [14]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=140)

In [15]:
pred = est.predict(x_test)

In [16]:
print(accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))

0.8275372604684174
[[969  60]
 [183 197]]


In [9]:
import xgboost
from sklearn.model_selection import cross_val_score

In [10]:
xgb_p_list = {'n_estimators' : [10 , 30 , 100 , 400 , 1000],
            'max_depth' : [3 , 5 , 10 , None],
            'learning_rate': [0.01, 0.1, 0.2, 0.3],
            'min_child_weight': [1, 3, 6, 10],
            'subsample': [0.5,0.6, 0.85, 1.0],
            'colsample_bytree': [0.3 ,0.5, 0.75, 1.0],
            'gamma': [0, 0.1, 0.2 , 0.5],
            'reg_alpha': [0, 0.1, 0.5],
            'reg_lambda': [0.5, 1.0, 1.5]
             }

In [11]:
est = xgboost.XGBClassifier()
xgbsearch = RandomizedSearchCV(est , param_distributions= xgb_p_list , n_iter= 40,
                               n_jobs= -1 , cv = 10)
xgbsearch.fit(x , y)
xgbsearch.best_estimator_

In [12]:
print(xgbsearch.best_params_)

{'subsample': 0.5, 'reg_lambda': 1.5, 'reg_alpha': 0.1, 'n_estimators': 400, 'min_child_weight': 6, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.3}


In [13]:
est = xgboost.XGBClassifier(n_estimators = 400, max_depth = 5, learning_rate = 0.01,
            min_child_weight = 6 , subsample= 0.5,
            colsample_bytree = 0.3, gamma =0, reg_alpha = 0.1,
            reg_lambda = 1.5)

est.fit(x,y)
score = cross_val_score(est, x, y , cv= 10)
print(score)
print(score.mean())

[0.81702128 0.80851064 0.8070922  0.81534091 0.796875   0.78409091
 0.83238636 0.80113636 0.79403409 0.80539773]
0.8061885477111541


In [16]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=140)

In [17]:
pred = est.predict(x_test)
print(accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))

0.8254080908445706
[[953  76]
 [170 210]]
