In [2]:
import pandas as pd
import numpy as np

In [3]:
telecom = pd.read_csv('telecom_users.csv')

In [4]:
telecom.head()

Unnamed: 0.1,Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1869,7010-BRBUU,Male,0,Yes,Yes,72,Yes,Yes,No,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),24.1,1734.65,No
1,4528,9688-YGXVR,Female,0,No,No,44,Yes,No,Fiber optic,...,Yes,No,Yes,No,Month-to-month,Yes,Credit card (automatic),88.15,3973.2,No
2,6344,9286-DOJGF,Female,1,Yes,No,38,Yes,Yes,Fiber optic,...,No,No,No,No,Month-to-month,Yes,Bank transfer (automatic),74.95,2869.85,Yes
3,6739,6994-KERXL,Male,0,No,No,4,Yes,No,DSL,...,No,No,No,Yes,Month-to-month,Yes,Electronic check,55.9,238.5,No
4,432,2181-UAESM,Male,0,No,No,2,Yes,No,DSL,...,Yes,No,No,No,Month-to-month,No,Electronic check,53.45,119.5,No


### Checking for null values

In [5]:
telecom.isna().any()

Unnamed: 0          False
customerID          False
gender              False
SeniorCitizen       False
Partner             False
Dependents          False
tenure              False
PhoneService        False
MultipleLines       False
InternetService     False
OnlineSecurity      False
OnlineBackup        False
DeviceProtection    False
TechSupport         False
StreamingTV         False
StreamingMovies     False
Contract            False
PaperlessBilling    False
PaymentMethod       False
MonthlyCharges      False
TotalCharges        False
Churn               False
dtype: bool

###### No null values. OK to proceed.

###### Drop columns Unnamed: 0 and customerID, since they are just unique ids and dont contribute any information related to churn.

In [6]:
telecom = telecom.drop(columns=['Unnamed: 0','customerID'])
telecom.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Male,0,Yes,Yes,72,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),24.1,1734.65,No
1,Female,0,No,No,44,Yes,No,Fiber optic,No,Yes,Yes,No,Yes,No,Month-to-month,Yes,Credit card (automatic),88.15,3973.2,No
2,Female,1,Yes,No,38,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Bank transfer (automatic),74.95,2869.85,Yes
3,Male,0,No,No,4,Yes,No,DSL,No,No,No,No,No,Yes,Month-to-month,Yes,Electronic check,55.9,238.5,No
4,Male,0,No,No,2,Yes,No,DSL,Yes,No,Yes,No,No,No,Month-to-month,No,Electronic check,53.45,119.5,No


##### The categorical variables need to be modified from Yes, No to 1 and 0.

In [7]:
telecom = telecom.replace('Yes',1)
telecom = telecom.replace('No',0)

In [8]:
telecom.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Male,0,1,1,72,1,1,0,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,0,Credit card (automatic),24.1,1734.65,0
1,Female,0,0,0,44,1,0,Fiber optic,0,1,1,0,1,0,Month-to-month,1,Credit card (automatic),88.15,3973.2,0
2,Female,1,1,0,38,1,1,Fiber optic,0,0,0,0,0,0,Month-to-month,1,Bank transfer (automatic),74.95,2869.85,1
3,Male,0,0,0,4,1,0,DSL,0,0,0,0,0,1,Month-to-month,1,Electronic check,55.9,238.5,0
4,Male,0,0,0,2,1,0,DSL,1,0,1,0,0,0,Month-to-month,0,Electronic check,53.45,119.5,0


#### For gender,
**1:Male  ,  0:Female**

In [9]:
telecom['gender'].replace({'Male':1,'Female':0},inplace=True)

In [10]:
telecom.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,0,1,1,72,1,1,0,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,0,Credit card (automatic),24.1,1734.65,0
1,0,0,0,0,44,1,0,Fiber optic,0,1,1,0,1,0,Month-to-month,1,Credit card (automatic),88.15,3973.2,0
2,0,1,1,0,38,1,1,Fiber optic,0,0,0,0,0,0,Month-to-month,1,Bank transfer (automatic),74.95,2869.85,1
3,1,0,0,0,4,1,0,DSL,0,0,0,0,0,1,Month-to-month,1,Electronic check,55.9,238.5,0
4,1,0,0,0,2,1,0,DSL,1,0,1,0,0,0,Month-to-month,0,Electronic check,53.45,119.5,0


In [11]:
telecom['MultipleLines'].unique()

array([1, 0, 'No phone service'], dtype=object)

In [12]:
# Replace No phone service as 2, to mark seperate category
telecom['MultipleLines'].replace('No phone service',2,inplace=True)

In [13]:
telecom['InternetService'].unique()

array([0, 'Fiber optic', 'DSL'], dtype=object)

In [15]:
# Replace Fiber optic with 1 and DSL to 2
telecom['InternetService'].replace({'Fiber optic':1,'DSL':2},inplace=True)

In [16]:
# No internet service is a common category in many features. Replace it with 2.

telecom.replace('No internet service',2,inplace=True)

In [19]:
telecom.Contract.unique()

array(['Two year', 'Month-to-month', 'One year'], dtype=object)

In [21]:
telecom.Contract.replace({'Two year':2, 'Month-to-month':0, 'One year':1},inplace = True)

In [22]:
telecom.PaymentMethod.unique()

array(['Credit card (automatic)', 'Bank transfer (automatic)',
       'Electronic check', 'Mailed check'], dtype=object)

In [23]:
telecom.PaymentMethod.replace({'Credit card (automatic)':1, 'Bank transfer (automatic)':2, 'Electronic check':3,'Mailed check':4},inplace = True)

- customerID - customer id  
- gender - client gender (male / female)  
- SeniorCitizen - is the client retired (1, 0)  
- Partner - is the client married (Yes, No)  
- tenure - how many months a person has been a client of the company  
- PhoneService - is the telephone service connected (Yes, No)  
- MultipleLines - are multiple phone lines connected (Yes, No, No phone service)  
- InternetService - client's Internet service provider (DSL, Fiber optic, No)  
- OnlineSecurity - is the online security service connected (Yes, No, No internet service)  
- OnlineBackup - is the online backup service activated (Yes, No, No internet service)  
- DeviceProtection - does the client have equipment insurance (Yes, No, No internet service)  
- TechSupport - is the technical support service connected (Yes, No, No internet service)  
- StreamingTV - is the streaming TV service connected (Yes, No, No internet service)  
- StreamingMovies - is the streaming cinema service activated (Yes, No, No internet service)  
- Contract - type of customer contract (Month-to-month, One year, Two year)  
- PaperlessBilling - whether the client uses paperless billing (Yes, No)  
- PaymentMethod - payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))  
- MonthlyCharges - current monthly payment  
- TotalCharges - the total amount that the client paid for the services for the entire time  
- Churn - whether there was a churn (Yes or No)  

### Cast categorical columns to data type category

In [25]:
categorical_cols = ['gender','SeniorCitizen','Partner','PhoneService','InternetService','OnlineSecurity','OnlineBackup',\
                    'DeviceProtection','TechSupport','StreamingTV','StreamingMovies','Contract','PaperlessBilling',\
                    'PaymentMethod','Churn']
casting = {col:'category' for col in categorical_cols}

In [28]:
telecom = telecom.astype(casting)

In [38]:
telecom.dtypes

gender              category
SeniorCitizen       category
Partner             category
Dependents             int64
tenure                 int64
PhoneService        category
MultipleLines          int64
InternetService     category
OnlineSecurity      category
OnlineBackup        category
DeviceProtection    category
TechSupport         category
StreamingTV         category
StreamingMovies     category
Contract            category
PaperlessBilling    category
PaymentMethod       category
MonthlyCharges       float64
TotalCharges          object
Churn               category
dtype: object

In [41]:
# There are some empty strings in Total Charges. Replace it with 0 and convert column to float

sum(telecom['TotalCharges'] == ' ')

10

In [42]:
telecom.TotalCharges.replace(' ',0,inplace=True)

## Now, the fun part!

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics

In [45]:
x = telecom.drop(['Churn'], axis=1)
y = telecom['Churn']

In [52]:
y.value_counts()

0    4399
1    1587
Name: Churn, dtype: int64

In [47]:
x.shape, y.shape

((5986, 19), (5986,))

In [46]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

#### Load Random Forest with default hyperparameters

In [53]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier=RandomForestClassifier().fit(x_train,y_train)

prediction=rf_classifier.predict(x_test)

In [54]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[801  94]
 [160 143]]
0.7879799666110183
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       895
           1       0.60      0.47      0.53       303

    accuracy                           0.79      1198
   macro avg       0.72      0.68      0.70      1198
weighted avg       0.78      0.79      0.78      1198



**With default parameters, we are able to gain an accuracy of 79%. Let's try to improve it using some hyperparameter tuning.**

In [55]:
# Randomized Search
from sklearn.model_selection import RandomizedSearchCV

# Number of decision trees
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to sample for each tree

max_features = ['auto', 'sqrt','log2']

# Maximum depth for a tree
max_depth = [int(x) for x in np.linspace(100, 1000,10)]

# Minimum number of samples required to split an internal node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [57]:
rf=RandomForestClassifier()
rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)
rf_randomcv.fit(x_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 5, 10, 14],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [63]:
best_random_grid=rf_randomcv.best_estimator_

In [64]:
from sklearn.metrics import accuracy_score
y_pred=best_random_grid.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[806  89]
 [147 156]]
Accuracy Score 0.8030050083472454
Classification report:               precision    recall  f1-score   support

           0       0.85      0.90      0.87       895
           1       0.64      0.51      0.57       303

    accuracy                           0.80      1198
   macro avg       0.74      0.71      0.72      1198
weighted avg       0.79      0.80      0.80      1198



In [65]:
# Grid search

from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': [rf_randomcv.best_params_['criterion']],
    'max_depth': [rf_randomcv.best_params_['max_depth']],
    'max_features': [rf_randomcv.best_params_['max_features']],
    'min_samples_leaf': [rf_randomcv.best_params_['min_samples_leaf'], 
                         rf_randomcv.best_params_['min_samples_leaf']+2, 
                         rf_randomcv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rf_randomcv.best_params_['min_samples_split'] - 2,
                          rf_randomcv.best_params_['min_samples_split'] - 1,
                          rf_randomcv.best_params_['min_samples_split'], 
                          rf_randomcv.best_params_['min_samples_split'] +1,
                          rf_randomcv.best_params_['min_samples_split'] + 2],
    'n_estimators': [rf_randomcv.best_params_['n_estimators'] - 200, rf_randomcv.best_params_['n_estimators'] - 100, 
                     rf_randomcv.best_params_['n_estimators'], 
                     rf_randomcv.best_params_['n_estimators'] + 100, rf_randomcv.best_params_['n_estimators'] + 200]
}

print(param_grid)

{'criterion': ['gini'], 'max_depth': [340], 'max_features': ['auto'], 'min_samples_leaf': [4, 6, 8], 'min_samples_split': [12, 13, 14, 15, 16], 'n_estimators': [1400, 1500, 1600, 1700, 1800]}


In [67]:
#### Fit the grid_search to the data

rf=RandomForestClassifier()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(x_train,y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [340],
                         'max_features': ['auto'],
                         'min_samples_leaf': [4, 6, 8],
                         'min_samples_split': [12, 13, 14, 15, 16],
                         'n_estimators': [1400, 1500, 1600, 1700, 1800]},
             verbose=2)

In [69]:
best_grid=grid_search.best_estimator_

In [70]:
y_pred=best_grid.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[805  90]
 [141 162]]
Accuracy Score 0.8071786310517529
Classification report:               precision    recall  f1-score   support

           0       0.85      0.90      0.87       895
           1       0.64      0.53      0.58       303

    accuracy                           0.81      1198
   macro avg       0.75      0.72      0.73      1198
weighted avg       0.80      0.81      0.80      1198



#### SVM with default parameters

In [93]:
from sklearn.svm import SVC

svm_classifier=SVC().fit(x_train,y_train)

prediction=svm_classifier.predict(x_test)

In [94]:
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[895   0]
 [303   0]]
0.7470784641068448
              precision    recall  f1-score   support

           0       0.75      1.00      0.86       895
           1       0.00      0.00      0.00       303

    accuracy                           0.75      1198
   macro avg       0.37      0.50      0.43      1198
weighted avg       0.56      0.75      0.64      1198



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**With default parameters, we are able to gain an accuracy of 75%. Let's try to improve it using some hyperparameter tuning.**

In [95]:
# Randomized Search
from sklearn.model_selection import GridSearchCV

# Regularization penalty
c = [0.1,1,100,1000]

# Type of kernel

kernel = ['poly', 'sigmoid','rbf']

# degree for kernel
degree = [1,2,3,4,5,6]


param_grid = {'C': c,
               'kernel': kernel,
               'degree': degree
              }
print(param_grid)

{'C': [0.1, 1, 100, 1000], 'kernel': ['poly', 'sigmoid', 'rbf'], 'degree': [1, 2, 3, 4, 5, 6]}


In [96]:
svm=SVC()
svm_gridcv=GridSearchCV(estimator=svm,param_grid=param_grid,cv=3,verbose=2,
                               n_jobs=-1)
svm_gridcv.fit(x_train,y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


GridSearchCV(cv=3, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.1, 1, 100, 1000], 'degree': [1, 2, 3, 4, 5, 6],
                         'kernel': ['poly', 'sigmoid', 'rbf']},
             verbose=2)

In [91]:
best_random_grid=svm_gridcv.best_estimator_
best_random_grid

SVC(C=1000, degree=1)

In [92]:
from sklearn.metrics import accuracy_score
y_pred=best_random_grid.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[837  58]
 [193 110]]
Accuracy Score 0.7904841402337228
Classification report:               precision    recall  f1-score   support

           0       0.81      0.94      0.87       895
           1       0.65      0.36      0.47       303

    accuracy                           0.79      1198
   macro avg       0.73      0.65      0.67      1198
weighted avg       0.77      0.79      0.77      1198



In [80]:
svm_randomcv.best_params_

{'kernel': 'rbf', 'degree': 1, 'C': 1000}