In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("/Users/admin/Desktop/telecom_churn.csv")

In [3]:
df = pd.get_dummies(df, columns=['Income']) 
df.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins,Income_High,Income_Low,Income_Medium
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0,1,0,0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7,0,0,1
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2,0,1,0
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6,0,1,0
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1,1,0,0


In [4]:
df.columns

Index(['Churn', 'AccountWeeks', 'ContractRenewal', 'DataPlan', 'DataUsage',
       'CustServCalls', 'DayMins', 'DayCalls', 'MonthlyCharge', 'OverageFee',
       'RoamMins', 'Income_High', 'Income_Low', 'Income_Medium'],
      dtype='object')

# Random Forest Model

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

In [6]:
y=df['Churn']
X=df[['AccountWeeks', 'ContractRenewal', 'DataPlan', 'DataUsage',
       'CustServCalls', 'DayMins', 'DayCalls', 'MonthlyCharge', 'OverageFee',
       'RoamMins', 'Income_High', 'Income_Low', 'Income_Medium']]

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=4)

In [8]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[568  10]
 [ 24  65]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       578
           1       0.87      0.73      0.79        89

    accuracy                           0.95       667
   macro avg       0.91      0.86      0.88       667
weighted avg       0.95      0.95      0.95       667

0.9490254872563718


In [10]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [5,7,9],
    'max_features': [0.4,0.6,0.8,1.0],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [50, 100, 200, 300],
    'max_samples': [0.6,0.7,0.9,1.0]
}

In [11]:
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3,scoring='f1', n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits




{'bootstrap': True,
 'max_depth': 9,
 'max_features': 0.4,
 'max_samples': 0.9,
 'min_samples_leaf': 3,
 'min_samples_split': 12,
 'n_estimators': 100}

In [12]:
best_grid = grid_search.best_estimator_
y_pred = best_grid.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[569   9]
 [ 25  64]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       578
           1       0.88      0.72      0.79        89

    accuracy                           0.95       667
   macro avg       0.92      0.85      0.88       667
weighted avg       0.95      0.95      0.95       667

0.9490254872563718


## Giving more importance to the positive class

In [16]:
y_pred=(best_grid.predict_proba(X_test)[:,1]>=0.20).astype('int')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[527  51]
 [ 13  76]]
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       578
           1       0.60      0.85      0.70        89

    accuracy                           0.90       667
   macro avg       0.79      0.88      0.82       667
weighted avg       0.93      0.90      0.91       667

0.904047976011994


## Oversampling and Undersampling

In [18]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [19]:
over=SMOTE(sampling_strategy=0.30,random_state=42)
under = RandomUnderSampler(sampling_strategy=0.7,random_state=42)

In [22]:
X_res, y_res = over.fit_resample(X_train, y_train)
X_res, y_res = under.fit_resample(X_res, y_res)

In [23]:
one=np.sum(y_res)
zero=len(y_res)-np.sum(y_res)
print(one)
print(zero)
print(one/zero)

681
972
0.7006172839506173


In [24]:
grid_search.fit(X_res, y_res)

grid_search.best_params_

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits




{'bootstrap': True,
 'max_depth': 9,
 'max_features': 0.4,
 'max_samples': 0.9,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 200}

In [25]:
best_grid = grid_search.best_estimator_
y_pred = best_grid.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[526  52]
 [ 10  79]]
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       578
           1       0.60      0.89      0.72        89

    accuracy                           0.91       667
   macro avg       0.79      0.90      0.83       667
weighted avg       0.93      0.91      0.91       667

0.9070464767616192


In [26]:
y_pred=(best_grid.predict_proba(X_test)[:,1]>=0.30).astype('int')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[500  78]
 [  8  81]]
              precision    recall  f1-score   support

           0       0.98      0.87      0.92       578
           1       0.51      0.91      0.65        89

    accuracy                           0.87       667
   macro avg       0.75      0.89      0.79       667
weighted avg       0.92      0.87      0.89       667

0.8710644677661169


In [28]:
weights = {0:1, 1:1}
model = RandomForestClassifier(class_weight=weights,max_depth= 9,max_features=0.4,min_samples_leaf=3,min_samples_split=8,n_estimators=200,random_state=42)
model.fit(X_res,y_res)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[528  50]
 [ 12  77]]
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       578
           1       0.61      0.87      0.71        89

    accuracy                           0.91       667
   macro avg       0.79      0.89      0.83       667
weighted avg       0.93      0.91      0.91       667

0.9070464767616192


## Feature Importance

In [29]:
importance = model.feature_importances_
importance

array([0.03813462, 0.17969611, 0.0415253 , 0.06321301, 0.18050858,
       0.18460629, 0.03259761, 0.13459198, 0.06092188, 0.04095901,
       0.01701727, 0.01124241, 0.01498595])

In [30]:
feature_imp = pd.Series(model.feature_importances_,index=X_test.columns).sort_values(ascending=False)
feature_imp

DayMins            0.184606
CustServCalls      0.180509
ContractRenewal    0.179696
MonthlyCharge      0.134592
DataUsage          0.063213
OverageFee         0.060922
DataPlan           0.041525
RoamMins           0.040959
AccountWeeks       0.038135
DayCalls           0.032598
Income_High        0.017017
Income_Medium      0.014986
Income_Low         0.011242
dtype: float64