### Importing required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
#import imblearn
from imblearn.over_sampling import SMOTEN


### Reading csv

In [4]:
df = pd.read_csv(r'D:\git_project\Telecom-Churn-Analysis\telecom_df.csv')

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins,AccountWeeksGroup,DayCallsGroup,DayMinsGroup,CustServCallsGroup
0,0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0,1,3,1,0
1,1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7,1,0,0,0
2,2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2,1,0,1,0
3,3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6,0,2,1,1
4,4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1,0,0,0,1


In [9]:
#dropping the unnamed column
df.drop('Unnamed: 0', axis = 1)

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins,AccountWeeksGroup,DayCallsGroup,DayMinsGroup,CustServCallsGroup
0,0,128,1,1,2.70,1,265.1,110,89.0,9.87,10.0,1,3,1,0
1,0,107,1,1,3.70,1,161.6,123,82.0,9.78,13.7,1,0,0,0
2,0,137,1,0,0.00,0,243.4,114,52.0,6.06,12.2,1,0,1,0
3,0,84,0,0,0.00,2,299.4,71,57.0,3.10,6.6,0,2,1,1
4,0,75,0,0,0.00,3,166.7,113,41.0,7.42,10.1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,0,192,1,1,2.67,2,156.2,77,71.7,10.78,9.9,2,2,0,1
3329,0,68,1,0,0.34,3,231.1,57,56.4,7.67,9.6,0,2,0,1
3330,0,28,1,0,0.00,2,180.8,109,56.0,14.44,14.1,4,3,0,1
3331,0,184,0,0,0.00,2,213.8,105,50.0,7.98,5.0,2,3,0,1


In [13]:
x = df.drop("Churn", axis =1)
y = df['Churn']

In [15]:
y.value_counts()

0    2850
1     483
Name: Churn, dtype: int64

### Handling Imbalance data


In [25]:
#initalize smote
smote = SMOTEN()

x_resampled, y_resampled = smote.fit_resample(x,y)

### Train test split

In [26]:
x_train, x_test, y_train, y_test = train_test_split(x_resampled,y_resampled,test_size=0.2)

### Decision Tree Classifier

In [29]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [30]:
model_dt.fit(x_train,y_train)

In [31]:
y_pred = model_dt.predict(x_test)

In [32]:
model_dt.score(x_test,y_test)

0.906140350877193

In [34]:
print(classification_report(y_test,y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.90      0.91      0.91       558
           1       0.92      0.90      0.91       582

    accuracy                           0.91      1140
   macro avg       0.91      0.91      0.91      1140
weighted avg       0.91      0.91      0.91      1140



### Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)
model_rf.fit(x_train, y_train)

In [37]:
y_pred = model_rf.predict(x_test)
model_rf.score(x_test,y_test)

0.9271929824561403

In [38]:
print(classification_report(y_test,y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.90      0.96      0.93       558
           1       0.96      0.89      0.93       582

    accuracy                           0.93      1140
   macro avg       0.93      0.93      0.93      1140
weighted avg       0.93      0.93      0.93      1140



### Pickling the model

In [39]:
import pickle
filename = 'model.sav'
pickle.dump(model_rf,open(filename,'wb'))

In [40]:
load_model = pickle.load(open(filename,'rb'))
model_score_rf = load_model.score(x_test,y_test)
print(model_score_rf)

0.9271929824561403
