In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import pickle as pkl

In [2]:
#Reading the dataset
df = pd.read_csv('telecommunications_churn.csv', sep = ';')
df

Unnamed: 0,account_length,voice_mail_plan,voice_mail_messages,day_mins,evening_mins,night_mins,international_mins,customer_service_calls,international_plan,day_calls,day_charge,evening_calls,evening_charge,night_calls,night_charge,international_calls,international_charge,total_charge,churn
0,128,1,25,265.1,197.4,244.7,10.0,1,0,110,45.07,99,16.78,91,11.01,3,2.70,75.56,0
1,107,1,26,161.6,195.5,254.4,13.7,1,0,123,27.47,103,16.62,103,11.45,3,3.70,59.24,0
2,137,0,0,243.4,121.2,162.6,12.2,0,0,114,41.38,110,10.30,104,7.32,5,3.29,62.29,0
3,84,0,0,299.4,61.9,196.9,6.6,2,1,71,50.90,88,5.26,89,8.86,7,1.78,66.80,0
4,75,0,0,166.7,148.3,186.9,10.1,3,1,113,28.34,122,12.61,121,8.41,3,2.73,52.09,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,192,1,36,156.2,215.5,279.1,9.9,2,0,77,26.55,126,18.32,83,12.56,6,2.67,60.10,0
3329,68,0,0,231.1,153.4,191.3,9.6,3,0,57,39.29,55,13.04,123,8.61,4,2.59,63.53,0
3330,28,0,0,180.8,288.8,191.9,14.1,2,0,109,30.74,58,24.55,91,8.64,6,3.81,67.74,0
3331,184,0,0,213.8,159.6,139.2,5.0,2,1,105,36.35,84,13.57,137,6.26,10,1.35,57.53,0


In [3]:
# Univariate Analysis
data = df.drop(['churn'], axis=1)

from sklearn.feature_selection import SelectKBest, f_classif
select = SelectKBest(f_classif, k = 9)

data_req = select.fit_transform(data, df['churn'])
data_req

array([[  1.  ,  25.  , 265.1 , ...,  45.07,  16.78,  75.56],
       [  1.  ,  26.  , 161.6 , ...,  27.47,  16.62,  59.24],
       [  0.  ,   0.  , 243.4 , ...,  41.38,  10.3 ,  62.29],
       ...,
       [  0.  ,   0.  , 180.8 , ...,  30.74,  24.55,  67.74],
       [  0.  ,   0.  , 213.8 , ...,  36.35,  13.57,  57.53],
       [  1.  ,  25.  , 234.4 , ...,  39.85,  22.6 ,  77.01]])

In [4]:
df_new = pd.DataFrame(select.inverse_transform(data_req), index= df.index, columns= data.columns)

req_columns = df_new.columns[df_new.var() != 0]
df_final = df_new[req_columns]

df_target = df['churn']
Final_df = pd.concat([df_final, df_target], axis =1)

Final_df.describe()

Unnamed: 0,voice_mail_plan,voice_mail_messages,day_mins,evening_mins,customer_service_calls,international_plan,day_charge,evening_charge,total_charge,churn
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,0.276628,8.09901,179.775098,200.980348,1.562856,0.09691,30.562307,17.08354,59.449754,0.144914
std,0.447398,13.688365,54.467389,50.713844,1.315491,0.295879,9.259435,4.310668,10.502261,0.352067
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.93,0.0
25%,0.0,0.0,143.7,166.6,1.0,0.0,24.43,14.16,52.38,0.0
50%,0.0,0.0,179.4,201.4,1.0,0.0,30.5,17.12,59.47,0.0
75%,1.0,20.0,216.4,235.3,2.0,0.0,36.79,20.0,66.48,0.0
max,1.0,51.0,350.8,363.7,9.0,1.0,59.64,30.91,96.15,1.0


## ................................................ BALANCING THE DATA ............................................................

In [5]:
# Oversampling Minor Class
from imblearn.over_sampling import SMOTE

In [6]:
x1 = Final_df.iloc[:, :9]
y1 = Final_df['churn']

# Implementing Oversampling for Handling Imbalanced 
smk = SMOTE(sampling_strategy = 'minority', random_state= 0)
x1_res,y1_res = smk.fit_resample(x1, y1)
final_bal = pd.concat([pd.DataFrame(x1_res), pd.DataFrame(y1_res)], axis=1)
final_bal.head()

Unnamed: 0,voice_mail_plan,voice_mail_messages,day_mins,evening_mins,customer_service_calls,international_plan,day_charge,evening_charge,total_charge,churn
0,1.0,25.0,265.1,197.4,1.0,0.0,45.07,16.78,75.56,0
1,1.0,26.0,161.6,195.5,1.0,0.0,27.47,16.62,59.24,0
2,0.0,0.0,243.4,121.2,0.0,0.0,41.38,10.3,62.29,0
3,0.0,0.0,299.4,61.9,2.0,1.0,50.9,5.26,66.8,0
4,0.0,0.0,166.7,148.3,3.0,1.0,28.34,12.61,52.09,0


In [7]:
print(final_bal['churn'].value_counts())

0    2850
1    2850
Name: churn, dtype: int64


## Model Building : Balanced Processed Data post Feature Engineering

# .................................................................................................................

In [8]:
#Feature & Target Variables
X1 = final_bal.iloc[:, :9]
Y1 = final_bal['churn']

X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, test_size= 0.3, random_state=10)

In [9]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X1_train, Y1_train)
#Train
Y1_pred_rf_t = rf.predict(X1_train)
acc1_RF_t1 = accuracy_score(Y1_pred_rf_t, Y1_train)
print("Training Accuracy :", acc1_RF_t1 * 100)
#Test
Y1_pred_rf = rf.predict(X1_test)
acc1_bal_RF = accuracy_score(Y1_pred_rf, Y1_test)
print("Testing Accuarcy :", acc1_bal_RF * 100)
print("\n")
print(classification_report(Y1_test, Y1_pred_rf))
print("\n")
print(confusion_matrix(Y1_test, Y1_pred_rf))

Training Accuracy : 100.0
Testing Accuarcy : 95.6140350877193


              precision    recall  f1-score   support

           0       0.94      0.98      0.96       856
           1       0.98      0.93      0.96       854

    accuracy                           0.96      1710
   macro avg       0.96      0.96      0.96      1710
weighted avg       0.96      0.96      0.96      1710



[[837  19]
 [ 56 798]]


In [10]:
import pickle
pickle_out = open("rf.pkl", "wb")
pickle.dump(rf, pickle_out)
pickle_out.close()