# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split as TTS

from sklearn.preprocessing import StandardScaler as SS
from sklearn.preprocessing import MinMaxScaler as MS

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.neighbors import KNeighborsClassifier as KNN

from sklearn.metrics import f1_score as FS
from sklearn.metrics import roc_auc_score as RAS
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score

from statistics import mode

from sklearn.model_selection import GridSearchCV as GS

# Reading Data File

In [21]:
data=pd.read_csv('churn_prediction.csv') 
data.shape

(28382, 21)

In [4]:
data

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,1,3135,66,Male,0.0,self_employed,187.0,2,755,224.0,...,1458.71,1458.71,1449.07,0.20,0.20,0.20,0.20,1458.71,1458.71,0
1,2,310,35,Male,0.0,self_employed,,2,3214,60.0,...,8704.66,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0
2,4,2356,31,Male,0.0,salaried,146.0,2,41,,...,5815.29,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0
3,5,478,90,,,self_employed,1020.0,2,582,147.0,...,2291.91,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1
4,6,2531,42,Male,2.0,self_employed,1494.0,3,388,58.0,...,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28377,30297,1845,10,Female,0.0,student,1020.0,2,1207,70.0,...,1076.43,2282.19,2787.70,0.30,0.30,0.30,0.30,1076.43,1076.43,0
28378,30298,4919,34,Female,0.0,self_employed,1046.0,2,223,14.0,...,4069.21,3668.83,3865.55,1.71,2.29,901.00,1014.07,3738.54,3690.32,0
28379,30299,297,47,Male,0.0,salaried,1096.0,2,588,0.0,...,61017.55,53444.81,21925.81,4666.84,3883.06,168.23,71.80,61078.50,57564.24,1
28380,30300,2585,50,Male,3.0,self_employed,1219.0,3,274,,...,1625.55,1683.20,1857.42,0.20,0.20,0.20,0.20,1625.55,1625.55,0


In [5]:
data.isnull().sum()

customer_id                          0
vintage                              0
age                                  0
gender                             525
dependents                        2463
occupation                          80
city                               803
customer_nw_category                 0
branch_code                          0
days_since_last_transaction       3223
current_balance                      0
previous_month_end_balance           0
average_monthly_balance_prevQ        0
average_monthly_balance_prevQ2       0
current_month_credit                 0
previous_month_credit                0
current_month_debit                  0
previous_month_debit                 0
current_month_balance                0
previous_month_balance               0
churn                                0
dtype: int64

# Missing Values

In [6]:
data['gender'].fillna(value=data['gender'].mode()[0],inplace=True)
data['dependents'].fillna(value=0,inplace=True) # as 0 is the mode of dependents
data['occupation'].fillna(value=data['occupation'].mode()[0],inplace=True)
data['city'].fillna(value=data['city'].mean(),inplace=True)
data['days_since_last_transaction'].fillna(value=data['days_since_last_transaction'].mean(),inplace=True)
data.isnull().sum()

customer_id                       0
vintage                           0
age                               0
gender                            0
dependents                        0
occupation                        0
city                              0
customer_nw_category              0
branch_code                       0
days_since_last_transaction       0
current_balance                   0
previous_month_end_balance        0
average_monthly_balance_prevQ     0
average_monthly_balance_prevQ2    0
current_month_credit              0
previous_month_credit             0
current_month_debit               0
previous_month_debit              0
current_month_balance             0
previous_month_balance            0
churn                             0
dtype: int64

# Correcting Data Types

In [7]:
data=data.astype({'dependents':'object','customer_nw_category':'object'})
data.dtypes

customer_id                         int64
vintage                             int64
age                                 int64
gender                             object
dependents                         object
occupation                         object
city                              float64
customer_nw_category               object
branch_code                         int64
days_since_last_transaction       float64
current_balance                   float64
previous_month_end_balance        float64
average_monthly_balance_prevQ     float64
average_monthly_balance_prevQ2    float64
current_month_credit              float64
previous_month_credit             float64
current_month_debit               float64
previous_month_debit              float64
current_month_balance             float64
previous_month_balance            float64
churn                               int64
dtype: object

# Changing objects to numericals

In [8]:
#data=data.drop(['previous_month_end_balance','average_monthly_balance_prevQ','average_monthly_balance_prevQ2','current_month_balance','previous_month_balance','customer_id','current_month_debit','previous_month_debit'],axis=1)
data=data.drop(['customer_id'],axis=1)
data=pd.get_dummies(data)

# Train Test Split

In [9]:
x=data.drop(['churn'],axis=1)
y=data['churn']

scaler=MS()
x_scaled=scaler.fit_transform(x)
x=pd.DataFrame(x_scaled,columns=x.columns)

train_x, valid_x, train_y, valid_y=TTS(x,y,test_size=0.3,random_state=1,stratify=y)

# Random Forest

In [25]:
rf=RF(n_jobs=-1,max_depth=40,n_estimators=81)
rf.fit(train_x,train_y)
#pred_train=rf.predict_proba(train_x)
pred_1=rf.predict(valid_x)
s1=rf.score(valid_x,valid_y)
s1

0.8668232530827951

In [11]:
# from sklearn.model_selection import GridSearchCV
# param_grid={
#     'max_depth':range(10,100,10),
#     #'min_samples_split':range(1,201,40),
#     #'max_leaf_nodes':range(1,201,40),
#     #'min_samples_leaf':range(1,201,40),
#     #'n_estimators':range(1,201,40),
#     #'max_samples':range(1,201,40),
#     #'max_features':range(1,201,40),   
# }
# CV=GridSearchCV(estimator=rf,param_grid=param_grid,n_jobs=-1)
# grid_result=CV.fit(valid_x,valid_y)
# print (grid_result.best_params_)

# Logistics Regression

In [12]:
lr=LR(max_iter=200,random_state=42)
lr.fit(train_x,train_y)
lr.predict(train_x)
pred_2=lr.predict(valid_x)
s2=lr.score(valid_x,valid_y)
s2

0.8144450968878449

# Decision Tree

In [13]:
# dt=DT(random_state=42)
# param_grid={'max_depth':range(20,30,2),'min_samples_split':range(15,25,2),'min_samples_leaf':range(35,45,2),'max_features':range(5,15,2)}
# gs=GS(estimator=dt,param_grid=param_grid,n_jobs=-1)
# grid_result=gs.fit(valid_x,valid_y)
# print (grid_result.best_params_)

In [14]:
dt=DT(random_state=42,max_depth=20,min_samples_split=15,min_samples_leaf=40,max_features=10)
dt.fit(train_x,train_y)
pred_3=dt.predict(valid_x)
s3=dt.score(valid_x,valid_y)
s3

0.8482677627715796

# KNN

In [15]:
# knn=KNN(n_jobs=-1)
# param_grid={'n_neighbors':range(40,51,1)}
# gs=GS(estimator=knn,param_grid=param_grid,n_jobs=-1)
# grid_result=gs.fit(valid_x,valid_y)
# print (grid_result.best_params_)

In [16]:
knn=KNN(n_neighbors=44)
knn.fit(train_x,train_y)
pred_4=knn.predict(valid_x)
s4=knn.score(valid_x,valid_y)
s4

0.8146799765120376

# Rank Averaging

In [17]:
index_=[1,2,3]
valid_r2=[s2,s3,s4]

rank_eval=pd.DataFrame({
    'score':valid_r2
},index=index_)
sorted_rank=rank_eval.sort_values('score')
sorted_rank['rank']=[i for i in range(1,4)]
sorted_rank['weight']=sorted_rank['rank']/sorted_rank['rank'].sum()
sorted_rank

Unnamed: 0,score,rank,weight
1,0.814445,1,0.166667
3,0.81468,2,0.333333
2,0.848268,3,0.5


In [18]:
wt_pred2 = pred_2*float(sorted_rank.loc[[1],['weight']].values)
wt_pred3 = pred_3*float(sorted_rank.loc[[2],['weight']].values)
wt_pred4 = pred_4*float(sorted_rank.loc[[3],['weight']].values)
ranked_pred=wt_pred2 + wt_pred3 + wt_pred4
ranked_pred

array([0. , 0. , 0.5, ..., 0. , 0. , 0. ])

# Voting

In [19]:
voted_pred=np.array([])
for i in range(0,len(valid_x)):
    voted_pred=np.append(voted_pred,mode([pred_1[i],pred_3[i],pred_4[i]]))

In [20]:
accuracy_score(valid_y,voted_pred)

0.8581327069876689

 # *******************************Final Score******************************** = 86% accurate predictions on unseen testing dataset