## Loading Packages

In [2]:
#import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
#Load Data
churn_df=pd.read_csv("churn_prediction.csv")
churn_df.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,1,3135,66,Male,0.0,self_employed,187.0,2,755,224.0,...,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0
1,2,310,35,Male,0.0,self_employed,,2,3214,60.0,...,8704.66,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0
2,4,2356,31,Male,0.0,salaried,146.0,2,41,,...,5815.29,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0
3,5,478,90,,,self_employed,1020.0,2,582,147.0,...,2291.91,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1
4,6,2531,42,Male,2.0,self_employed,1494.0,3,388,58.0,...,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1


## Check Missing Values

In [4]:
churn_df.isnull().sum()

customer_id                          0
vintage                              0
age                                  0
gender                             525
dependents                        2463
occupation                          80
city                               803
customer_nw_category                 0
branch_code                          0
days_since_last_transaction       3223
current_balance                      0
previous_month_end_balance           0
average_monthly_balance_prevQ        0
average_monthly_balance_prevQ2       0
current_month_credit                 0
previous_month_credit                0
current_month_debit                  0
previous_month_debit                 0
current_month_balance                0
previous_month_balance               0
churn                                0
dtype: int64

## Remove Missing Values

In [5]:
#Fill all the missing value new category -1
churn_df['gender'] = churn_df['gender'].fillna(-1)
#Fill missing value in dependents variable
churn_df['dependents'] = churn_df['dependents'].fillna(0)
#Fill missing values in occupation variable
churn_df['occupation'] = churn_df['occupation'].fillna('self_employed')
#Fill missing values in city variable new city code 1061
churn_df['city'] = churn_df['city'].fillna(1061)
#Fill missing values in days_since_last_transaction A fair assumption can be made on this column as 
#this is number of days since last transaction in 1 year, we can substitute missing values with a 
#value greater than 1 year say 999
churn_df['days_since_last_transaction'] = churn_df['days_since_last_transaction'].fillna(999)

## Encoding

In [6]:
#Convert 
dict_gender = {'Male': 1, 'Female':0}
churn_df.replace({'gender': dict_gender}, inplace = True)

In [7]:
le = LabelEncoder()
le.fit(churn_df['occupation'])

LabelEncoder()

In [8]:
churn_df['occupation']=le.transform(churn_df['occupation'])

In [9]:
churn_df.isnull().sum()

customer_id                       0
vintage                           0
age                               0
gender                            0
dependents                        0
occupation                        0
city                              0
customer_nw_category              0
branch_code                       0
days_since_last_transaction       0
current_balance                   0
previous_month_end_balance        0
average_monthly_balance_prevQ     0
average_monthly_balance_prevQ2    0
current_month_credit              0
previous_month_credit             0
current_month_debit               0
previous_month_debit              0
current_month_balance             0
previous_month_balance            0
churn                             0
dtype: int64

In [10]:
churn_df.tail(10)

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
28372,30292,2161,49,1,1.0,3,1320.0,3,1347,25.0,...,10535.24,2145.22,5827.11,286.33,10686.33,10609.46,609.46,2046.76,3843.4,1
28373,30293,2384,73,1,0.0,1,1381.0,3,790,3.0,...,1926.57,1491.87,1973.16,2514.47,433.49,1039.76,180.37,1410.8,1800.7,0
28374,30294,658,55,1,0.0,2,1076.0,3,463,306.0,...,7713.88,7713.88,7670.06,0.33,0.33,0.33,0.33,7713.88,7713.88,1
28375,30295,2041,42,1,0.0,3,146.0,2,286,57.0,...,6777.41,8082.48,3515.03,0.51,90.1,1103.2,1183.04,7956.03,7431.36,0
28376,30296,2155,85,1,0.0,1,1589.0,2,389,999.0,...,1741.5,1741.5,1730.46,0.04,0.04,0.04,0.04,1741.5,1741.5,0
28377,30297,1845,10,0,0.0,4,1020.0,2,1207,70.0,...,1076.43,2282.19,2787.7,0.3,0.3,0.3,0.3,1076.43,1076.43,0
28378,30298,4919,34,0,0.0,3,1046.0,2,223,14.0,...,4069.21,3668.83,3865.55,1.71,2.29,901.0,1014.07,3738.54,3690.32,0
28379,30299,297,47,1,0.0,2,1096.0,2,588,0.0,...,61017.55,53444.81,21925.81,4666.84,3883.06,168.23,71.8,61078.5,57564.24,1
28380,30300,2585,50,1,3.0,3,1219.0,3,274,999.0,...,1625.55,1683.2,1857.42,0.2,0.2,0.2,0.2,1625.55,1625.55,0
28381,30301,2349,18,1,0.0,4,1232.0,2,474,59.0,...,2821.34,3213.44,4447.45,0.11,7.44,714.4,1094.09,2402.62,3260.58,1


In [11]:
churn_df['churn'].value_counts()

0    23122
1     5260
Name: churn, dtype: int64

## Separating the dependent and independent variables

In [12]:
# separating the dependent and independent variables
y = churn_df['churn']
x = churn_df.drop(['churn'], axis = 1)

## creating the train and validation set

In [13]:
#creating the train and validation set
x_train, x_valid, y_train, y_valid = train_test_split(x, y, random_state = 101, stratify=y, test_size=0.25)

In [14]:
# distribution in training set
y_train.value_counts(normalize=True)

0    0.814667
1    0.185333
Name: churn, dtype: float64

In [15]:
#  distribution in validation set
y_valid.value_counts(normalize=True)

0    0.814684
1    0.185316
Name: churn, dtype: float64

In [16]:
#shape of training set
x_train.shape, y_train.shape

((21286, 20), (21286,))

In [17]:
# shape of validation set 
x_valid.shape, y_train.shape

((7096, 20), (21286,))

## Random forest model creation

In [18]:

# random forest model creation
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)

RandomForestClassifier()

In [19]:
y_train

991      0
22143    0
3708     0
1428     0
626      0
        ..
7778     0
20253    0
15338    0
7715     0
26031    1
Name: churn, Length: 21286, dtype: int64

In [20]:
result=rfc.predict(x_valid)
print(result)

[0 1 1 ... 0 0 0]


In [21]:
rfc.score(x_valid, y_valid)

0.8680947012401353

## Testing 

In [25]:
sample=churn_df.loc[churn_df['customer_id'] == 30296]

In [26]:
sample=sample.drop(['churn'], axis=1)

In [28]:
rfc.predict(sample)

array([0])