## Churn Prediction 

In [None]:
# import necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer
import pickle
import json
import warnings

# Configurations:
# Silence every warning for the remainder of the notebook session.
warnings.filterwarnings("ignore")

# pandas display settings: show wide/long frames in full and print floats
# with six decimal places.
display_options = {
    'display.max_columns': 1500,
    'display.float_format': lambda value: '%.6f' % value,
    'display.max_rows': 500,
    'display.width': 1500,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)


#### Import the dataset

In [None]:
# Forward slashes resolve on Windows and POSIX alike, and avoid the invalid
# "\d" / "\B" escape sequences of the backslash-separated literal.
df = pd.read_csv('./data/BankChurners.csv')

#### Understanding the data

In [None]:
# Preview the first rows (pandas default: five) to see the columns and sample values
df.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Column dtypes, non-null counts and memory usage
df.info()

In [None]:
# Visualize the categorical features as count plots in a single 3x2 figure
categorical_cols = ['Attrition_Flag', 'Gender', 'Education_Level',
                    'Marital_Status', 'Income_Category', 'Card_Category']
# Columns whose tick labels are rotated so they do not overlap
rotated_cols = {'Education_Level', 'Income_Category'}

fig, axes = plt.subplots(3, 2, figsize=(8, 8))
# axes.ravel() walks the grid row by row, so the panels keep the listed order.
for col, panel in zip(categorical_cols, axes.ravel()):
    # Name the column via `x=`: seaborn >= 0.12 interprets the first positional
    # argument as `data`, not as the variable to count.
    sns.countplot(data=df, x=col, ax=panel)
    if col in rotated_cols:
        panel.tick_params(axis='x', labelrotation=20)
fig.tight_layout()

### Data Preprocessing

In [None]:
# select except the last two (unnecessary) columns
# NOTE(review): the slice is positional and `df` is reassigned from itself, so
# re-running this cell drops two more columns each time — run it exactly once
# after loading the CSV. Which two columns are removed is not visible here;
# confirm against the CSV header.
df = df.iloc[:, :-2]
df.head()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Encode the target as integers: Attrited Customer -> 1, Existing Customer -> 0.
# The explicit cast guarantees an integer column; relying on `replace` to downcast
# the text column to int implicitly is deprecated in recent pandas releases.
# replace + astype is also safe to re-run: already-encoded 0/1 values pass through.
attrition_codes = {'Attrited Customer': 1, 'Existing Customer': 0}
df['Attrition_Flag'] = df['Attrition_Flag'].replace(attrition_codes).astype('int64')

In [None]:
# Correlation at a glance: correlation of every numeric column with the target.
# Restrict to numeric columns explicitly — from pandas 2.0 on, DataFrame.corr()
# no longer drops text columns silently and raises on them instead.
df_corr = df.select_dtypes(include='number')

plt.figure(figsize=(12, 5))
df_corr.corr()['Attrition_Flag'].sort_values(ascending=False).plot(kind='bar')
plt.ylabel('Correlation with Attrition_Flag')

#### Observations:

- The correlation of gender, marital status, education level and customer age with the attrition flag is low.
- Contacts_Count_12_mon and Months_Inactive_12_mon seem to be positively correlated with churn, while Credit_Limit, Total_Amt_Chng_Q4_Q1, Total_Relationship_Count, Total_Revolving_Bal and Total_Trans_Ct seem to be negatively correlated with churn.
- Card_Category and Avg_Open_To_Buy seem to have a lower correlation with churn.

#### Feature Selection

Based on the observations above, three features, namely Total_Revolving_Bal, Total_Ct_Chng_Q4_Q1
and Months_Inactive_12_mon, have been selected to build the model.

In [None]:
# Features used for clustering. `.copy()` yields an independent frame, so adding
# columns to it later never acts on a slice of `df`.
feature_cols = ['Total_Revolving_Bal', 'Total_Ct_Chng_Q4_Q1', 'Months_Inactive_12_mon']
df_train = df[feature_cols].copy()

#### Find the optimal value for k for k-means algorithm

In [None]:
# Elbow plot: within-cluster sum of squared distances (inertia) for k = 1..9

sse = []
k_rng = range(1, 10)
for k in k_rng:
    # A fixed seed and an explicit n_init keep the curve identical between runs
    # and across scikit-learn versions (the n_init default changed over time).
    km = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    km.fit(df_train)
    sse.append(km.inertia_)

plt.xlabel('K')
plt.ylabel('Sum of squared error')
plt.plot(k_rng, sse, marker='o')

In [None]:
# Silhouette analysis: one silhouette plot per candidate k (2..7) on a 3x2 grid
fig, ax = plt.subplots(3, 2, figsize=(15,8))
for n_clusters in range(2, 8):
    # KMeans instance for this number of clusters
    model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=100, random_state=42)
    # k = 2, 3 fill the first row, 4, 5 the second, 6, 7 the third
    row, col = divmod(n_clusters - 2, 2)
    # Bind a SilhouetteVisualizer to the grid cell and fit it on the training data
    visualizer = SilhouetteVisualizer(model, colors='yellowbrick', ax=ax[row][col])
    visualizer.fit(df_train)

#### Training the model using K-means clustering algorithm

In [None]:
# Fit the final 3-cluster model. A fixed seed makes the cluster ids, and everything
# derived from them in later cells, reproducible between runs.
# NOTE(review): the features are clustered on their raw scales; presumably the
# balance column dominates the Euclidean distance — confirm whether scaling is wanted.
km = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
km.fit(df_train)  # plain fit: the distance matrix returned by fit_transform was never used
y_pred = km.predict(df_train)
centroids = km.cluster_centers_

# Calculate Silhouette Score
score = silhouette_score(df_train, km.labels_, metric='euclidean')
print('Silhouette Score: %.3f' % score)

In [None]:
# Add the predicted cluster id of every row to both the training and the main dataset.
# `assign` builds a new frame instead of writing a column into a frame that was
# derived from a slice of `df`.
df_train = df_train.assign(cluster=y_pred)
df['cluster'] = y_pred

In [None]:
# function to calculate the churn rate

def churn_rate(df, cluster_id):
    """Return the percentage of churned customers within one cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cluster`` column and an ``Attrition_Flag`` column
        encoded as 1 (churned) / 0 (existing).
    cluster_id : int
        Value of ``cluster`` selecting the rows to evaluate.

    Returns
    -------
    numpy.float64
        Churn rate in percent, rounded to two decimals; 0.0 when the cluster
        contains no rows flagged 0 or 1.
    """
    flags = df.loc[df['cluster'] == cluster_id, 'Attrition_Flag']
    counts = flags.value_counts()
    # .get() rather than [] so that a cluster without churners (or without
    # existing customers) counts as 0 instead of raising KeyError.
    churned = counts.get(1, 0)
    existing = counts.get(0, 0)
    total = churned + existing
    if total == 0:
        return np.float64(0.0)
    return np.round(churned / total * 100, 2)

In [None]:
# function to calculate the female %

def gender(df, cluster_id):
    """Return the percentage of female customers within one cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cluster`` column and a ``Gender`` column holding
        ``'F'`` / ``'M'``.
    cluster_id : int
        Value of ``cluster`` selecting the rows to evaluate.

    Returns
    -------
    numpy.float64
        Share of ``'F'`` rows in percent, rounded to two decimals; 0.0 when the
        cluster contains no ``'F'`` or ``'M'`` rows.
    """
    genders = df.loc[df['cluster'] == cluster_id, 'Gender']
    counts = genders.value_counts()
    # .get() rather than [] so that a single-gender cluster counts the missing
    # gender as 0 instead of raising KeyError.
    females = counts.get('F', 0)
    males = counts.get('M', 0)
    total = females + males
    if total == 0:
        return np.float64(0.0)
    return np.round(females / total * 100, 2)

In [None]:
# Churn rate and female % for each cluster of the fitted model.
# The ids come from the model's own cluster count rather than a hard-coded 3;
# range() yields plain Python ints, which json can serialise as dictionary keys.
cluster_ids = range(km.n_clusters)
churn_rate_info = {cluster_id: churn_rate(df, cluster_id) for cluster_id in cluster_ids}
gender_info = {cluster_id: gender(df, cluster_id) for cluster_id in cluster_ids}
    

#### Save the model and calculations locally

In [None]:
# save the model to disk
# Forward slashes work on Windows and POSIX alike and avoid the invalid escape
# sequences ("\m", "\c", "\d", "\g") of the backslash-separated literals.
filename = './model/clustering_model.sav'
# The context manager closes the file handle once the model has been written.
with open(filename, 'wb') as model_file:
    pickle.dump(km, model_file)

# save the per-cluster churn rates to churn_rate_info.json
json_object = json.dumps(churn_rate_info, indent=4)
with open("./data/churn_rate_info.json", "w") as outfile:
    outfile.write(json_object)

# save the per-cluster female percentages to gender_info.json
json_object = json.dumps(gender_info, indent=4)
with open("./data/gender_info.json", "w") as outfile:
    outfile.write(json_object)

#### Visualize the cluster

In [None]:
# 3-D scatter of the three clusters and their centroids in feature space
fig = plt.figure(figsize=(10, 10))
# Figure.gca() no longer accepts keyword arguments (removed in matplotlib 3.6);
# add_subplot is the supported way to create 3-D axes.
ax = fig.add_subplot(111, projection='3d')

# Centroid columns follow the feature order of the training frame.
ax.scatter(centroids[:, 0], centroids[:, 1], centroids[:, 2], c="black", s=200, label="Centers", alpha=1, marker='*')

# (cluster id, marker colour, legend label) for each cluster
cluster_styles = [(0, "blue", "C1"), (1, "yellow", "C2"), (2, "red", "C3")]
for cluster_id, color, label in cluster_styles:
    members = df_train[df_train.cluster == cluster_id]
    ax.scatter(members['Total_Revolving_Bal'], members['Total_Ct_Chng_Q4_Q1'], members['Months_Inactive_12_mon'], c=color, s=40, label=label)

ax.set_xlabel('Total_Revolving_Bal')
ax.set_ylabel('Total_Ct_Chng_Q4_Q1')
ax.set_zlabel('Months_Inactive_12_mon')
plt.legend(loc="upper right", framealpha=1, frameon=True)
plt.show()