### Import the required packages

In [1]:
# Packages / libraries
import os #provides functions for interacting with the operating system
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, k_means
import warnings

%matplotlib inline

In [2]:
# Install a catch-all "ignore" filter: from here on no Python warning
# (whatever its category or origin) is shown in the cell outputs.
warnings.filterwarnings("ignore")

###  Load the Data

In [3]:
# Loading the data
# The original local path stays the default; set the BANKCHURNERS_CSV
# environment variable to read the CSV from another location without
# editing this cell.
data_path = os.environ.get(
    "BANKCHURNERS_CSV",
    r"C:\Users\user\Desktop\AI projects\Churn-prediction\BankChurners.csv",
)
raw_data = pd.read_csv(data_path)

# print the shape (before any column is dropped)
print(raw_data.shape)
# drop unused columns: the CLIENTNUM column and the two
# Naive_Bayes_Classifier_* columns are not used anywhere below
raw_data = raw_data.drop(['CLIENTNUM', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'], axis=1)


(10127, 23)


### Feature selection
For this K-means model, I will select just three of the numerical features.

In [4]:
# Preprocessing numerical: keep every column of raw_data whose dtype is not
# 'object'; the clustering features are picked from this frame.
numerical = raw_data.select_dtypes(exclude='object')

In [5]:
# Numeric columns that are NOT used for clustering; whatever is left in the
# frame after removing them is the feature set handed to the scaler.
excluded_features = [
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
]
new_data = numerical.drop(columns=excluded_features)

### K-means Clustering

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardise the selected features: learn the per-column statistics first,
# then apply them. From here on new_data is the transformed matrix rather
# than the original DataFrame.
scaler = StandardScaler()
scaler.fit(new_data)
new_data = scaler.transform(new_data)


In [7]:
# X_train is a second name for the scaled feature matrix (no copy is made);
# the clustering, scoring and PCA cells below all read X_train.
X_train = new_data

In [None]:
# Elbow search: fit K-means once per candidate K and record the inertia.
# A good model is one with low inertia AND a low number of clusters (K).

no_of_clusters = range(2,10) #[2,3,4,5,6,7,8,9]
inertia = []  # one inertia_ value per K, in the same order as no_of_clusters


for k in no_of_clusters:
    # Fixed random_state so every re-run produces the same fits.
    kmeans = KMeans(n_clusters=k, random_state=2).fit(X_train)
    inertia.append(kmeans.inertia_)
    print("The innertia for :", k, "Clusters is:", kmeans.inertia_)

In [None]:
# Elbow plot: inertia recorded above against the number of clusters
fig, ax1 = plt.subplots(1, figsize=(10,6))
# Evenly spaced positions on the x axis; the tick labels carry the real K.
tick_positions = np.arange(len(no_of_clusters))
ax1.plot(tick_positions, inertia)
ax1.set_xticks(tick_positions)
ax1.set_xticklabels(no_of_clusters, rotation='vertical')
ax1.set_xlabel('Number of clusters')
ax1.set_ylabel('Inertia Score')
ax1.set_title("Inertia Plot per k")

In [10]:
# NOTE(review): `datasets` is not used in this cell; the import is kept so
# that nothing relying on the name breaks.
from sklearn import datasets

# Instantiate the final KMeans model (K chosen from the elbow plot above;
# fixed random_state for reproducible labels).
km = KMeans(n_clusters=4, random_state=42)

# Fit the model and get the cluster label of every row of X_train.
predictions = km.fit_predict(X_train)

# Count how many rows fall into each cluster label.
unique, counts = np.unique(predictions, return_counts=True)
# One row with one column per label that actually occurs. The width is
# derived from the data instead of being hard-coded, so changing n_clusters
# above does not break this cell.
counts = counts.reshape(1, -1)

# Creating a dataframe whose column names follow the labels found
countscldf = pd.DataFrame(counts, columns=[f"Cluster {label}" for label in unique])

# display
countscldf


Unnamed: 0,Cluster 0,Cluster 1,Cluster 2,Cluster 3
0,4673,792,3318,1344


In [11]:
# Silhouette score of the fitted 4-cluster model, computed on the same
# matrix the model was trained on.
from sklearn import metrics

score = metrics.silhouette_score(
    X_train,
    km.labels_,
    metric='euclidean',
)

# Report the score rounded to three decimals.
print('Silhouetter Score: %.3f' % score)

Silhouetter Score: 0.482


In [None]:

# PCA on the clustering features: how much variance does each component carry?

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Inputs: the scaled features and the K-means labels computed earlier.
X = X_train
y_num = predictions

# Keep as many components as there are features.
n_components = X.shape[1]

pca = PCA(n_components=n_components, random_state = 420)
pca.fit(X)
X_r = pca.transform(X)


# Total variance and the 95% threshold derived from it
total_variance = sum(pca.explained_variance_)
var_95 = total_variance * 0.95
print("Total Variance in our dataset is: ", total_variance)
print("The 95% variance we want to have is: ", var_95)
print("")

# Table of component index -> explained variance
variance_table = pd.DataFrame(
    {
        "PCA Comp": range(0, n_components),
        "Explained Variance": pca.explained_variance_,
    }
)

# Cumulative variance of the first two / first three components, to be
# compared by eye with the 95% threshold printed above.
explained = variance_table["Explained Variance"]
print("Variance explain with 2 n_compononets: ", sum(explained[0:2]))
print("Variance explain with 3 n_compononets: ", sum(explained[0:3]))



# Plot the explained-variance ratio per component
plt.figure(1, figsize=(14, 8))
plt.plot(pca.explained_variance_ratio_, linewidth=2, c="r")
plt.xlabel('n_components')
plt.ylabel('explained_ratio_')

plt.show()

In [None]:
# 2-D PCA projection of the clustering features, coloured by K-means label.
# `PCA` is imported by the explained-variance cell that precedes this one.
X = X_train
y_num = predictions

target_names = ["Cluster 0","Cluster 1","Cluster 2","Cluster 3"]

pca = PCA(n_components=2, random_state = 42)
X_r = pca.fit(X).transform(X)


# Percentage of variance explained for each components
print('Explained variance ratio (first two components): %s' % str(pca.explained_variance_ratio_))

# Plotting the data.
# A single figure is created: the former extra `plt.figure()` call only
# produced an empty figure in the cell output.
plt.figure(figsize=(12,8))
colors = ['navy', 'turquoise', 'darkorange', 'red']
lw = 2


# One scatter series per cluster label; labels are 0..len(target_names)-1.
for i, (color, target_name) in enumerate(zip(colors, target_names)):
    in_cluster = y_num == i
    plt.scatter(X_r[in_cluster, 0], X_r[in_cluster, 1], color=color, alpha=.8, lw=lw, label=target_name)

# Legend placed outside the axes, to the right. The earlier
# `plt.legend(loc='best', ...)` call was dropped because this call replaced
# it anyway.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.6)
plt.title('PCA of 2 Items')
plt.show()

In [12]:
# Tag every client with the K-means label of its row: predictions has one
# entry per row of raw_data, in the same row order.
raw_data['cluster'] = predictions

In [20]:
# Preprocessing categorical: keep the object-dtype columns of raw_data; their
# names drive the per-cluster percentage breakdown further down.
categorical = raw_data.select_dtypes(include='object')

In [24]:
# function to calculate all the categorical percentage

def cal_percentage(df,x):
    """Print the percentage share of every distinct value in one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to summarise.
    x : str
        Name of the column in ``df``.

    Returns
    -------
    None
        The breakdown is printed, not returned.
    """
    # Relative frequencies (0..1) scaled to percentages.
    shares = df[x].value_counts(normalize=True).mul(100)
    print(f'The percentage of {x} is: \n {shares}')

In [None]:
# One frame per cluster label, holding every customer assigned to it.
churn_rate_cluster0 = raw_data.loc[raw_data['cluster']== 0]
churn_rate_cluster1 = raw_data.loc[raw_data['cluster']== 1]
churn_rate_cluster2 = raw_data.loc[raw_data['cluster']== 2]
churn_rate_cluster3 = raw_data.loc[raw_data['cluster']== 3]
# Report on all four clusters (previously only cluster 2 was in the list).
clusters_frames= [churn_rate_cluster0, churn_rate_cluster1, churn_rate_cluster2, churn_rate_cluster3]

for cluster_id, df in enumerate(clusters_frames):
    attrited = int((df['Attrition_Flag'] == 'Attrited Customer').sum())
    # Churn rate OF THE CLUSTER: attrited customers over the cluster's own
    # size (guarded against an empty cluster).
    churn_rate = attrited*100/len(df) if len(df) else float('nan')
    # The figure this cell used to print as "churn rate": attrited customers
    # of the cluster as a share of ALL customers.
    share_of_all = attrited*100/len(raw_data)
    print(f'The churn rate of cluster {cluster_id} is:  {churn_rate}')
    print(f'Attrited customers of cluster {cluster_id} as a share of all customers:  {share_of_all}')
    # Percentage breakdown of every categorical column inside this cluster.
    for f in categorical.columns:
        cal_percentage(df,f)
    # Separator between two cluster reports.
    print('####################################################################')

In [56]:
from tabulate import tabulate

# Character table for the first cluster: one (characteristic, information)
# pair per row, written as a mapping and then turned into table rows.
cluster0_profile = {
    "Churn-rate": '10.53%',
    "Gender": "53% Female",
    "Education level": "31% Graduated then 20% High school",
    "Marital Status": "46% Marrid and 39% singels",
    "Income Category": "33% less than 40k and 20% 40k-60k",
    "Card Category": "97% Blue",
}
cluster0_data = [[label, info] for label, info in cluster0_profile.items()]

# Column headers of the rendered table
col_names = ["Charachteristics", "Information"]

# display table
print('The charachtersitics for first cluster are : \n',tabulate(cluster0_data, headers=col_names))

The charachtersitics for first cluster are : 
 Charachteristics    Information
------------------  ----------------------------------
Churn-rate          10.53%
Gender              53% Female
Education level     31% Graduated then 20% High school
Marital Status      46% Marrid and 39% singels
Incom Category      33% less than 40k and 20% 40k-60k
Card Category       97% Blue


In [57]:
# Character table for the second cluster: one (characteristic, information)
# pair per row, written as a mapping and then turned into table rows.
cluster1_profile = {
    "Churn-rate": '0.32%',
    "Gender": "60% Male",
    "Education level": "31% Graduated and 21% High school",
    "Marital Status": "45% Marrid and 39% singels",
    "Incom Category": "29% less than 40k and 18% for 40k-60k, 60k-80k and 80k-120k",
    "Card Category": "79% Blue and 16% Silver",
}
cluster1_data = [[label, info] for label, info in cluster1_profile.items()]

# Column headers of the rendered table
col_names = ["Charachteristics", "Information"]

# display table
print('The charachtersitics for second cluster are : \n',tabulate(cluster1_data, headers=col_names))

The charachtersitics for second cluster are : 
 Charachteristics    Information
------------------  -----------------------------------------------------------
Churn-rate          0.32%
Gender              60% Male
Education level     31% Graduated and 21% High school
Marital Status      45% Marrid and 39% singels
Incom Category      29% less than 40k and 18% for 40k-60k, 60k-80k and 80k-120k
Card Category       79% Blue and 16% Silver


In [58]:
# Character table for the third cluster (K-means label 2).
# Stored under its own name: this cell used to assign to `cluster1_data`,
# which silently overwrote the second cluster's table.
cluster2_data = [["Churn-rate",'2.95%'],
                 ["Gender","72% Female" ],
                 ["Education level","31% Graduated , 20% High school and 14% uneducated"],
                 ["Marital Status","49% Marrid  and 40% Singel"],
                 ["Incom Category", "54% less than 40k and 21% 40k-60k"],
                 ["Card Category", "100% Blue"]]

# Column headers of the rendered table
col_names = ["Charachteristics", "Information"]
#display table
print('The charachtersitics for third cluster are : \n',tabulate(cluster2_data, headers=col_names))

The charachtersitics for third cluster are : 
 Charachteristics    Information
------------------  --------------------------------------------------
Churn-rate          2.95%
Gender              72% Female
Education level     31% Graduated , 20% High school and 14% uneducated
Marital Status      49% Marrid  and 40% Singel
Incom Category      54% less than 40k and 21% 40k-60k
Card Category       100% Blue


In [59]:
# Character table for the fourth cluster (K-means label 3).
# Stored under its own name: this cell used to assign to `cluster1_data`,
# which silently overwrote the second cluster's table.
cluster3_data = [["Churn-rate",'2.25%'],
                 ["Gender","87% Male" ],
                 ["Education level","30% Graduated , 19% High school and 16% uneducated"],
                 ["Marital Status","43% singels and 40% Marrid"],
                 ["Incom Category", "40% 80k-120k and 26% 120k"],
                 ["Card Category", "71% Blue and 23% Silver"]]

# Column headers of the rendered table
col_names = ["Charachteristics", "Information"]
#display table
print('The charachtersitics for fourth cluster are : \n',tabulate(cluster3_data, headers=col_names))

The charachtersitics for fourth cluster are : 
 Charachteristics    Information
------------------  --------------------------------------------------
Churn-rate          2.25%
Gender              87% Male
Education level     30% Graduated , 19% High school and 16% uneducated
Marital Status      43% singels and 40% Marrid
Incom Category      40% 80k-120k and 26% 120k
Card Category       71% Blue and 23% Silver


In [52]:
# Saving the fitted K-means model to disk with joblib

from joblib import dump

# The original local path stays the default; set the KMEANS_MODEL_PATH
# environment variable to write the model elsewhere without editing this cell.
model_path = os.environ.get(
    "KMEANS_MODEL_PATH",
    'C:\\Users\\user\\Desktop\\AI projects\\Churn-prediction\\ChurnProject\\churn_main\\KmeansModel.joblib',
)
# dump() returns the list of files written, which is shown as the cell output.
dump(km, model_path)


['C:\\Users\\user\\Desktop\\AI projects\\Churn-prediction\\ChurnProject\\churn_main\\KmeansModel.joblib']