In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

##Library 

In [None]:
import pandas as pd 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import plotly
import plotly.figure_factory as ff
from plotly.offline import plot,iplot,download_plotlyjs

from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objs as go
import cufflinks as cf

cf.set_config_file(sharing='public',theme='white',offline=True)

import plotly.io as pio
pio.renderers.default = 'colab'

from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import  RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import ADASYN,SMOTE
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report,recall_score,f1_score,precision_score,confusion_matrix

from sklearn.pipeline import Pipeline

##Data reload

In [None]:
# Load the BankChurners CSV from the Kaggle input directory into `df`.
df = pd.read_csv('/kaggle/input/credit-card-customers/BankChurners.csv')

In [None]:
# Display the loaded frame (the last expression of a cell is rendered as its output).
df

### Data description

CLIENTNUM - ID of the customer holding the credit card.

Customer_Age - Age of the customer.

Gender - Sex of the customer.

Dependent_count - Number of dependents of the customer.

Education_Level - Educational qualification of the customer.

Marital_Status - Civil status of the customer.

Income_Category - Annual income range of the customer.

Card_Category - Type of card owned by the customer.

Months_on_book - Number of months elapsed since the account opening.

Total_Relationship_Count - Total number of products held by the customer.

Months_Inactive_12_mon - Number of months with no transactions in the last year.

Contacts_Count_12_mon - Number of contacts with the bank in the last year.

Credit_Limit - Credit limit on the credit card.

Total_Revolving_Bal - Total revolving balance on the credit card.

Avg_Open_To_Buy - Average card "Open To Buy" (=credit limit - account balance) in the last year.

Total_Amt_Chng_Q4_Q1 - Change in transaction amount over the last year (Q4 over Q1).

Total_Trans_Amt - Total amount of transactions made in the last year.

Total_Trans_Ct - Number of transactions made in the last year.

Total_Ct_Chng_Q4_Q1 - Change in transaction number over the last year (Q4 over Q1).

Avg_Utilization_Ratio - Average card "Utilization ratio" (=account balance / credit limit) in the last year.

Attrition_Flag - Target variable. "Attrited Customer" if the customer closed their account, otherwise "Existing Customer".





##Data Checking



> **Any data must be checked for missing values.**

Ensuring the quality of the data set greatly affects the performance of the model.


> The filling method should be judged according to the **Real-world Business Scenario.**

> Filling methods：

*   Default value
*   Mean value
*   Mode
*   KNN filling
*   Predicting through the model(Random Forest) as label







In [None]:
# Number of missing (NaN) values in each column.
df.isnull().sum()

Fortunately, we don't have a null value.


According to the dataset description, the last two columns (the two Naive_Bayes_Classifier_* columns) should be discarded, so we drop them.

In [None]:
# Keep every column except the last two. `.copy()` makes df1 an independent
# frame: later cells assign to df1's columns, and doing that on a bare slice of
# `df` raises SettingWithCopyWarning and leaves it unclear which frame changed.
df1 = df.iloc[:, :-2].copy()
df1.info()

#Exploratory Data Analysis 


> This part is divided into three parts:
1.  Categorical data visualization
2.  Numerical data visualization
3.  Pearson & Spearman correlation coefficients


> Check the data structure of the dataset.
There are 21 columns in this dataset (the 21 fields listed in the data description above).
We need to know which attributes are numerical and which are categorical.



In [None]:
# Columns stored as float64 or int64, followed by one randomly sampled row.
Numbercial_features = df1.select_dtypes(include=['float64', 'int64'])
Numbercial_features.sample(n=1)

In [None]:
# Every column that is NOT float64/int64, followed by one randomly sampled row.
Categorical_features = df1.select_dtypes(exclude=['float64', 'int64'])
Categorical_features.sample(n=1)

Categorical data:


Attrition_Flag, 
Gender, 
Education_Level, 
Marital_Status, 
Income_Category and Card_Category.

All the remaining data are numerical values.

## Categorical Data Visualization


###Churn Status/Attrition_Flag



In [None]:
# Donut chart of the churn label, then the total row count and per-class counts.
fig = px.pie(
    df1,
    names='Attrition_Flag',
    title='Percentage of Existing and Attrited Customers',
    hole=0.3,
)
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

attrition_column = df1['Attrition_Flag']
print('Total number of customers:', attrition_column.count())
print(attrition_column.value_counts())

There are 1,627 attrited customers and 8,500 existing customers in the dataset, so the classes are clearly imbalanced (roughly 16% vs. 84%).

###Gender

In [None]:
# Count plot of gender split by churn status, plus the same counts as text for
# attrited customers, existing customers and all customers.
Gender = sns.countplot(data=df1, x='Gender', hue='Attrition_Flag', palette='Set2')
Gen_att = df1.loc[df1['Attrition_Flag'].eq('Attrited Customer'), 'Gender']
Gen_ex = df1.loc[df1['Attrition_Flag'].eq('Existing Customer'), 'Gender']

gender_report = [
    ('Gender of Attrited customer:\n', Gen_att),
    ('Gender of Existing customer:\n', Gen_ex),
    ('Gender of Total customer:\n', df1.Gender),
]
for position, (heading, values) in enumerate(gender_report):
    print(heading, values.value_counts())
    # A dashed rule separates the sections (none after the last one).
    if position < len(gender_report) - 1:
        print('-----------------------------------------------------------')

###Marital Status 


In [None]:
# Marital-status counts for attrited customers, existing customers and everyone.
Mar_att = df1.loc[df1['Attrition_Flag'].eq('Attrited Customer'), 'Marital_Status']
Mar_ex = df1.loc[df1['Attrition_Flag'].eq('Existing Customer'), 'Marital_Status']

marital_report = [
    ('Marital status of Attrited customer:\n', Mar_att),
    ('Marital status of Existing customer:\n', Mar_ex),
    ('Marital status of Total customer:\n', df1.Marital_Status),
]
for position, (heading, values) in enumerate(marital_report):
    print(heading, values.value_counts())
    # A dashed rule separates the sections (none after the last one).
    if position < len(marital_report) - 1:
        print('-----------------------------------------------------------')

In [None]:
# Three donut charts of marital status: all customers (left, spanning both
# rows), existing customers (top right) and attrited customers (bottom right).
fig = make_subplots(
    rows=2, cols=2,
    # One title per populated cell; the bottom-left cell is empty (None).
    subplot_titles=('Total Customer', 'Existing Customers', 'Attrited Customers'),
    vertical_spacing=0.09,
    specs=[[{"type": "pie", "rowspan": 2}, {"type": "pie"}],
           [None, {"type": "pie"}]],
)

# (rows to count, per-slice pull, subplot row, subplot column)
marital_panels = [
    (df1, [0, 0.01, 0.03, 0.03], 1, 1),
    (df1.query('Attrition_Flag=="Existing Customer"'), [0, 0.01, 0.05, 0.05], 1, 2),
    (df1.query('Attrition_Flag=="Attrited Customer"'), [0, 0.01, 0.05, 0.05], 2, 2),
]
for subset, pull, row, col in marital_panels:
    counts = subset.Marital_Status.value_counts()
    fig.add_trace(
        # Labels come from the counted index so each slice is always paired with
        # its own category, and identical categories share one legend entry
        # across the three charts.
        go.Pie(labels=counts.index, values=counts.values, pull=pull, hole=0.3),
        row=row, col=col,
    )

fig.update_layout(
    height=700,
    showlegend=True,
    title_text="<b>Marital Status</b>",
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

The interesting point is that nearly 39% of customers are single.

### Education Level

In [None]:
# Education-level counts for attrited customers, existing customers and everyone.
Edu_att = df1.loc[df1['Attrition_Flag'].eq('Attrited Customer'), 'Education_Level']
Edu_ex = df1.loc[df1['Attrition_Flag'].eq('Existing Customer'), 'Education_Level']

education_report = [
    ('Education Level of Attrited customer:\n', Edu_att),
    ('Education Level of Existing customer:\n', Edu_ex),
    ('Education Level of Total customer:\n', df1.Education_Level),
]
for position, (heading, values) in enumerate(education_report):
    print(heading, values.value_counts())
    # A dashed rule separates the sections (none after the last one).
    if position < len(education_report) - 1:
        print('-----------------------------------------------------------')

In [None]:
# Three donut charts of education level: all customers (left, spanning both
# rows), existing customers (top right) and attrited customers (bottom right).
fig = make_subplots(
    rows=2, cols=2,
    # One title per populated cell; the bottom-left cell is empty (None).
    subplot_titles=('Total Customer', 'Existing Customers', 'Attrited Customers'),
    vertical_spacing=0.09,
    specs=[[{"type": "pie", "rowspan": 2}, {"type": "pie"}],
           [None, {"type": "pie"}]],
)

# (rows to count, per-slice pull, subplot row, subplot column)
education_panels = [
    (df1, [0, 0.01, 0.03, 0.03], 1, 1),
    (df1.query('Attrition_Flag=="Existing Customer"'), [0, 0.01, 0.05, 0.05], 1, 2),
    (df1.query('Attrition_Flag=="Attrited Customer"'), [0, 0.01, 0.05, 0.05], 2, 2),
]
for subset, pull, row, col in education_panels:
    counts = subset.Education_Level.value_counts()
    fig.add_trace(
        # Labels come from the counted index: the frequency order differs between
        # the subsets, so a fixed label list can mislabel slices.
        go.Pie(labels=counts.index, values=counts.values, pull=pull, hole=0.3),
        row=row, col=col,
    )

fig.update_layout(
    height=700,
    showlegend=True,
    title_text="<b>Education Level</b>",
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

Looking at education level: if we treat "Unknown" as uneducated, about 70% of our customers have received formal education and about 30% have received higher education. Only about 10% of the customers hold a post-graduate or doctoral degree.

###Income

In [None]:
# Income-category counts for attrited customers, existing customers and everyone.
Income_att = df1.loc[df1['Attrition_Flag'].eq('Attrited Customer'), 'Income_Category']
Income_ex = df1.loc[df1['Attrition_Flag'].eq('Existing Customer'), 'Income_Category']

income_report = [
    ('Income of Attrited customer:\n', Income_att),
    ('Income of Existing customer:\n', Income_ex),
    ('Income of Total customer:\n', df1.Income_Category),
]
for position, (heading, values) in enumerate(income_report):
    print(heading, values.value_counts())
    # A dashed rule separates the sections (none after the last one).
    if position < len(income_report) - 1:
        print('-----------------------------------------------------------')

In [None]:
# Three donut charts of income category: all customers (left, spanning both
# rows), existing customers (top right) and attrited customers (bottom right).
fig = make_subplots(
    rows=2, cols=2,
    # One title per populated cell; the bottom-left cell is empty (None).
    subplot_titles=('Total Customer', 'Existing Customers', 'Attrited Customers'),
    vertical_spacing=0.09,
    specs=[[{"type": "pie", "rowspan": 2}, {"type": "pie"}],
           [None, {"type": "pie"}]],
)

# (income values to count, per-slice pull, subplot row, subplot column)
income_panels = [
    (df1.Income_Category, [0, 0.01, 0.03, 0.03], 1, 1),
    (Income_ex, [0, 0.01, 0.05, 0.05], 1, 2),
    (Income_att, [0, 0.01, 0.05, 0.05], 2, 2),
]
for incomes, pull, row, col in income_panels:
    counts = incomes.value_counts()
    fig.add_trace(
        # Labels come from the counted index so each slice is always paired with
        # its own category, spelled exactly as in the data.
        go.Pie(labels=counts.index, values=counts.values, pull=pull, hole=0.3),
        row=row, col=col,
    )

fig.update_layout(
    height=700,
    showlegend=True,
    title_text="<b>Income_Category</b>",
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

Approximately 35%  of customers' income is less than 40K annual salary.

###Income category and card type

In [None]:
#Let's view the card type
# Number of customers per card type, most frequent first.
df1.Card_Category.value_counts()

In [None]:
def _cards_for(income, flag=None):
    """Return the Card_Category values of customers in one income band.

    income: exact value of Income_Category to select.
    flag:   optional value of Attrition_Flag; when given, only customers with
            that churn status are kept.
    """
    mask = df1['Income_Category'] == income
    if flag is not None:
        mask = mask & (df1['Attrition_Flag'] == flag)
    return df1.loc[mask, 'Card_Category']


Less40K_all = _cards_for('Less than $40K')
Less40K_ex = _cards_for('Less than $40K', 'Existing Customer')
Less40K_att = _cards_for('Less than $40K', 'Attrited Customer')

Between4060k_all = _cards_for('$40K - $60K')
Between4060k_ex = _cards_for('$40K - $60K', 'Existing Customer')
Between4060k_att = _cards_for('$40K - $60K', 'Attrited Customer')

Between6080k_all = _cards_for('$60K - $80K')
Between6080k_ex = _cards_for('$60K - $80K', 'Existing Customer')
Between6080k_att = _cards_for('$60K - $80K', 'Attrited Customer')

Between80120k_all = _cards_for('$80K - $120K')
Between80120k_ex = _cards_for('$80K - $120K', 'Existing Customer')
Between80120k_att = _cards_for('$80K - $120K', 'Attrited Customer')

Over120k_all = _cards_for('$120K +')
Over120k_ex = _cards_for('$120K +', 'Existing Customer')
Over120k_att = _cards_for('$120K +', 'Attrited Customer')

# The unknown-income group must filter on 'Unknown'; filtering on '$120K +'
# here would just duplicate the Over120k_* series.
Unknown_all = _cards_for('Unknown')
Unknown_ex = _cards_for('Unknown', 'Existing Customer')
Unknown_att = _cards_for('Unknown', 'Attrited Customer')

# (phrase used in the headings, attrited, existing, all)
_income_card_report = [
    ('less than 40K', Less40K_att, Less40K_ex, Less40K_all),
    ('between 40k-60k', Between4060k_att, Between4060k_ex, Between4060k_all),
    ('between 60k-80k', Between6080k_att, Between6080k_ex, Between6080k_all),
    ('between 80k-120k', Between80120k_att, Between80120k_ex, Between80120k_all),
    ('higher than 120k', Over120k_att, Over120k_ex, Over120k_all),
    ('Unknown', Unknown_att, Unknown_ex, Unknown_all),
]
for _position, (_phrase, _attrited, _existing, _everyone) in enumerate(_income_card_report):
    # A starred rule separates consecutive income groups.
    if _position:
        print('***********************************************************')
    print('Types of Attrited customers whose income is {}:\n'.format(_phrase), _attrited.value_counts())
    print('-----------------------------------------------------------')
    print('Types of Existing customers whose income is {}:\n'.format(_phrase), _existing.value_counts())
    print('-----------------------------------------------------------')
    print('Types of Total customers whose income is {}:\n'.format(_phrase), _everyone.value_counts())


In [None]:
# 3x2 grid of bar charts: card-type counts for each income group.
specs = [[{'type': 'xy'}, {'type': 'xy'}],
         [{'type': 'xy'}, {'type': 'xy'}],
         [{'type': 'xy'}, {'type': 'xy'}]]
income_titles = ['Types with income less than 40K',
                 'Types with income between 40K-60k',
                 'Types with income between 60k-80k',
                 'Types with income between 80k-120k',
                 'Types with income higher than 120k',
                 'Unknown']
fig = make_subplots(rows=3, cols=2, specs=specs, subplot_titles=income_titles)

# Card-type series in the same row-major order as the subplot titles.
income_card_series = [Less40K_all, Between4060k_all, Between6080k_all,
                      Between80120k_all, Over120k_all, Unknown_all]
for position, (panel_title, cards) in enumerate(zip(income_titles, income_card_series)):
    counts = cards.value_counts()
    fig.add_trace(
        # x comes from the counted index, so every bar is labelled with its own
        # card type even when a group lacks a card type or ranks them differently.
        go.Bar(x=counts.index, y=counts.values, name=panel_title),
        row=position // 2 + 1, col=position % 2 + 1,
    )

fig.update_layout(
    height=1400,
    showlegend=True,
    title_text="<b>Income_Category</b>",
)

fig.update_traces(textposition='inside')
fig.show()

###Card Category

In [None]:
# Donut chart of card types, then the total row count and per-type counts.
fig = px.pie(
    df1,
    names='Card_Category',
    title='Percentage of Card type',
    hole=0.3,
)
fig.update_traces(textinfo='percent+label', textposition='outside')
fig.show()

card_column = df1['Card_Category']
print('Total number of customers:', card_column.count())
print(card_column.value_counts())

## Numerical Data Visualization


1.   Age
2.   Number of products held by customer
3.   Number of dependents of customer
4.   Number of months with no transactions in the last year
5.   Credit limit
6.   Months of account
7.   Revolving balance on the credit card
8.   Amount of transactions made in last year
9.   Number of transactions made in last year
10.  Change in transaction number over the last year (Q4 over Q1)



###Age

In [None]:
# Age distribution: put the ages of existing and attrited customers side by
# side. Both series keep their original row labels, so the column-wise concat
# aligns on the index and leaves NaN where a customer is not in that group.
Age_exi = df1.loc[df1['Attrition_Flag'].eq('Existing Customer'), 'Customer_Age']
Age_att = df1.loc[df1['Attrition_Flag'].eq('Attrited Customer'), 'Customer_Age']
Total_Age = pd.concat(
    [Age_exi, Age_att],
    axis=1,
    keys=['Existing Customer', 'Attrited Customer'],
)
Total_Age

In [None]:
# Overlaid, percentage-normalised age histograms for the two customer groups.
Total_Age.iplot(
    kind='hist',
    keys=['Attrited Customer', 'Existing Customer'],
    colors=['grey', 'red'],
    histnorm='percent',
    opacity=0.5,
    bins=40,
    title="Customers' age",
    xTitle='Age',
    yTitle='% customers',
)

The customer age distribution appears to be approximately normal.

In [None]:
# First five rows of the numeric columns selected earlier.
Numbercial_features.head()

###Number of products held by customer

In [None]:
# Box plot (every point drawn) of products held, split by churn status.
fig = px.box(
    df,
    y="Total_Relationship_Count",
    color="Attrition_Flag",
    points="all",
    title='Number of products held by customer',
)
fig.show()

In [None]:
# Histogram of products held, coloured by churn status.
fig = px.histogram(
    df,
    x="Total_Relationship_Count",
    color="Attrition_Flag",
    title='Number of products held by customer',
)
fig.show()

###Number of dependents of customer

In [None]:
# Box plot (every point drawn) of dependents, split by churn status.
fig = px.box(
    df,
    y="Dependent_count",
    color="Attrition_Flag",
    points="all",
    title='Number of dependents of the customer',
)
fig.show()

In [None]:
# Histogram of dependents, coloured by churn status.
fig = px.histogram(
    df,
    x="Dependent_count",
    color="Attrition_Flag",
    title='Number of dependents of the customer',
)
fig.show()

###Number of months with no transactions in the last year.

In [None]:
# Box plot (every point drawn) of inactive months, split by churn status.
fig = px.box(
    df,
    y="Months_Inactive_12_mon",
    color="Attrition_Flag",
    points="all",
    title='Number of months with no transactions in the last year',
)
fig.show()

In [None]:
# Histogram of inactive months, coloured by churn status.
fig = px.histogram(
    df,
    x="Months_Inactive_12_mon",
    color="Attrition_Flag",
    title='Number of months with no transactions in the last year',
)
fig.show()

###Credit limit

In [None]:
# Box plot (every point drawn) of the credit limit, split by churn status.
fig = px.box(
    df,
    y="Credit_Limit",
    color="Attrition_Flag",
    points="all",
    title='Credit limit on the credit card',
)
fig.show()

In [None]:
# Histogram of the credit limit, coloured by churn status.
fig = px.histogram(
    df,
    x="Credit_Limit",
    color="Attrition_Flag",
    title='Credit limit on the credit card',
)
fig.show()

###Months of account

In [None]:
# Box plot (every point drawn) of account age in months, split by churn status.
fig = px.box(
    df,
    y="Months_on_book",
    color="Attrition_Flag",
    points="all",
    title='Number of months elapsed since the account opening',
)
fig.show()

In [None]:
# Histogram of account age in months, coloured by churn status.
fig = px.histogram(
    df,
    x="Months_on_book",
    color="Attrition_Flag",
    title='Number of months elapsed since the account opening',
)
fig.show()

###Revolving balance on the credit card.

In [None]:
# Box plot (every point drawn) of the revolving balance, split by churn status.
fig = px.box(
    df,
    y="Total_Revolving_Bal",
    color="Attrition_Flag",
    points="all",
    title='Total revolving balance on the credit card',
)
fig.show()

In [None]:
# Histogram of the revolving balance, coloured by churn status.
fig = px.histogram(
    df,
    x="Total_Revolving_Bal",
    color="Attrition_Flag",
    title='Total revolving balance on the credit card',
)
fig.show()

###Number of transactions made in last year

In [None]:
# Show one randomly chosen row of df1.
df1.sample()

In [None]:
# Box plot (every point drawn) of the yearly transaction count, split by churn
# status. `fig` is assigned once; a chained `fig = fig = ...` adds nothing.
fig = px.box(
    df,
    y="Total_Trans_Ct",
    color="Attrition_Flag",
    points="all",
    title='Number of transactions made in the last year',
)
fig.show()

In [None]:
# Histogram of the yearly transaction count, coloured by churn status.
# No argument-less `fig.update()` call here: it changes nothing on the figure.
fig = px.histogram(
    df,
    x="Total_Trans_Ct",
    color="Attrition_Flag",
    title='Number of transactions made in the last year',
)
fig.show()

###Amount of transactions made in last year

In [None]:
# Show one randomly chosen row of df1.
df1.sample()

In [None]:
# Box plot (every point drawn) of the yearly transaction amount, split by churn
# status. `fig` is assigned once; a chained `fig = fig = ...` adds nothing.
fig = px.box(
    df,
    y="Total_Trans_Amt",
    color="Attrition_Flag",
    points="all",
    title='Total amount of transactions made in the last year',
)
fig.show()

In [None]:
# Histogram of the yearly transaction amount, coloured by churn status.
fig = px.histogram(
    df,
    x="Total_Trans_Amt",
    color="Attrition_Flag",
    title='Total amount of transactions made in the last year',
)
fig.show()

###Change in transaction number over the last year (Q4 over Q1).

In [None]:
# Box plot (every point drawn) of the Q4-over-Q1 change in transaction count,
# split by churn status. `fig` is assigned once; a chained `fig = fig = ...`
# adds nothing.
fig = px.box(
    df,
    y="Total_Ct_Chng_Q4_Q1",
    color="Attrition_Flag",
    points="all",
    title='Change in transaction number over the last year (Q4 over Q1)',
)
fig.show()

In [None]:
# Histogram of the Q4-over-Q1 change in transaction count, coloured by churn status.
fig = px.histogram(
    df,
    x="Total_Ct_Chng_Q4_Q1",
    color="Attrition_Flag",
    title='Change in transaction number over the last year (Q4 over Q1)',
)
fig.show()

##Pearson Correlation & Spearman Correlation
The value range of Pearson's correlation coefficient is [-1, 1]:


*   When it is close to 1, it means that the two have a strong positive correlation.
*   When it is close to -1, it indicates that there is a strong negative correlation.

*   If the value is close to 0, the two variables have no linear relationship. A coefficient near 0 does not rule out a non-linear dependency, however; a monotonic (rank-based) relationship can be checked with the Spearman coefficient.


We need to convert qualitative variables into quantitative variables at the very beginning.

In [None]:
# Encode the two binary columns as integers:
#   Attrition_Flag: Existing Customer -> 0, Attrited Customer -> 1
#   Gender:         M -> 0, F -> 1
df1['Attrition_Flag'] = df1['Attrition_Flag'].replace({'Existing Customer': 0, 'Attrited Customer': 1})
df1['Gender'] = df1['Gender'].replace({'M': 0, 'F': 1})

# Class sizes are counted on df1 itself, the frame that was just encoded,
# instead of indexing the raw `df` with a mask built from df1.
print("# existing customers: {}\n".format((df1['Attrition_Flag'] == 0).sum()))
print("# attrited customers: {}\n".format((df1['Attrition_Flag'] == 1).sum()))


>   Obviously, this is an unbalanced data set. We will then upsample the original data to obtain a balanced data set.



>   The previous data exploration and visualization showed that there are no missing values in this data, but some values in the categorical columns are 'Unknown'. To some extent, 'Unknown' can be regarded as a missing value.

>   For the 'Unknown' class processing depends on the actual business scenario. Sometimes it is possible to make 'Unknown' into a new class.

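> In this data set we do not keep 'Unknown' as a class of its own: after one-hot encoding, the 'Unknown' indicator column is dropped, so those customers stay in the data and are represented by zeros in all indicator columns of that feature.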







In [None]:
# One-hot encode the four remaining categorical columns. For the three columns
# that contain an 'Unknown' category, its indicator column is dropped.
education_dummies = pd.get_dummies(df1['Education_Level']).drop(columns=['Unknown'])
marital_dummies = pd.get_dummies(df1['Marital_Status']).drop(columns=['Unknown'])
income_dummies = pd.get_dummies(df1['Income_Category']).drop(columns=['Unknown'])
card_dummies = pd.get_dummies(df1['Card_Category'])

# Append the indicator columns in the order education, marital status, income,
# card type, then remove the encoded source columns and the customer id.
df1 = pd.concat(
    [df1, education_dummies, marital_dummies, income_dummies, card_dummies],
    axis=1,
)
df1 = df1.drop(
    columns=['Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category', 'CLIENTNUM']
)

In [None]:
# First five rows of df1 after the encoding step.
df1.head()

In [None]:
# Pearson correlation matrix of all df1 columns, drawn as a heatmap.
# The matrix gets its own, correctly named variable; `df1_spearman_correlation`
# is assigned by the Spearman cell.
df1_pearson_correlation = df1.corr(method='pearson')

fig = go.Figure(data=go.Heatmap(
    x=df1_pearson_correlation.columns,
    y=df1_pearson_correlation.index,
    z=df1_pearson_correlation.values,
    name='pearson', showscale=True, xgap=1, ygap=1,
    colorscale='Blackbody'))
fig.update_layout(height=700, width=900, title_text="<b>Pearson Correlation</b>")
fig.show()

In [None]:
# Spearman (rank) correlation matrix of all df1 columns, drawn as a heatmap.
df1_spearman_correlation = df1.corr(method='spearman')

spearman_trace = go.Heatmap(
    z=df1_spearman_correlation.values,
    x=df1_spearman_correlation.columns,
    y=df1_spearman_correlation.index,
    colorscale='Blackbody',
    showscale=True,
    xgap=1,
    ygap=1,
    name='spearman',
)
fig = go.Figure(data=spearman_trace)
fig.update_layout(title_text="<b>Spearman Correlations<b>", height=700, width=900)
fig.show()

#Classification Model


> This part is divided into six steps:
1.  Training Model in RAW Dataset
2.  SMOTE
3.  Cross-validation
4.  Training Model in Upsample Dataset
5.  Tuning parameters
6.  Feature Selection

> Experimental algorithm:
*   Random Forest
*    Support Vector Machine
*   Gradient Boosting

> Compare the performance of algorithms on **raw data** and **up-sampled data**.

##Training Model in RAW Dataset

In [None]:
# First ten rows of the fully encoded frame used for modelling.
df1.head(10)

In [None]:
# Features = every column except the target. Selecting by name (not by
# position) guarantees the target can never end up among the features,
# whatever the column order of df1 is.
x_RAW = df1.drop(columns=['Attrition_Flag'])
y_RAW = df1['Attrition_Flag']

# 80/20 train/test split with a fixed seed.
x_train_RAW, x_test_RAW, y_train_RAW, y_test_RAW = train_test_split(
    x_RAW, y_RAW, test_size=0.2, random_state=42
)

In [None]:
# Standardise the features. The scaler learns mean and variance from the
# training split only; the test split is transformed with those same
# statistics, so both splits share one scale and nothing is learned from the
# test data.
SC = StandardScaler()
x_train_RAW = SC.fit_transform(x_train_RAW)
x_test_RAW = SC.transform(x_test_RAW)

###RandomForest Classifier

In [None]:
# Random forest on the raw (imbalanced) training split. The seed is fixed, as
# for the SVM and gradient-boosting models, so reruns give the same forest.
RF_RAW = RandomForestClassifier(random_state=42)
RF_RAW.fit(x_train_RAW, y_train_RAW)

In [None]:
# Predicted class labels of the random forest for the test split.
RF_RAW_pre = RF_RAW.predict(x_test_RAW)

In [None]:
# Confusion matrix of the random forest (rows: true class, columns: predicted).
# fmt='d' prints the cell counts as plain integers; the default annotation
# format would show four-digit counts in scientific notation.
ax = sns.heatmap(confusion_matrix(y_test_RAW, RF_RAW_pre), annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:


# ROC curve and AUC of the random forest on the test split.
# The curve is built from the predicted probability of class 1, so it sweeps
# over many thresholds; hard 0/1 predictions would yield a single operating
# point and an AUC that only reflects that one threshold.
RF_RAW_scores = RF_RAW.predict_proba(x_test_RAW)[:, 1]
RF_RAW_fpr, RF_RAW_tpr, RF_RAW_thresholds = metrics.roc_curve(y_test_RAW, RF_RAW_scores)
RF_RAW_AUC = metrics.auc(RF_RAW_fpr, RF_RAW_tpr)

# classification_report takes (y_true, y_pred); with the arguments swapped,
# precision and recall are reported under each other's name.
print('Random Forest Classifier : \n', classification_report(y_test_RAW, RF_RAW_pre))

fig = px.area(
    x=RF_RAW_fpr, y=RF_RAW_tpr,
    title='AUC of RandomForest on RAW Data: %0.4f' % RF_RAW_AUC,
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=600
)
# Dashed diagonal = performance of a random classifier.
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

###Support Vector Machine

In [None]:
# Single-step pipeline holding an RBF-kernel SVC (the step keeps its "RF"
# label), fitted on the raw training split.
svc_raw_step = ("RF", SVC(kernel='rbf', random_state=42))
SVM_RAW = Pipeline(steps=[svc_raw_step])
SVM_RAW.fit(x_train_RAW, y_train_RAW)

In [None]:
# Predicted class labels of the SVM pipeline for the test split.
SVM_RAW_pre = SVM_RAW.predict(x_test_RAW)

In [None]:
# ROC curve and AUC of the SVM on the test split.
# The SVC is not fitted with probability estimates, so the signed distance to
# the decision boundary is used as the ranking score. Hard 0/1 predictions
# would yield a single operating point instead of a curve.
SVM_RAW_scores = SVM_RAW.decision_function(x_test_RAW)
SVM_RAW_fpr, SVM_RAW_tpr, SVM_RAW_thresholds = metrics.roc_curve(y_test_RAW, SVM_RAW_scores)
SVM_RAW_AUC = metrics.auc(SVM_RAW_fpr, SVM_RAW_tpr)

# classification_report takes (y_true, y_pred); with the arguments swapped,
# precision and recall are reported under each other's name.
print('Support Vector Machine : \n', classification_report(y_test_RAW, SVM_RAW_pre))

fig = px.area(
    x=SVM_RAW_fpr, y=SVM_RAW_tpr,
    title='AUC of SVM on RAW Data: %0.4f' % SVM_RAW_AUC,
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=600
)
# Dashed diagonal = performance of a random classifier.
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

###Gradient Boosting

In [None]:
# Single-step pipeline holding a gradient-boosting classifier made of 100
# depth-1 trees with learning rate 1.0 (the step keeps its 'RF' label).
gboost_raw_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=42,
)
GBoost_RAW = Pipeline(steps=[('RF', gboost_raw_model)])

In [None]:
# Fit the gradient-boosting pipeline on the raw training split.
GBoost_RAW.fit(x_train_RAW,y_train_RAW)

In [None]:
# Predicted class labels of the gradient-boosting pipeline for the test split.
GBoost_RAW_pre = GBoost_RAW.predict(x_test_RAW)

In [None]:
 # Confusion matrix of the gradient-boosting model (rows: true class, columns:
 # predicted). fmt='d' prints the cell counts as plain integers; the default
 # annotation format would show four-digit counts in scientific notation.
 sns.heatmap(confusion_matrix(y_test_RAW,GBoost_RAW_pre), annot=True, fmt='d')
 plt.show()

In [None]:
# ROC curve and AUC of the gradient-boosting model on the test split.
# The curve is built from the predicted probability of class 1, so it sweeps
# over many thresholds; hard 0/1 predictions would yield a single operating
# point and an AUC that only reflects that one threshold.
GBoost_RAW_scores = GBoost_RAW.predict_proba(x_test_RAW)[:, 1]
GBoost_RAW_fpr, GBoost_RAW_tpr, GBoost_RAW_thresholds = metrics.roc_curve(y_test_RAW, GBoost_RAW_scores)
GBoost_RAW_AUC = metrics.auc(GBoost_RAW_fpr, GBoost_RAW_tpr)

# classification_report takes (y_true, y_pred); with the arguments swapped,
# precision and recall are reported under each other's name.
print('Gradient Boosting Classifier : \n', classification_report(y_test_RAW, GBoost_RAW_pre))

fig = px.area(
    x=GBoost_RAW_fpr, y=GBoost_RAW_tpr,
    title='AUC of GBoost on RAW Data: %0.4f' % GBoost_RAW_AUC,
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=600
)
# Dashed diagonal = performance of a random classifier.
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

For models trained on imbalanced data, accuracy alone is misleading, so here we use AUC as the performance indicator.

> From the AUC of the models:
*   Random Forest = 0.8632
*   Support Vector Machine = 0.7692
*   Gradient Boosting = 0.8975


##SMOTE

 There are currently two popular methods for sampling minority categories:

*  (i) Synthetic Minority Oversampling Technique (SMOTE).
*  (ii) Adaptive Synthetic (ADASYN).



> An improved variant of SMOTE is used here: BorderlineSMOTE.

* SMOTE: For the minority sample A, randomly select a nearest neighbor sample B, and then randomly select a point C from the line between A and B as the new minority sample.

* BorderlineSMOTE: Every minority sample is placed in one of three categories according to its nearest neighbors: 1) noise — all neighbors belong to the other (majority) class; 2) in danger — at least half, but not all, of the neighbors belong to the majority class; 3) safe — more than half of the neighbors belong to the minority class. New samples are synthesized only from the "in danger" minority samples.





In [None]:
# First five rows of the encoded frame, before it is split again for resampling.
df1.head()

In [None]:
# Features = every column except the target, selected by name so the target
# can never end up among the features, whatever the column order of df1 is.
x = df1.drop(columns=['Attrition_Flag'])
y = df1['Attrition_Flag']

# Same 80/20 split and seed as for the raw-data models.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Standardise the data. The scaler is (re)fitted on the training split only;
# the test split is transformed with those same statistics, so both splits
# share one scale and nothing is learned from the test data.
x_train = SC.fit_transform(x_train)
x_test = SC.transform(x_test)

We upsample only the training set (80% of the original data). The remaining 20% is left untouched so that the evaluation is done on real, unseen data and no synthetic samples leak into the test set.

In [None]:
from imblearn.over_sampling import BorderlineSMOTE

# Oversample the minority class of the training split only. The seed is fixed
# so the synthetic samples (and everything trained on them) are reproducible.
sm = BorderlineSMOTE(random_state=42)
x_SM, y_SM = sm.fit_resample(x_train, y_train)

In [None]:
# Class counts before (y_train) and after (y_SM) oversampling.
from collections import Counter

for labels in (y_train, y_SM):
    print(Counter(labels))

In [None]:
# Names of the predictor columns: every column of df1 except the target.
feature_names = df1.drop(columns=['Attrition_Flag']).columns.tolist()

# Stack the oversampled training rows on top of the untouched test rows,
# once for the features and once for the labels.
SM_x = np.concatenate([x_SM, x_test], axis=0)
SM_y = np.concatenate([y_SM, y_test], axis=0)

In [None]:
# Assemble one frame with the label in the first column ('Attrition')
# followed by the feature columns, then preview it.
sm_columns = ['Attrition', *feature_names]
sm_values = np.hstack([np.asarray(SM_y).reshape(-1, 1), SM_x])
sm_df = pd.DataFrame(sm_values, columns=sm_columns)
sm_df.head()

##Cross-validation

In [None]:
# Untrained classifiers that will be fitted on the upsampled data.
# The random forest is now seeded like the other two models, so repeated runs
# give the same cross-validation scores and AUC values.
RF_SMOTE = RandomForestClassifier(random_state=42)
# NOTE: the step label 'RF' is kept unchanged in both pipelines for
# compatibility, even though the wrapped estimators are an SVC and a
# gradient-boosting classifier respectively.
SVM_SMOTE = Pipeline(steps=[("RF", SVC(random_state=42, kernel='rbf'))])
GBoost_SMOTE = Pipeline(steps=[('RF', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                                                 max_depth=1, random_state=42))])

In [None]:
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


def _smote_cv_f1(model, folds=10):
    """Return the per-fold F1 scores of `model` with oversampling done inside each fold.

    Cross-validating directly on (x_SM, y_SM) puts synthetic points, which are
    interpolated from real training points, into the validation folds, so the
    scores are over-optimistic. Wrapping the sampler and the model in an
    imbalanced-learn pipeline makes every fold oversample only its own
    training part and validate on untouched real samples.

    Parameters
    ----------
    model : estimator
        Unfitted classifier; cross_val_score clones it for every fold.
    folds : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    numpy.ndarray
        One F1 score per fold.
    """
    resample_then_fit = ImbPipeline(steps=[
        ('smote', BorderlineSMOTE(random_state=42)),
        ('model', model),
    ])
    return cross_val_score(resample_then_fit, x_train, y_train, cv=folds, scoring='f1')


RF_SMOTE_Cross_validation_scores = _smote_cv_f1(RF_SMOTE)
SVM_SMOTE_Cross_validation_scores = _smote_cv_f1(SVM_SMOTE)
GBoost_SMOTE_Cross_validation_scores = _smote_cv_f1(GBoost_SMOTE)

In [None]:
# Disabled plotly version of the cross-validation comparison: the whole figure
# code is wrapped in a triple-quoted string, so nothing below is executed (the
# cell only evaluates to that string). The matplotlib cell that follows draws
# the same three score series.
'''
fig = make_subplots(rows=3, cols=1,shared_xaxes=True,subplot_titles=('Random Forest Cross Val Scores',                                     
                                                                    'SVM Cross Val Scores',
                                                                     'GBoosting Cross Val Scores'))

fig.add_trace(
    go.Scatter(x=list(range(0,len(RF_SMOTE_Cross_validation_scores))),y=RF_SMOTE_Cross_validation_scores,name='Random Forest'),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(x=list(range(0,len(SVM_SMOTE_Cross_validation_scores))),y=SVM_SMOTE_Cross_validation_scores,name='SVM'),
    row=2, col=1
)
fig.add_trace(
    go.Scatter(x=list(range(0,len(GBoost_SMOTE_Cross_validation_scores))),y=GBoost_SMOTE_Cross_validation_scores,name='Gradient Boosting'),
    row=3, col=1
)

fig.update_layout(height=700, width=900, title_text="Different Model 10 Fold Cross Validation")
fig.update_yaxes(title_text="F1 Score")
fig.update_xaxes(title_text="Fold #")

fig.show()
'''

In [None]:
# Compare the per-fold F1 scores of the three models on a single chart.
# The x positions are derived from the number of scores rather than a
# hard-coded 10, so the plot stays valid if the number of CV folds changes.
x_axix = list(range(len(RF_SMOTE_Cross_validation_scores)))

# Create the figure at its final size up front instead of resizing the
# implicit pyplot figure after everything has been drawn.
fig, ax = plt.subplots(figsize=(18.5, 10.5))
ax.set_title('Compare F1 score of Cross-validation each model')
ax.plot(x_axix, RF_SMOTE_Cross_validation_scores, color='green', label='Random Forest F1')
ax.plot(x_axix, SVM_SMOTE_Cross_validation_scores, color='skyblue', label='SVM F1')
ax.plot(x_axix, GBoost_SMOTE_Cross_validation_scores, color='blue', label='Gradient Boosting F1')
ax.legend()

ax.set_xlabel('iteration times')
ax.set_ylabel('F1 score')
plt.show()

> We can conclude from the 10-fold cross-validation that the performance of random forest on this data set is better than SVM and Gradient Boosting. 


> But is this the case? Let's train and test the model on the upsample data set.





##Training Models on the Upsampled Dataset

###RandomForest Classifier

In [None]:
# Fit the random forest on the oversampled training data (x_SM, y_SM).
RF_SMOTE.fit(x_SM,y_SM)

In [None]:
# Predict class labels for the held-out test features, which were not part of the resampling.
RF_SMOTE_pre = RF_SMOTE.predict(x_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

# The ROC curve needs a continuous score per sample. Hard 0/1 predictions
# collapse the curve to a single operating point, so the predicted probability
# of the positive class (second column of predict_proba) is used instead.
# The resulting AUC therefore differs from one computed on hard labels.
RF_SMOTE_score = RF_SMOTE.predict_proba(x_test)[:, 1]
RF_SMOTE_fpr, RF_SMOTE_tpr, RF_SMOTE_thresholds = metrics.roc_curve(y_test, RF_SMOTE_score)
RF_SMOTE_AUC = metrics.auc(RF_SMOTE_fpr, RF_SMOTE_tpr)

# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall are reported in each other's column.
print('Random Forest Classifier : \n', classification_report(y_test, RF_SMOTE_pre))

fig = px.area(
    x=RF_SMOTE_fpr, y=RF_SMOTE_tpr,
    title='AUC of RandomForest on Upsample Data: %0.4f' % RF_SMOTE_AUC,
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=600
)
# Dashed diagonal = performance of a random classifier.
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

###Support Vector Machine

In [None]:
# Fit the SVM pipeline on the oversampled training data (x_SM, y_SM).
SVM_SMOTE.fit(x_SM,y_SM)

In [None]:
# Predict class labels for the held-out test features.
SVM_SMOTE_pre = SVM_SMOTE.predict(x_test)

In [None]:
# The ROC curve needs a continuous score per sample. The SVC is created
# without probability=True, so the signed distance to the separating
# hyperplane (decision_function) is used as the ranking score instead of the
# hard 0/1 predictions, which would collapse the curve to a single point.
SVM_SMOTE_score = SVM_SMOTE.decision_function(x_test)
SVM_SMOTE_fpr, SVM_SMOTE_tpr, SVM_SMOTE_thresholds = metrics.roc_curve(y_test, SVM_SMOTE_score)
SVM_SMOTE_AUC = metrics.auc(SVM_SMOTE_fpr, SVM_SMOTE_tpr)

# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall are reported in each other's column.
print('Support Vector Machine : \n', classification_report(y_test, SVM_SMOTE_pre))

fig = px.area(
    x=SVM_SMOTE_fpr, y=SVM_SMOTE_tpr,
    title='AUC of SVM on Upsample Data: %0.4f' % SVM_SMOTE_AUC,
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=600
)
# Dashed diagonal = performance of a random classifier.
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

###Gradient Boosting

In [None]:
# Fit the gradient-boosting pipeline on the oversampled training data (x_SM, y_SM).
GBoost_SMOTE.fit(x_SM,y_SM)

In [None]:
# Predict class labels for the held-out test features.
GBoost_SMOTE_pre = GBoost_SMOTE.predict(x_test)

In [None]:
# The ROC curve needs a continuous score per sample. Hard 0/1 predictions
# collapse the curve to a single operating point, so the predicted probability
# of the positive class (second column of predict_proba) is used instead.
GBoost_SMOTE_score = GBoost_SMOTE.predict_proba(x_test)[:, 1]
GBoost_SMOTE_fpr, GBoost_SMOTE_tpr, GBoost_SMOTE_thresholds = metrics.roc_curve(y_test, GBoost_SMOTE_score)
GBoost_SMOTE_AUC = metrics.auc(GBoost_SMOTE_fpr, GBoost_SMOTE_tpr)

# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall are reported in each other's column.
print('Gradient Boosting Classifier : \n', classification_report(y_test, GBoost_SMOTE_pre))

fig = px.area(
    x=GBoost_SMOTE_fpr, y=GBoost_SMOTE_tpr,
    title='AUC of GBoost on Upsample Data: %0.4f' % GBoost_SMOTE_AUC,
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=600
)
# Dashed diagonal = performance of a random classifier.
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

For models trained on the upsampled data set:

> From the AUC of the models:
*   Random Forest = 0.9261
*   Support Vector Machine = 0.8593
*   Gradient Boosting = 0.8480




> We can see how the AUC of each model changes after training on the upsampled dataset.

> Interestingly, except for Gradient boosting, the other two algorithms have improved.



##Tuning parameters



1.   RandomizedSearchCV
2.   GridSearchCV



###RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# plot_confusion_matrix was removed in scikit-learn 1.2. Guard the import so
# this cell (and the search grid it defines) still runs on newer releases; on
# older releases the name is imported exactly as before.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    pass

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# For classifiers 'auto' was only an alias of 'sqrt' (so the old grid held the
# same setting twice) and it is rejected by recent scikit-learn versions;
# 'log2' is offered as the genuinely different alternative.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Randomized hyper-parameter search over `random_grid`, left disabled. The
# next cell builds the forest from hard-coded parameters — presumably the
# result of an earlier run of this search; confirm before relying on them.
# NOTE(review): `rf_clf` is not defined in the visible part of the notebook;
# define it (e.g. an unfitted RandomForestClassifier) before re-enabling.
#rf_random = RandomizedSearchCV(estimator = rf_clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
#rf_random.fit(x_SM, y_SM)
#print(rf_random.best_params_)

In [None]:
# Random forest with the tuned hyper-parameters.
# - max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
#   rejected by recent scikit-learn releases.
# - random_state makes the fitted forest, and every metric derived from it,
#   reproducible between runs.
RF_opt = RandomForestClassifier(n_estimators=1800, min_samples_split=2, min_samples_leaf=1,
                                max_features='sqrt', max_depth=20, bootstrap=False,
                                random_state=42)
RF_opt.fit(x_SM, y_SM)
RF_opt_pre = RF_opt.predict(x_test)
print('Random Forest Classifier (Optimized)')

# plot_confusion_matrix was removed in scikit-learn 1.2. Building the display
# from the already computed predictions draws the same matrix and works on
# both old and new releases.
_rf_opt = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, RF_opt_pre, labels=RF_opt.classes_),
    display_labels=RF_opt.classes_,
).plot()

In [None]:
# The ROC curve needs a continuous score per sample. Hard 0/1 predictions
# collapse the curve to a single operating point, so the predicted probability
# of the positive class (second column of predict_proba) is used instead.
RF_opt_score = RF_opt.predict_proba(x_test)[:, 1]
RF_opt_fpr, RF_opt_tpr, RF_opt_thresholds = metrics.roc_curve(y_test, RF_opt_score)
RF_opt_AUC = metrics.auc(RF_opt_fpr, RF_opt_tpr)

# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall are reported in each other's column.
print('Random Forest Classifier : \n', classification_report(y_test, RF_opt_pre))

fig = px.area(
    x=RF_opt_fpr, y=RF_opt_tpr,
    title='AUC of Random Forest (Optimized): %0.4f' % RF_opt_AUC,
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=600
)
# Dashed diagonal = performance of a random classifier.
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

###GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate ensemble sizes to try: 20, 30, ..., 80.
param_test1 = {'n_estimators': range(20, 81, 10)}

# Gradient-boosting template; only n_estimators is varied by the search.
gb_search_base = GradientBoostingClassifier(
    learning_rate=1.0,
    min_samples_split=500,
    min_samples_leaf=50,
    max_depth=8,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
)

# 10-fold grid search scored by ROC AUC, run on the oversampled training data.
gsearch1 = GridSearchCV(
    estimator=gb_search_base,
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=4,
    cv=10,
)
gsearch1.fit(x_SM, y_SM)
print(gsearch1.best_params_)

In [None]:
# Hyper-parameters of the tuned gradient-boosting model, gathered in one place.
gb_opt_params = dict(
    n_estimators=80,
    learning_rate=1.0,
    min_samples_split=500,
    min_samples_leaf=50,
    max_depth=8,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
)

# Train on the oversampled data, then predict labels for the held-out test set.
GB_opt = GradientBoostingClassifier(**gb_opt_params)
GB_opt.fit(x_SM, y_SM)
GB_opt_pre = GB_opt.predict(x_test)

print('Gradient Boosting (Optimized)')
print(classification_report(y_test, GB_opt_pre))

In [None]:
# The ROC curve needs a continuous score per sample. Hard 0/1 predictions
# collapse the curve to a single operating point, so the predicted probability
# of the positive class (second column of predict_proba) is used instead.
GB_opt_score = GB_opt.predict_proba(x_test)[:, 1]
GB_opt_fpr, GB_opt_tpr, GB_opt_thresholds = metrics.roc_curve(y_test, GB_opt_score)
# Backward-compatible alias for the previously misspelled variable name.
GB_opte_fpr = GB_opt_fpr
GB_opt_AUC = metrics.auc(GB_opt_fpr, GB_opt_tpr)

# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall are reported in each other's column.
print('Gradient Boosting Classifier : \n', classification_report(y_test, GB_opt_pre))

fig = px.area(
    x=GB_opt_fpr, y=GB_opt_tpr,
    title='AUC of GBoost (Optimized): %0.4f' % GB_opt_AUC,
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=600
)
# Dashed diagonal = performance of a random classifier.
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

##Feature Selection
> According to the form of feature selection, feature selection methods can be divided into three types:

> Filter: 
*  Variance  
*  Correlation Coefficient
*  chi-square test.

> Wrapper: 
* Recursive feature elimination.

> Embedded:
*   Based on machine learning algorithms and models.

> We are going to select features using the Random Forest feature importances (an embedded method).


In [None]:
# Seeded random forest fitted on the oversampled training data; it is used
# only for its per-feature importance scores, which the cell displays.
RF_Features = RandomForestClassifier(random_state=1234).fit(x_SM, y_SM)
RF_Features.feature_importances_

In [None]:
importances = RF_Features.feature_importances_

# Show at most 20 features. Capping at the number of available features keeps
# `features_to_plot` equal to the number of selected values, so a plot that
# builds one tick per feature cannot end up with mismatched lengths.
features_to_plot = min(20, len(importances))

# argsort is ascending, so the most important features are the last entries.
indices = np.argsort(importances)
best_vars = np.array(feature_names)[indices][-features_to_plot:]
values = importances[indices][-features_to_plot:]
best_vars

In [None]:
# Horizontal bar chart of the selected feature importances.
# One tick per plotted value, derived from the data rather than from a
# constant, so bars, ticks and labels always have the same length.
y_ticks = np.arange(len(values))

# Create the figure at its final size so tight_layout computes the margins
# for the size that is actually rendered.
fig, ax = plt.subplots(figsize=(18.5, 10.5))
ax.barh(y_ticks, values)
# Fix the tick positions first, then attach the labels to them; setting labels
# before the ticks ties them to the automatic tick locations.
ax.set_yticks(y_ticks)
ax.set_yticklabels(best_vars)
ax.set_title("Random Forest Feature Importances")
fig.tight_layout()
plt.show()



> We can clearly see that the variables **Total_Trans_Ct**, **Total_Trans_Amt**, **Total_Revolving_Bal**, **Contacts_Count_12_mon**, and **Total_Ct_Chng_Q4_Q1** have high importance in the random forest model.

> But in order to avoid overfitting, we can't just use variables with high importance in the model training process.



#Conclusion

For models trained on unbalanced data sets, we can only consider AUC as a performance indicator.

> From the AUC of the models:
*   Random Forest = 0.9334
*   Support Vector Machine = 0.8547
*   Gradient Boosting = 0.8710

For models trained on the upsampled data set:

> From the AUC of the models:
*   Random Forest = 0.9261
*   Support Vector Machine = 0.8593
*   Gradient Boosting = 0.8480



> We can conclude from the 10-fold cross-validation that the performance of random forest on upsample data set is better than SVM and Gradient Boosting.


>  After tuning the parameters (RandomizedSearchCV and GridSearchCV):
*   AUC of Random Forest : 0.9308
*   AUC of Gradient Boosting : 0.8343


After several experiments, we can learn that random forest may be the best performing model for this data set.

The actual performance of the model is unknown in the real world, because we use SMOTE to adjust the data set.


