In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Apply seaborn's default theme to the figures drawn below.
sns.set()

# Lift pandas' display limits so frames are rendered without truncating
# rows, columns, overall width, or the text inside a cell.
for option_name in ('display.max_rows', 'display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(option_name, None)

# 1) Import data

In [None]:
# Read the raw BankChurners table and preview its first three rows.
data = pd.read_csv(filepath_or_buffer='/kaggle/input/credit-card-customers/BankChurners.csv')
data.head(n=3)

In [None]:
# Per the dataset provider's note, the last two columns are discarded;
# the CLIENTNUM identifier is dropped as well because it only identifies the row.
data = data.iloc[:, :-2].drop(columns='CLIENTNUM')
data.head(3)

In [None]:
# Report (rows, columns) of the trimmed frame.
input_shape = data.shape
print('Input shape =', input_shape)

In [None]:
# Print a per-column summary of the frame (column names, non-null counts, dtypes).
data.info()

Declare categorical, numerical columns.

In [None]:
# Split the frame by dtype: string (object) columns are treated as categorical,
# every other column as numerical.
cat_cols = data.select_dtypes(include=['object'])
num_cols = data.select_dtypes(exclude=['object'])

In [None]:
# First three rows of the categorical columns
cat_cols.iloc[:3]

In [None]:
# First three rows of the numerical columns
num_cols.iloc[:3]

### Assign a color to each target class <br>
- loss : the color for customers who moved to another company (churned).
- still : the color for customers who did not churn.

In [None]:
# Take both target colors from seaborn's "Paired" palette:
# `loss` marks churned customers, `still` marks customers who stayed.
paired_palette = sns.color_palette("Paired")
loss, still = paired_palette[5], paired_palette[1]

# 2) Categorical columns

### Distribution of each categorical column.

In [None]:
# One count plot per categorical column, side by side on a single figure.
# The number of panels follows the number of categorical columns instead of a fixed 6.
fig, axes = plt.subplots(1, len(cat_cols.columns), figsize=(23, 6), squeeze=False)

for panel, col in zip(axes.ravel(), cat_cols.columns):
    # The target column gets its own color so it stands out from the features.
    color = 'darkorange' if col == 'Attrition_Flag' else 'mediumpurple'
    sns.countplot(data=data, x=col, ax=panel, color=color)
    # Rotate the labels seaborn already attached to the bars. Re-writing them by hand
    # (set_xticklabels with data[col].unique()) ties correctness to seaborn's internal
    # category order and triggers matplotlib's "fixed ticks" warning.
    panel.tick_params(axis='x', labelrotation=90)
    panel.set_ylabel('')
    # Move the column name above the panel so it works as a title.
    panel.xaxis.set_label_coords(0.5, 1.05)

plt.show()

The distribution of 'Education_Level' looks the same within every 'Marital_Status' group, and vice versa.

In [None]:
# Two views of the same cross-tabulation:
# left  - education level on the x axis, bars split by marital status;
# right - marital status on the x axis, bars split by education level (listed from lowest to highest).
education_order = ['Uneducated', 'High School', 'College', 'Graduate', 'Post-Graduate', 'Doctorate']

fig, ax = plt.subplots(1, 2, figsize=(12, 4))
sns.countplot(data=cat_cols, x='Education_Level', hue='Marital_Status', ax=ax[0])
sns.countplot(data=cat_cols, x='Marital_Status', hue='Education_Level', hue_order=education_order, ax=ax[1])

# Shared cosmetics: slanted tick labels and the axis name placed above each panel.
for panel in ax:
    panel.tick_params(axis='x', labelrotation=40.0)
    panel.xaxis.set_label_coords(0.5, 1.1)

plt.show()

We see the same distribution of 'Income_Category' within every 'Education_Level', and vice versa. <br>
Most people earn less than \$40K at every 'Education_Level', even 'Doctorate'. <br>
In every income range, 'Graduate' is the largest group.

In [None]:
# Income brackets from lowest to highest. 'Unknown' is intentionally not listed,
# so rows with an unknown income are left out of both panels.
income_order = ['Less than $40K', '$40K - $60K', '$60K - $80K', '$80K - $120K', '$120K +']

fig, ax = plt.subplots(1, 2)
fig.set_size_inches(12, 4)

# Left: income mix inside each education level.
sns.countplot(ax=ax[0], data=cat_cols, x='Education_Level', hue='Income_Category', color='indigo', hue_order=income_order)
ax[0].tick_params(axis='x', labelrotation=30)
ax[0].xaxis.set_label_coords(0.5, 1.1)

# Right: education mix inside each income bracket.
# `order=` makes seaborn draw the bars in bracket order, so every tick label belongs to
# its own bar. Pasting a hand-written label list over seaborn's default ordering
# mislabels the bars, and the `Rotation=` keyword is rejected by current matplotlib.
sns.countplot(ax=ax[1], data=cat_cols, hue='Education_Level', x='Income_Category', order=income_order)
ax[1].tick_params(axis='x', labelrotation=30)
ax[1].xaxis.set_label_coords(0.5, 1.1)

# Put the right-hand legend outside the plotting area.
ax[1].legend(bbox_to_anchor=(1, 1), loc='upper left')

plt.show()

In [None]:
# Income brackets from lowest to highest. 'Unknown' is intentionally not listed,
# so rows with an unknown income are left out of both panels.
income_order = ['Less than $40K', '$40K - $60K', '$60K - $80K', '$80K - $120K', '$120K +']

fig, ax = plt.subplots(1, 2)
fig.set_size_inches(12, 4)

# Left: income mix inside each marital status.
sns.countplot(data=cat_cols, x='Marital_Status', hue='Income_Category', ax=ax[0], color='indigo', hue_order=income_order)
ax[0].tick_params(axis='x', labelrotation=40.0)
ax[0].xaxis.set_label_coords(0.5, 1.1)

# Right: marital-status mix inside each income bracket.
# `order=` fixes which bar sits at which tick, so the labels always match the bars.
# Overwriting the labels by hand mislabels them whenever seaborn's default order differs,
# and the `Rotation=` keyword is rejected by current matplotlib.
sns.countplot(data=cat_cols, x='Income_Category', hue='Marital_Status', ax=ax[1], order=income_order)
ax[1].tick_params(axis='x', labelrotation=40)
ax[1].xaxis.set_label_coords(0.5, 1.1)

plt.show()

### Missing values

In [None]:
# Count the null entries in each categorical column.
cat_cols.isna().sum()

We don't see any missing values, which is kind of weird. <br>

Let's try looking into each value in each category.

In [None]:
# Print the frequency of every distinct value, one categorical column at a time.
for col, values in cat_cols.items():
    header = '---------'+col+'---------'
    print(header)
    print(values.value_counts())

We see that missing values are denoted with 'Unknown'. So, we have to replace 'Unknown' with NaN.

In [None]:
# Share of 'Unknown' entries per categorical column, shown as a percentage string.
# 'Unknown' is converted to NaN first so the regular null count can be used.
unknown_as_nan = cat_cols.replace('Unknown', np.nan)
missing_pct = unknown_as_nan.isnull().sum()/len(cat_cols)*100
pd.Series(missing_pct, name='Missing value').apply(lambda pct: str(round(pct, 4))+' %')

## 2.1) Ordinal encoding, manually

We'll apply ordinal encoding to the 'Education_Level', 'Income_Category', and 'Card_Category' features. The 'Unknown' category is replaced with the mean of the ordinal codes of the known categories.

In [None]:
# Ordinal codes for the ordered categorical features (lowest level = 0).
# 'Unknown' is encoded as the mean of the known codes. It is derived from the mapping
# itself rather than typed by hand: the previous hand-written 10/4 for income was 2.5,
# while the mean of the codes 0..4 is 10/5 = 2.0.
Edu_level = {'Uneducated':0, 'High School':1, 'College':2, 'Graduate':3, 'Post-Graduate':4, 'Doctorate':5}
Edu_level['Unknown'] = sum(Edu_level.values()) / len(Edu_level)      # 15/6 = 2.5

Income_cat = {'Less than $40K':0, '$40K - $60K':1, '$60K - $80K':2, '$80K - $120K':3,  '$120K +':4}
Income_cat['Unknown'] = sum(Income_cat.values()) / len(Income_cat)   # 10/5 = 2.0

# This mapping defines no 'Unknown' entry.
Card_cat = {'Blue':0, 'Silver':1, 'Gold':2, 'Platinum':3}

# Work on a copy so the original frame keeps its string categories.
data_ordinal_encoded = data.copy()

# Replace each ordered categorical column with its numeric codes.
ordinal_maps = (
    ('Education_Level', Edu_level),
    ('Income_Category', Income_cat),
    ('Card_Category', Card_cat),
)
for column, codes in ordinal_maps:
    data_ordinal_encoded[column] = data_ordinal_encoded[column].map(codes)

data_ordinal_encoded.head(3)

Other categorical columns will be one-hot encoded in the later section. This means we consider 'Unknown' value in 'Marital_Status' as a new class.

# 3) Continuous columns

### Pairwise scatterplot of continuous variables
The picture is in ..\scatterplot\\.png

### Missing value

In [None]:
# Count the null entries in each numerical column.
num_cols.isna().sum()

### Correlation

In [None]:
# Pairwise correlations of the numerical columns, annotated to one decimal place.
correlations = num_cols.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data=correlations, annot=True, fmt='.1f', ax=ax)
plt.show()

We see that **Months_on_book** and **Customer_Age**  , **Total_Trans_Amt** and **Total_Trans_Ct** are highly linearly correlated.

### Variance.

In [None]:
# Variance of every numeric column, rounded to two decimals.
# numeric_only=True is required because `data` still contains string columns:
# pandas 2.x raises on them instead of silently skipping them.
data_var = data.var(numeric_only=True).to_frame('Var').round(2)

# Shade the cells so the largest variances stand out.
data_var.style.background_gradient(cmap=sns.light_palette('green', as_cmap=True))

We will deal with continuous features using RFE (Recursive Feature Elimination) in the next section. <br>
Recursive feature elimination (RFE) is a **feature selection** method that works by
1. fitting a model
2. removing the weakest feature(s) by feature importance.<br>Features' importances are ranked by the model’s **coef_** or **feature\_importances\_** attributes, and by recursively eliminating a small number of features per loop, RFE attempts to eliminate dependencies and collinearity that may exist in the model.
3. repeating until the specified number of features is reached


# 4) Feature selection, One-hot encoding, and Scaling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# plot_roc_curve was removed in scikit-learn 1.2. On newer releases fall back to its
# replacement, RocCurveDisplay.from_estimator, which takes the same (estimator, X, y)
# arguments, so cells calling plot_roc_curve work on both old and new versions.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

In [None]:
# Features are every column except the churn label.
X = data_ordinal_encoded.drop('Attrition_Flag',axis=1)
Y = data_ordinal_encoded['Attrition_Flag']

# Stratified 80/20 split; the fixed seed makes the split (and everything downstream) repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

# Choose columns by dtype instead of hard-coded positions: columns that still hold strings
# are one-hot encoded, all remaining columns are standardized.
# NOTE(review): assumes the string columns are exactly the ones previously addressed as
# positions 1 and 4 - confirm with X.dtypes.
nominal_cols = X.select_dtypes(include='object').columns.tolist()
numeric_cols = [c for c in X.columns if c not in nominal_cols]

# sparse_threshold=0 makes the transformer always return a dense array. This replaces
# OneHotEncoder(sparse=False), a keyword newer scikit-learn releases no longer accept.
ct = ColumnTransformer(
    [("one hot encode", OneHotEncoder(), nominal_cols),
     ("scale", StandardScaler(), numeric_cols)],
    remainder='passthrough',
    sparse_threshold=0)

# Fit on the training split only, then apply the fitted transform to the test split.
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

# Encode the string labels as integers (classes are numbered in sorted order).
le=LabelEncoder()
Y_train = le.fit_transform(Y_train)
Y_test = le.transform(Y_test)

### Note: <br>
Luckily we don't have a problem here.<br> 
For a small dataset, I recommend **fitting the OneHotEncoder object to the entire dataset** instead of X_train because, after splitting the data, X_train might not contain every unique value of every category. If so, transforming X_test will fail, since the OneHotEncoder object has never seen some values that occur in X_test (but did not occur in the X_train it was fitted on).

In [None]:
# Recursive feature elimination down to 11 features, ranked by random-forest importances.
# The forest is seeded so the selected feature set is the same on every run.
rfe = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=11, verbose=1)
rfe.fit(X_train, Y_train)

## Feature Selected

In [None]:
# Keep only the columns RFE marked as selected, in both splits.
selected_mask = rfe.support_
X_train, X_test = X_train[:, selected_mask], X_test[:, selected_mask]

# 5) Building models

We'll use Random Forest Classifier with its default parameter.

In [None]:
# Random forest with default hyper-parameters; seeded so the reported scores are reproducible.
rf = RandomForestClassifier(random_state=42).fit(X_train, Y_train)

# Predict the test set

In [None]:
# Hard class predictions for the confusion matrix and the classification report.
y_pred = rf.predict(X_test)
# Predicted probability of the class labelled 1; ROC AUC needs a ranking score,
# computing it from hard 0/1 predictions understates the model.
y_score = rf.predict_proba(X_test)[:, 1]

# scikit-learn metrics expect (y_true, y_pred). With the arguments swapped the confusion
# matrix is transposed and precision/recall trade places in the report.
print('Confusion matrix\n',confusion_matrix(Y_test, y_pred))
print('\nroc_auc_score\n',roc_auc_score(Y_test, y_score))
print('\nClassification report\n',classification_report(Y_test, y_pred))

In [None]:
# Draw the ROC curve of the fitted forest on the test split.
roc_display = plot_roc_curve(estimator=rf, X=X_test, y=Y_test)
plt.show()

We see that Random Forest has done a very good job!!