In [None]:
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import tensorflow as tf
from scipy import special #comb, factorial
from keras import backend as K
from scipy.stats import uniform
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import MinMaxScaler, StandardScaler,LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, recall_score, make_scorer, plot_confusion_matrix, confusion_matrix, accuracy_score,f1_score



# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Switch seaborn to the 'whitegrid' style (later cells change the style again).
sns.set_style('whitegrid')

In [None]:
# Location of the dataset inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/credit-card-customers/BankChurners.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

The column `CLIENTNUM` is not of much use, so we remove it. Furthermore, quoting the person who uploaded the dataset:
> "PLEASE IGNORE THE LAST 2 COLUMNS (NAIVE BAYES CLAS…). I SUGGEST TO RATHER DELETE IT BEFORE DOING ANYTHING"

We will do exactly that.

In [None]:
# Columns that carry no usable signal: the client id and the two
# pre-computed Naive Bayes columns shipped with the dataset.
columns_to_drop = [
    'CLIENTNUM',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
df = df.drop(columns=columns_to_drop)

Get basic info about the dataset.

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

Do we have any nulls?

In [None]:
# Number of missing values per column.
df.isna().sum()

Luckily, no nulls.

In [None]:
# Preview the frame again now that the unused columns have been dropped.
df.head()

In our case, the target variable is `Attrition_Flag`. Let's look at its distribution.

In [None]:
# Absolute number of rows for each value of the target.
df['Attrition_Flag'].value_counts()

Normalizing distribution gives:

In [None]:
# Same counts expressed as fractions of the total (normalize=True).
df['Attrition_Flag'].value_counts(normalize=True)

Unfortunately, target variable is highly unbalanced.

Now let's look at some features separately.

In [None]:
# Histogram of `Customer_Age` with 20 bins.
dataframe = df
feature = 'Customer_Age'
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(10, 7))
dataframe[feature].hist(bins=20, ax=ax)
ax.set_title(f"Distribution of the feature `{feature}`", fontsize=25)
plt.show()

# How does age affect attrition?

In [None]:
# Box plot of `Customer_Age` for each value of `Attrition_Flag`.
dataframe = df
feature_1, feature_2 = 'Attrition_Flag', 'Customer_Age'
fig, ax = plt.subplots(figsize=(7, 7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
plt.show()

As box plot suggests, the two groups (those with closed and open accounts) don't have any substantial differences in the age distribution.

Now let's have a look at the gender distribution

In [None]:
# Number of customers per gender.
df['Gender'].value_counts()

In [None]:
# Share of customers per gender (normalize=True).
df['Gender'].value_counts(normalize=True)

# Does gender affect attrition rate?

In [None]:
# Count plot of `Gender`, with bars split by `Attrition_Flag`.
dataframe = df
feature_1, feature_2 = 'Gender', 'Attrition_Flag'
fig, ax = plt.subplots(figsize=(7, 7))
sns.countplot(data=dataframe, x=feature_1, hue=feature_2, ax=ax)
plt.show()

In [None]:
# Stacked bar chart: for every value of the feature, the share of each label.
dataframe = df
feature_1 = 'Gender'          # FEATURE
feature_2 = 'Attrition_Flag'  # LABEL
to_sort = True  # `True` would be useful if label is binary

# Row-normalised contingency table: each row sums to 1.
cs = pd.crosstab(index=dataframe[feature_1],
                 columns=dataframe[feature_2],
                 normalize='index')
if to_sort:
    # Order the bars by the share of the first label column.
    cs = cs.sort_values(by=cs.columns[0])
ax = cs.plot.bar(stacked=True, figsize=(10, 6))
ax.set_title(f'Conditional distributions of `{feature_2}`')
plt.show()

And the answer is: **no**, gender doesn't seem to be a good predictor of the attrition rate, as both distributions are roughly the same (although it should be noted that the attrition rate is slightly higher in the female category).

In [None]:
# Count plot of `Dependent_count`, bars ordered by the numeric value.
dataframe = df
feature1 = 'Dependent_count'
# Pass the vector as `x=`: handing it over positionally is deprecated in
# seaborn 0.11 and no longer means "x" in later releases.
sns.countplot(x=dataframe[feature1],
              order=dataframe[feature1].value_counts().sort_index().index)
plt.title(f"Distribution of {feature1}")
plt.show()

# How does dependent count affect attrition rate?

In [None]:
# Proportional (stacked) bar plot: share of each label per dependent count.
dataframe = df
feature_1 = 'Dependent_count'  # FEATURE
feature_2 = 'Attrition_Flag'   # LABEL
to_sort = False  # keep the natural order of the dependent counts

# Row-normalised contingency table: each row sums to 1.
cs = pd.crosstab(index=dataframe[feature_1],
                 columns=dataframe[feature_2],
                 normalize='index')
if to_sort:
    cs = cs.sort_values(by=cs.columns[0])
ax = cs.plot.bar(stacked=True, figsize=(10, 6))
ax.set_title(f'Conditional distributions of `{feature_2}`')
plt.show()

Similarly, dependent count is not really predictive of an attrition rate.

Now let's have a look at the education level.

In [None]:
# Number of customers per education level.
df['Education_Level'].value_counts()

# Does education level affect attrition rate?

In [None]:
# Count plot of `Education_Level`, with bars split by `Attrition_Flag`.
dataframe = df
feature_1, feature_2 = 'Education_Level', 'Attrition_Flag'
fig, ax = plt.subplots(figsize=(7, 7))
sns.countplot(data=dataframe, x=feature_1, hue=feature_2, ax=ax)
plt.show()

In [None]:
# Stacked bar chart: for every education level, the share of each label.
dataframe = df
feature_1 = 'Education_Level'  # FEATURE
feature_2 = 'Attrition_Flag'   # LABEL
to_sort = True  # `True` would be useful if label is binary

# Row-normalised contingency table: each row sums to 1.
cs = pd.crosstab(index=dataframe[feature_1],
                 columns=dataframe[feature_2],
                 normalize='index')
if to_sort:
    # Order the bars by the share of the first label column.
    cs = cs.sort_values(by=cs.columns[0])
ax = cs.plot.bar(stacked=True, figsize=(10, 6))
ax.set_title(f'Conditional distributions of `{feature_2}`')
plt.show()

As the graphs show, education level (alone) doesn't allow you to predict whether a customer will close an account or not.

Now let's have a look at the marital status.

In [None]:
# Number of customers per marital status.
df['Marital_Status'].value_counts()

# Does marital status affect attrition rate?

In [None]:
# Count plot of `Marital_Status`, with bars split by `Attrition_Flag`.
dataframe = df
feature_1, feature_2 = 'Marital_Status', 'Attrition_Flag'
fig, ax = plt.subplots(figsize=(7, 7))
sns.countplot(data=dataframe, x=feature_1, hue=feature_2, ax=ax)
plt.show()

In [None]:
# Stacked bar chart: for every marital status, the share of each label.
dataframe = df
feature_1 = 'Marital_Status'  # FEATURE
feature_2 = 'Attrition_Flag'  # LABEL
to_sort = True  # `True` would be useful if label is binary

# Row-normalised contingency table: each row sums to 1.
cs = pd.crosstab(index=dataframe[feature_1],
                 columns=dataframe[feature_2],
                 normalize='index')
if to_sort:
    # Order the bars by the share of the first label column.
    cs = cs.sort_values(by=cs.columns[0])
ax = cs.plot.bar(stacked=True, figsize=(10, 6))
ax.set_title(f'Conditional distributions of `{feature_2}`')
plt.show()

No, the marital status doesn't seem to be affecting attrition rate in any substantial way.

Let's have a look at the income category.

In [None]:
# Number of customers per income category.
df['Income_Category'].value_counts()

# Does income level affect attrition rate?

In [None]:
# Count plot of `Income_Category`, with bars split by `Attrition_Flag`.
dataframe = df
feature_1, feature_2 = 'Income_Category', 'Attrition_Flag'
fig, ax = plt.subplots(figsize=(7, 7))
sns.countplot(data=dataframe, x=feature_1, hue=feature_2, ax=ax)
plt.show()

In [None]:
# Stacked bar chart: for every income category, the share of each label.
dataframe = df
feature_1 = 'Income_Category'  # FEATURE
feature_2 = 'Attrition_Flag'   # LABEL
to_sort = True  # `True` would be useful if label is binary

# Row-normalised contingency table: each row sums to 1.
cs = pd.crosstab(index=dataframe[feature_1],
                 columns=dataframe[feature_2],
                 normalize='index')
if to_sort:
    # Order the bars by the share of the first label column.
    cs = cs.sort_values(by=cs.columns[0])
ax = cs.plot.bar(stacked=True, figsize=(10, 6))
ax.set_title(f'Conditional distributions of `{feature_2}`')
plt.show()

And yet one more time, we see that the income doesn't have a good predictive power.

# Conclusions so far:

We have looked at a handful of features so far, namely: age, gender, dependent count, marital status, education level and income level. As the plots show, each of these features, taken on its own, appears to be largely unrelated to the target variable (i.e., individually they are poor predictors of the target variable).

Continuing by looking at the remaining features one by one will be a tedious task, so to spare us time, we will now use Random Forest to help us find out the features with the best predicting power.

# Which features give us the best predicting power? (determined by Random Forest)

In [None]:
# Separate the feature matrix from the target column.
X = df.drop(columns=['Attrition_Flag']).copy()
y = df['Attrition_Flag'].copy()

# Text-valued columns that have to be turned into numbers before fitting.
non_numeric_features = [
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
]

# Replace every text column by integer codes, using a fresh encoder per column.
for feature in non_numeric_features:
    lb = LabelEncoder()
    lb.fit(X[feature])
    X[feature] = lb.transform(X[feature])

In [None]:
sns.set_style('darkgrid')

# Fit a random forest on all features to rank them by importance.
# The forest is seeded so that the ranking (and therefore the "top 6"
# discussed below) is the same on every run of the notebook.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=13)
forest_clf.fit(X, y)

importances = forest_clf.feature_importances_
# Column positions of X, ordered from most to least important.
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(7,7))
plt.bar(range(len(indices)),importances[indices])
# Tick labels are the column positions in X, not the column names.
plt.xticks(range(len(indices)), indices)
plt.title("Feature importance (Random Forest)")
plt.xlabel('Index of a feature')
plt.ylabel('Feature importance')
plt.show()

Let's take a look at the top 6 features. They are the following:

In [None]:
# Names of the six columns of X with the highest importance.
X.columns[indices[:6]].tolist()

Let's have a look at each of the top 6 features separately. 

We begin with `Total_Trans_Amt`, which stands for total amount of transactions over last 12 months.

In [None]:
# Histogram of `Total_Trans_Amt` (default number of bins).
dataframe = df
feature = 'Total_Trans_Amt'
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(10, 7))
dataframe[feature].hist(ax=ax)
ax.set_title(f"Distribution of the feature `{feature}`", fontsize=25)
plt.show()

# Does the total transaction amount over the last 12 months differ between attrited customers and existing ones?

In [None]:
# Box plot of `Total_Trans_Amt` for each value of `Attrition_Flag`.
dataframe = df
feature_1, feature_2 = 'Attrition_Flag', 'Total_Trans_Amt'
fig, ax = plt.subplots(figsize=(7, 7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
plt.show()

As we see, attrited customers tend to have a much lower total transaction amount.

The next best feature is `Total_Trans_Ct`, which is the total **count** of transactions over the last 12 months. Even without looking at any graphs, we would expect that the attrited customers tend to have smaller count of the transactions than the existing customers. Let's verify it:

# Is it true that existing customers make more transactions than attrited customers (over the last 12 months)? 

In [None]:
# Box plot of `Total_Trans_Ct` for each value of `Attrition_Flag`.
dataframe = df
feature_1, feature_2 = 'Attrition_Flag', 'Total_Trans_Ct'
fig, ax = plt.subplots(figsize=(7, 7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
plt.show()

And indeed, attrited customers tend to have way smaller transaction frequency than the existing customers over the last 12 months.

Next up is `Total_Ct_Chng_Q4_Q1`, which is the change in the count of transactions between Q4 and Q1. It wasn't specified how the values in `Total_Ct_Chng_Q4_Q1` were calculated, but it is likely that the formula used was:

$$\frac{\text{Count of transactions in Q4}}{\text{Count of transactions in Q1}}$$

# How does the change in transaction frequency differ between attrited customers and existing ones?

In [None]:
# Box plot of `Total_Ct_Chng_Q4_Q1` for each value of `Attrition_Flag`.
dataframe = df
feature_1, feature_2 = 'Attrition_Flag', 'Total_Ct_Chng_Q4_Q1'
fig, ax = plt.subplots(figsize=(7, 7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
plt.show()

Assuming that the formula used to calculate the change was $\frac{\text{Count of transactions in Q4}}{\text{Count of transactions in Q1}}$, we can see that most existing customers and attrited customers made fewer transactions in $Q4$ than in $Q1$. However, attrited customers saw a bigger slump in transaction frequency (signified by the lower median and by the fact that, among attrited customers, those with $0$ transactions in $Q4$ are not considered outliers).

The next feature is `Total_Revolving_Bal`, which is the revolving balance on the credit card (i.e., the amount of money (excluding interest) that you borrowed last month but didn't repay).

Let's look at the distribution:

In [None]:
# Summary statistics of the revolving balance.
df['Total_Revolving_Bal'].describe()

In [None]:
# Histogram of `Total_Revolving_Bal` (default number of bins).
dataframe = df
feature = 'Total_Revolving_Bal'
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(10, 7))
dataframe[feature].hist(ax=ax)
ax.set_title(f"Distribution of the feature `{feature}`", fontsize=25)
plt.show()

In [None]:
# Box plot of `Total_Revolving_Bal` for each value of `Attrition_Flag`.
dataframe = df
feature_1, feature_2 = 'Attrition_Flag', 'Total_Revolving_Bal'
fig, ax = plt.subplots(figsize=(7, 7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
plt.show()

In [None]:
# Overlaid distributions of the continuous feature, one per label value.
dataframe = df
cat_feat = 'Attrition_Flag'
cont_feat = 'Total_Revolving_Bal'
fig, ax = plt.subplots(figsize=(7, 7))
for value in dataframe[cat_feat].unique():
    # Rows that belong to the current label value.
    subset = dataframe.loc[dataframe[cat_feat] == value, cont_feat]
    # NOTE(review): `distplot` is deprecated in recent seaborn releases;
    # kept here so the figure stays exactly as before.
    sns.distplot(subset, label=value, ax=ax)
ax.legend()
ax.set_title(f"Distribution of `{cont_feat}` conditional on `{cat_feat}`")
plt.show()

As we see, attrited customers tend to have a lower revolving balance than existing customers.

The next good predictor is `Avg_Utilization_Ratio`, which stands for the utilization ratio of the user's credit card. Utilization ratio is calculated as:

$$\frac{\text{amount borrowed}}{\text{total available credit }}$$

In [None]:
# Box plot of `Avg_Utilization_Ratio` for each value of `Attrition_Flag`.
dataframe = df
feature_1, feature_2 = 'Attrition_Flag', 'Avg_Utilization_Ratio'
fig, ax = plt.subplots(figsize=(7, 7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
plt.show()

Unsurprisingly (partly because the utilization ratio and the revolving balance both reflect the borrowing habits of a customer), attrited customers tend to borrow less money than existing customers.

Lastly, we will take a look at `Total_Relationship_Count`, which according to the dataset description, stands for "Total no. of products held by the customer". Let's see the conditional distribution:

In [None]:
# Box plot of `Total_Relationship_Count` for each value of `Attrition_Flag`.
dataframe = df
feature_1, feature_2 = 'Attrition_Flag', 'Total_Relationship_Count'
fig, ax = plt.subplots(figsize=(7, 7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
plt.show()

As we see, attrited customers tend to hold fewer of the bank's services/products than existing customers.

# Conclusions:

Having taken a look at the relation between the top 6 features and the target, we can reach the following conclusions:

1. Attrited customers tend to make fewer transactions (signified by the lower transaction frequency and lower total transaction amount over the last 12 months).

2. Attrited customers tend to borrow less money from the bank where they are about to close their account (signified by the lower utilization ratio and revolving balance).

3. Attrited customers tend to use fewer of the bank's services than the existing customers.

# Now let's try to classify.

We will be using top 6 features determined by the Random Forest.

In [None]:
# Features used by every classifier below.
# NOTE(review): the walkthrough above examined `Avg_Utilization_Ratio`, while
# this list contains `Total_Amt_Chng_Q4_Q1` instead — confirm which six
# features were intended.
features = [
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Total_Revolving_Bal',
    'Total_Relationship_Count',
    'Total_Amt_Chng_Q4_Q1',
]

# Feature matrix (selected columns only) and the raw string labels.
X = df.loc[:, features]
y = df.loc[:, 'Attrition_Flag']

Let's have a look at the features' distribution, to determine which transformation/scaling technique to use.

In [None]:
# One histogram per selected feature, laid out on a 2x3 grid.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(10, 10))
ax = ax.flatten()  # 2-D grid of axes -> flat sequence, one entry per feature

for position, column in enumerate(X.columns):
    panel = ax[position]
    panel.hist(X[column])
    panel.set_title(f'Distribution of {column}')
    


All features besides `Total_Trans_Amt` are (roughly) normally distributed. To normalize `Total_Trans_Amt`, we will use logarithmic transform.

In [None]:
# Log-transform the transaction amount: log(1 + x).
# The value is always derived from the untouched column in `df`, so running
# this cell more than once does not apply the logarithm repeatedly.
X['Total_Trans_Amt'] = np.log1p(df['Total_Trans_Amt'])

Now the distribution looks like this:

In [None]:
# Histogram of the transformed `Total_Trans_Amt` column.
fig, ax = plt.subplots()
ax.hist(X['Total_Trans_Amt'])
ax.set_title('Distribution of `Total_Trans_Amt`')
plt.show()

Now we will split and standardize.

In [None]:
# Hold out a test set. The split is stratified on the label because the
# target is heavily unbalanced: both parts keep the same class shares.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    random_state=11)

# Fit the scaler on the training part only, then reuse its statistics on the
# test part so no information from the test set leaks into training.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


In [None]:
# Scorer used by the grid searches below: F1 with `Attrited Customer` as the
# positive class, i.e. the minority class we actually want to detect (this is
# also what the KNN section states is being optimised).
scoring = make_scorer(f1_score, pos_label='Attrited Customer')

# Naive Bayes

In [None]:
# Gaussian Naive Bayes: fit, report per-class metrics, show the confusion matrix.
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)
nb_pred = nb_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=nb_pred))
plot_confusion_matrix(nb_clf, X_test, y_test)

# Logistic regression

In [None]:
# Logistic regression: fit, report per-class metrics, show the confusion matrix.
log_random_state = None  # no fixed seed is passed to the solver
log_clf = LogisticRegression(random_state=log_random_state)
log_clf.fit(X_train, y_train)
log_pred = log_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=log_pred))
plot_confusion_matrix(log_clf, X_test, y_test)

# KNN

We will use Grid Search (where metric we are trying to optimize is f1 score with `Attrited Customer` being a positive label)

In [None]:
# Grid search over the number of neighbours, scored with `scoring`.
MIN = 1   # Min number of neighbors
MAX = 30  # Max number of neighbors (inclusive)
knn_estimator = KNeighborsClassifier()
knn_grid = {'n_neighbors': range(MIN, MAX + 1)}
knn_clf = GridSearchCV(knn_estimator, knn_grid, scoring=scoring)
knn_clf.fit(X_train, y_train)

print(f"Best estimator: {knn_clf.best_estimator_}")
knn_pred = knn_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=knn_pred))
plot_confusion_matrix(knn_clf, X_test, y_test)

# Random Forest (with grid search)

In [None]:
# Grid search over forest size and split criterion, scored with `scoring`.
estimator = RandomForestClassifier(random_state=13)
rf_grid = {
    'n_estimators': [10, 20, 50, 100],
    'criterion': ['entropy', 'gini'],
}
rf_clf = GridSearchCV(estimator, param_grid=rf_grid, scoring=scoring)
rf_clf.fit(X_train, y_train)

rf_pred = rf_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=rf_pred))
plot_confusion_matrix(rf_clf, X_test, y_test)

In [None]:
# Test-set F1 of every fitted classifier, once per choice of positive label.
model = ['NB', 'Logistic', 'KNN', 'RF']
clfs = [nb_clf, log_clf, knn_clf, rf_clf]

# Predict once per classifier and reuse the predictions for both scores.
test_predictions = [clf.predict(X_test) for clf in clfs]

f1_att = [
    round(f1_score(y_true=y_test, y_pred=pred, pos_label='Attrited Customer'), 2)
    for pred in test_predictions
]
f1_ex = [
    round(f1_score(y_true=y_test, y_pred=pred, pos_label='Existing Customer'), 2)
    for pred in test_predictions
]

In [None]:
# Side-by-side bar charts of the F1 scores, one panel per positive label.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))
ax = ax.flatten()

panels = [
    (f1_att, 'f1 score (Positive label: Attrited customer)'),
    (f1_ex, 'f1 score (Positive label: Existing customer)'),
]
for axis, (scores, title) in zip(ax, panels):
    axis.bar(model, scores)
    axis.set_title(title)

plt.show()

We see that random forest gives us the most accurate prediction.