# Task 01
- Improve performance of predicting churned customers - recall needs to be higher

# Task 02
- An in-depth Exploratory Data Analysis that can help to visualize where the difference lies between churning and non-churning customers.

__Features Details__
- Clientnum	-	Client number. Unique identifier for the customer holding the account
- Attrition_Flag	-	Internal event (customer activity) variable - if the account is closed then 1 else 0
- Customer_Age	-	Demographic variable - Customer's Age in Years
- Gender	-	Demographic variable - M=Male, F=Female
- Dependent_count	-	Demographic variable - Number of dependents
- Education_Level	-	Demographic variable - Educational Qualification of the account holder (example: high school, college graduate, etc.)
- Marital_Status	-	Demographic variable - Married, Single, Unknown
- Income_Category	-	Demographic variable - Annual Income Category of the account holder (< $40K, $40K - 60K, $60K - $80K, $80K-$120K, > $120K, Unknown)
- Card_Category	-	Product Variable - Type of Card (Blue, Silver, Gold, Platinum)
- Months_on_book	-	Months on book (Time of Relationship)
- Total_Relationship_Count	-	Total no. of products held by the customer
- Months_Inactive_12_mon	-	No. of months inactive in the last 12 months
- Contacts_Count_12_mon	-	No. of Contacts in the last 12 months
- Credit_Limit	-	Credit Limit on the Credit Card
- Total_Revolving_Bal	-	Total Revolving Balance on the Credit Card
- Avg_Open_To_Buy	-	Open to Buy Credit Line (Average of last 12 months)
- Total_Amt_Chng_Q4_Q1	-	Change in Transaction Amount (Q4 over Q1) 
- Total_Trans_Amt	-	Total Transaction Amount (Last 12 months)
- Total_Trans_Ct	-	Total Transaction Count (Last 12 months)
- Total_Ct_Chng_Q4_Q1	-	Change in Transaction Count (Q4 over Q1) 
- Avg_Utilization_Ratio	-	Average Card Utilization Ratio

In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Do not truncate columns or cell contents when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Row truncation is deliberately left at the pandas default.
#pd.set_option('display.max_rows', None)

import gc

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, roc_auc_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns

from plotly.offline import iplot
#to link plotly to pandas
import cufflinks as cf
# Switch cufflinks to offline mode so .iplot() can be used on pandas objects.
cf.go_offline()
# NOTE(review): offline = False here looks contradictory to go_offline() above,
# and world_readable = True is presumably only relevant for hosted plots — confirm intent.
cf.set_config_file(offline = False, world_readable = True)

from IPython.display import display

# Global plotting defaults: figure size, title size, white-grid style and the Set2 palette.
plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams['axes.titlesize'] = 16
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# removed the old names; use the new name when available, the old one otherwise.
plt.style.use('seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')
sns.set_palette('Set2')

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

from time import time, strftime, gmtime
# Wall-clock start of the run; the last cell subtracts this to report total elapsed time.
start = time()
import datetime
# Print the current timestamp at the start of the run.
print(str(datetime.datetime.now()))

import warnings
warnings.simplefilter('ignore')

In [None]:
# Read the raw data and report its shape before any column is touched.
df = pd.read_csv('/kaggle/input/credit-card-customers/BankChurners.csv')
print(df.shape)

# Give the two very long 'Naive_Bayes_Classifier_...' column names short aliases.
nbc_short_names = {
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1': 'NBC 12_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2': 'NBC 12_2',
}
df.rename(columns = nbc_short_names, inplace = True)
df.head()

# Task 02 - EDA

In [None]:
# Number of distinct client ids; compare with the row count printed when the data was loaded.
df['CLIENTNUM'].nunique()

- All client numbers are unique.

- Target feature is 'Attrition_Flag' in which 'Attrited Customer' means it's a churn (1) and 'Existing Customer' means there is no churn (0)
- First we map these values for the target

In [None]:
# Binary target: 'Attrited Customer' -> 1 (churn), 'Existing Customer' -> 0 (no churn).
attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
df['Attrition_Flag'] = df['Attrition_Flag'].map(attrition_codes)

In [None]:
# Class balance of the target, with the count written above each bar.
ax = sns.countplot(data = df, x = 'Attrition_Flag')
for p in ax.patches:
    # Anchor the label at the bar's horizontal centre, just above its top edge.
    # Scaling x by a constant (the previous approach) shifts labels by a different
    # amount per bar and moves the first bar's label the wrong way (its x is negative).
    ax.annotate(f"{p.get_height():.0f}",
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha = 'center', va = 'bottom')

- The dataset is highly imbalanced
- We will have to do upsampling or downsampling

In [None]:
# Summary statistics, transposed so that each feature is one row.
df.describe().T

In [None]:
# Column dtypes and non-null counts.
df.info()

__Check for missing values__

In [None]:
# Number of missing values per column.
df.isna().sum()

There are no missing values in the dataset

__Outliers Check__

In [None]:
# Partition the feature columns by dtype; the target column goes into neither list.
num_cols = []
cat_cols = []
for col in df.columns:
    if col == 'Attrition_Flag':
        continue
    if df[col].dtype != 'object':
        num_cols.append(col)
    else:
        cat_cols.append(col)
len(num_cols), cat_cols

In [None]:
# One boxplot per numeric feature on a two-column grid.
# The grid is sized from num_cols instead of a hard-coded row count, so it stays
# correct if the number of numeric columns changes.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(num_cols) // n_grid_cols))  # ceiling division, at least one row
fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize = (16, 24), squeeze = False)
ax = ax.flatten()

for i, c in enumerate(num_cols):
    sns.boxplot(x = df[c], ax = ax[i])
# Remove only the grid cells that did not receive a plot (an argument-less
# plt.delaxes() removes whatever the current axes happens to be).
for unused in ax[len(num_cols):]:
    fig.delaxes(unused)
fig.suptitle('Outlier Analysis using BoxPlots', fontsize = 25, y = 1)
fig.tight_layout()

- We can remove the two NBC features as NBC 12_1 values are closer to 1 and NBC 12_2 is closer to 0

In [None]:
# Remove the two NBC columns from the frame and from the numeric-column list.
nbc_cols = ['NBC 12_1', 'NBC 12_2']
df.drop(columns = nbc_cols, inplace = True)
num_cols = [name for name in num_cols if name not in nbc_cols]
df.shape, num_cols

__Customer Age__

In [None]:
# Interactive histogram of customer age (50 bins).
age_hist_opts = dict(
    kind = 'hist',
    bins = 50,
    xTitle = 'Customer Age',
    yTitle = 'Count',
    title = 'Customer Age Distribution',
)
df['Customer_Age'].iplot(**age_hist_opts)

In [None]:
# Oldest and youngest customer, shown as (max, min).
ages = df['Customer_Age']
ages.max(), ages.min()

- Max age is 73 which is not an outlier

In [None]:
# Share of each gender; the second slice is pulled out slightly.
gender_counts = df['Gender'].value_counts()
plt.title('Unique Values Count of Gender')
plt.pie(gender_counts.values, labels = gender_counts.index, autopct = '%1.2f%%',
        explode = [0, 0.05], shadow = True);

In [None]:
# Share of each education level; the largest slice is pulled out.
edu_counts = df['Education_Level'].value_counts()
pct = edu_counts.values / edu_counts.sum() * 100

plt.title('Unique Values Count of Education_Level')
plt.pie(edu_counts.values, labels = edu_counts.index, autopct = '%1.2f%%',
        explode = (pct == pct.max()) * 0.1, shadow = True);

In [None]:
# Share of each income category; the largest slice is pulled out.
income_counts = df['Income_Category'].value_counts()
pct = income_counts.values / income_counts.sum() * 100

plt.title('Unique Values Count of Income_Category')
plt.pie(income_counts.values, labels = income_counts.index, autopct = '%1.2f%%',
        explode = (pct == pct.max()) * 0.05, shadow = True);

In [None]:
def show_value(x):
    """Turn a percentage of the Marital_Status total into an absolute count.

    ``x`` is a percentage (0-100); the result is that share of the number of
    rows counted by ``df['Marital_Status'].value_counts()``, rounded to two
    decimals. It is passed to ``plt.pie`` as the ``autopct`` callable so the
    slices show counts instead of percentages.
    """
    total = np.sum(df['Marital_Status'].value_counts().values)
    return np.round(x / 100.0 * total, 2)

# Marital status breakdown; slices are labelled with absolute counts via show_value.
marital_counts = df['Marital_Status'].value_counts()
pct = marital_counts.values / marital_counts.sum() * 100

plt.title('Unique Values Count of Marital_Status')
plt.pie(marital_counts.values, labels = marital_counts.index, autopct = show_value,
        explode = (pct == pct.max()) * 0.05, shadow = True);

- There are 749 customers whose marital status is given as Unknown, let's check their age before deciding whether to impute this category or leave it as a value

In [None]:
# Age distribution of the customers whose marital status is 'Unknown'.
unknown_marital_ages = df.loc[df['Marital_Status'] == 'Unknown', 'Customer_Age']
unknown_marital_ages.iplot(kind = 'hist', bins = 50, linecolor = 'black')

The age distribution is between 25 and 65 years
- We can consider one of the two options for Unknown category
    1. Make the value as 'Married' for ages above 35, below 35 as 'Single'
    2. Keep it as it is

In [None]:
# Number of customers per card category, with the count written above each bar.
plt.title('Unique Values Count of Card_Category')

ax = sns.countplot(data = df, x = 'Card_Category')
for p in ax.patches:
    # Centre the label on the bar rather than scaling x by a constant, which
    # shifts every label by a different amount.
    ax.annotate(f"{p.get_height():.0f}",
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha = 'center', va = 'bottom')

- Blue category is highest - it must be entry-level card

In [None]:
# Number of customers per dependent count, with the count written above each bar.
plt.title('Unique Values Count of Dependent_count')

ax = sns.countplot(data = df, x = 'Dependent_count')
for p in ax.patches:
    # Centre the label on the bar rather than scaling x by a constant, which
    # shifts every label by a different amount.
    ax.annotate(f"{p.get_height():.0f}",
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha = 'center', va = 'bottom')

__Total_Relationship_Count is the	Total no. of products held by the customer__

In [None]:
# Number of customers per number of products held, with the count above each bar.
plt.title('Unique Values Count of Total_Relationship_Count')

ax = sns.countplot(data = df, x = 'Total_Relationship_Count')
for p in ax.patches:
    # Centre the label on the bar rather than scaling x by a constant, which
    # shifts every label by a different amount.
    ax.annotate(f"{p.get_height():.0f}",
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha = 'center', va = 'bottom')

In [None]:
# Interactive histogram of the 12-month transaction count (100 bins).
trans_ct_hist_opts = dict(
    kind = 'hist',
    bins = 100,
    xTitle = 'Total Trans Count',
    yTitle = 'Count',
    title = 'Total Transaction Count Distribution for the last 12 months',
    linecolor = 'black',
)
df['Total_Trans_Ct'].iplot(**trans_ct_hist_opts)

In [None]:
# Interactive histogram of the 12-month transaction amount (100 bins).
df['Total_Trans_Amt'].iplot(
    kind = 'hist',
    bins = 100,
    xTitle = 'Total Trans Amount',
    yTitle = 'Count',
    # The title was truncated to 'nsaction Amount ...'; restored to match the count plot's wording.
    title = 'Total Transaction Amount Distribution for the last 12 months',
    linecolor = 'black'
)

In [None]:
# Number of customers per count of inactive months, with the count above each bar.
plt.title('Number of Customers Inactive - Months_Inactive_12_mon')

ax = sns.countplot(data = df, x = 'Months_Inactive_12_mon')
for p in ax.patches:
    # Centre the label on the bar rather than scaling x by a constant, which
    # shifts every label by a different amount.
    ax.annotate(f"{p.get_height():.0f}",
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha = 'center', va = 'bottom')

In [None]:
# Number of customers per number of contacts, with the count above each bar.
plt.title('Number of Customers Contacted - Contacts_Count_12_mon')

ax = sns.countplot(data = df, x = 'Contacts_Count_12_mon')
for p in ax.patches:
    # Centre the label on the bar rather than scaling x by a constant, which
    # shifts every label by a different amount.
    ax.annotate(f"{p.get_height():.0f}",
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha = 'center', va = 'bottom')

In [None]:
# Number of customers per months-on-book value, with the count above each bar.
plt.figure(figsize = (16, 10))
plt.title('Number of Customers holding the card for months')
ax = sns.countplot(data = df, x = 'Months_on_book')
for p in ax.patches:
    # Centre the label on the bar rather than scaling x by a constant, which
    # shifts every label by a different amount.
    ax.annotate(f"{p.get_height():.0f}",
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha = 'center', va = 'bottom')

- There are 2463 customers who are holding the card for 36 months
- There are 103 customers who are holding the card for 56 months
- 70 customers are holding for 13 months

In [None]:
# Mean of Attrition_Flag (the churn rate, since the flag is 0/1) per card category
# for every Income / Marital status / Education / Gender combination.
group_cols = ['Income_Category', 'Marital_Status', 'Education_Level', 'Gender']
temp = pd.pivot_table(
    data = df,
    index = 'Card_Category',
    columns = group_cols,
    values = ['Attrition_Flag'],
    aggfunc = 'mean',
    fill_value = 0,
)
temp.columns = temp.columns.ravel()
# Optional: drop columns that are all 0
#temp = temp.loc[:, temp.sum(axis = 0) != 0].T
# Transpose so each category combination is a row and each card category a column.
temp = temp.T
green_cmap = sns.light_palette('#2ecc71', as_cmap = True)
temp.style.background_gradient(green_cmap)

In [None]:
# Sum of the per-group mean churn values for each card category (column-wise sum).
temp.sum(axis = 0)

- The pivot table above gives a good insight into churning customers based on the card category by their income, marital status and their education level
- The Blue card holders are the ones churning the most, followed by Silver card holders
- The churn is minimal in Platinum category

In [None]:
# Category combinations where the mean churn of Blue card holders exceeds 0.25.
high_churn_blue = temp.loc[temp['Blue'] > 0.25]
high_churn_blue.style.background_gradient(sns.light_palette('#2ecc71', as_cmap = True))

- The above gives the category where the mean churn is more than 0.25
- Highest customer churn for Blue card holders is in the categories \$80K - \$120K Income, Unknown marital status with Post-Graduate education level and male
- Every group whose mean Attrition value equals 1 consists of a single customer
- It'll be interesting to select the customers(rows) based on these category groups

- Let's create a dict with the index of temp as values

In [None]:
# Build one 'Income_Marital_Education_Gender' key per row of temp (element 0 of
# each index tuple is skipped) and map each key to its row position.
categories = ['_'.join(each[1:5]) for each in temp.index.values]
cat_dict = {name: position for position, name in enumerate(categories)}
cat_dict

In [None]:
# Work on a copy so the combined-category column does not alter df.
df_new = df.copy()

In [None]:
# Join the four categorical columns into one '_'-separated key, drop the originals,
# then replace the key by its integer code from cat_dict.
key_cols = ['Income_Category', 'Marital_Status', 'Education_Level', 'Gender']
combined_key = df_new[key_cols[0]]
for key_col in key_cols[1:]:
    combined_key = combined_key + '_' + df_new[key_col]
df_new['new_cat'] = combined_key
df_new.drop(columns = key_cols, inplace = True)
df_new['new_cat'] = df_new['new_cat'].map(cat_dict)
df_new.head()

In [None]:
# Numeric features whose per-category means are compared between churned and
# non-churned customers in the two pivot tables built next.
pi_values = ['Customer_Age', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 
            'Credit_Limit', 'Total_Revolving_Bal', 'Contacts_Count_12_mon', 'Avg_Open_To_Buy', 'Total_Trans_Amt', 
            'Total_Trans_Ct', 'Avg_Utilization_Ratio', 'Dependent_count']

In [None]:
# Per-category feature means for churned customers; transposed so features are rows.
churned_rows = df_new[df_new['Attrition_Flag'] == 1]
pivot1 = pd.pivot_table(
    data = churned_rows,
    index = 'new_cat',
    values = pi_values,
    aggfunc = 'mean',
    fill_value = 0,
).T
#pivot1.head()

In [None]:
# Per-category feature means for non-churned customers; transposed so features are rows.
retained_rows = df_new[df_new['Attrition_Flag'] == 0]
pivot0 = pd.pivot_table(
    data = retained_rows,
    index = 'new_cat',
    values = pi_values,
    aggfunc = 'mean',
    fill_value = 0,
).T
#pivot0.head()

__Using amazing Plotly Express to plot interactive charts with pandas__

In [None]:
import plotly.express as px

def visualize_plotly(a, b, title):
    """Draw two series as 'Churn' and 'NoChurn' lines over their shared index.

    ``a`` and ``b`` are concatenated column-wise (aligned on index); the index
    becomes the x axis ('New Category'). ``title`` is used as the figure title.
    The figure is shown immediately and nothing is returned.
    """
    frame = pd.concat([a, b], axis = 1)
    frame.columns = ['Churn', 'NoChurn']
    frame['New Category'] = frame.index

    fig = px.line(
        frame,
        x = 'New Category',
        y = ['Churn', 'NoChurn'],
        title = title,
        color_discrete_sequence = ['cyan', 'coral'],
    )
    fig.update_layout({'plot_bgcolor': 'white'})
    # Same faint grid on both axes.
    grid_style = dict(showgrid = True, gridwidth = 1, gridcolor = 'floralwhite')
    fig.update_xaxes(**grid_style)
    fig.update_yaxes(**grid_style)
    fig.show()

In [None]:
# Churn vs no-churn trend lines for the first four features in the pivot.
for feature in pivot1.index[:4]:
    churn_means = pivot1.loc[feature, :]
    no_churn_means = pivot0.loc[feature, :]
    visualize_plotly(churn_means, no_churn_means, title = feature)

# Observations from Plotly Trendlines

*__Avg_Open_To_Buy - Open to buy Credit Line__*
- Both the Churn and No churn customers' trendline are almost the same for this feature

*__Avg_Utilization_Ratio__*
- It's clear that this feature follows a lower trendline for churned customers than for no-churn customers

*__Contacts_Count_12_mon__*
- The bank has contacted the churning customers more frequently than the no-churn customers
- Maybe they would have seen the open to buy Credit Line customers' trend

*__Credit_Limit__*
- Not much difference between the two segments

In [None]:
# Churn vs no-churn trend lines for features 5 to 8 in the pivot.
for feature in pivot1.index[4: 8]:
    churn_means = pivot1.loc[feature, :]
    no_churn_means = pivot0.loc[feature, :]
    visualize_plotly(churn_means, no_churn_means, title = feature)

*__Customer_Age__*
- Customers' age doesn't play a part in churning according to the plot above

*__Months_Inactive_12_mon__*
- Customers who tend to churn were more inactive in the last 12 months than the customers who do not churn

*__Months_on_book__*
- On average, most of the customers who stay with the bank - no churn - have been with them for between 35 and 40 months

In [None]:
# Churn vs no-churn trend lines for the remaining features in the pivot.
for feature in pivot1.index[8:]:
    churn_means = pivot1.loc[feature, :]
    no_churn_means = pivot0.loc[feature, :]
    visualize_plotly(churn_means, no_churn_means, title = feature)

*__Total_Relationship_Count__*
- There is a clear trend that customers who hold more products from the same bank stay with the bank while the customers holding fewer products churn

*__Total_Revolving_Balance__*
- Customers with lower revolving balance are churning more

*__Total_Trans_Amt__*
- Churning customers spend less than non-churning customers

*__Total_Trans_Ct__*
- Customers who churn make fewer transactions using the bank's card than the customers who stay with the bank

# Task 01
- Improve performance of predicting churned customers - recall needs to be higher

In [None]:
# Look at the frame again before preparing it for modelling.
df.head()

In [None]:
# CLIENTNUM is a unique identifier, not a feature: remove it from the frame
# and from the list of numeric columns.
df.drop(columns = 'CLIENTNUM', inplace = True)
num_cols.remove('CLIENTNUM')

__Lets check the correlation between the Numerical Features__

In [None]:
# Pearson and Spearman correlation of the numeric features, each drawn as a
# lower-triangle heatmap (the upper triangle, including the diagonal, is masked).
corr1 = df[num_cols].corr(method = 'pearson')
corr2 = df[num_cols].corr(method = 'spearman')

for corr, heading in ((corr1, 'Pearson Correlation'), (corr2, 'Spearman Correlation')):
    fig = plt.figure(figsize = (10, 8))
    mask = np.triu(np.ones_like(corr, dtype = bool))
    sns.heatmap(corr, mask = mask, annot = True, cmap = 'PiYG', vmin = -1, vmax = +1)
    plt.title(heading)
    plt.xticks(rotation = 50)
    plt.show()

- Obviously Total_Trans_Amt and Total_Amt_Chng_Q4_Q1 are highly correlated as they are interrelated, same for Total_Trans_Ct/Total_Ct_Chng_Q4_Q1

In [None]:
# Example scatter plot for the pair the author identified as having the highest
# (Pearson) correlation; points are coloured by the churn flag.
sns.jointplot(data = df, x = 'Total_Trans_Amt', y = 'Total_Amt_Chng_Q4_Q1', hue = 'Attrition_Flag')
plt.show()

In [None]:
# Same view for transaction count against its Q4/Q1 change, coloured by the churn flag.
sns.jointplot(data = df, x = 'Total_Trans_Ct', y = 'Total_Ct_Chng_Q4_Q1', hue = 'Attrition_Flag')
plt.show()

In [None]:
# Pair noted by the author as having a high negative correlation; coloured by the churn flag.
sns.jointplot(data = df, x = 'Avg_Utilization_Ratio', y = 'Avg_Open_To_Buy', hue = 'Attrition_Flag')
plt.show()

__Label Encode Categorical Features__

In [None]:
# Replace every categorical column by integer codes. The single encoder is
# re-fitted on each column, so it only remembers the last column's classes.
lbl = LabelEncoder()

for col in cat_cols:
    print(f"Label Encoding Categorical Feature - {col.upper()}")
    encoded = lbl.fit_transform(df[col])
    df[col] = encoded
print('Label Encoding done...')
df[cat_cols].head()

__Standardize Numerical Features__

In [None]:
# Standardize the numeric columns in place (scaler fitted on the full frame).
std = StandardScaler()

scaled_values = std.fit_transform(df[num_cols])
df[num_cols] = scaled_values
print('Standardizing Numerical Features done...')
df[num_cols].head()

In [None]:
# Separate the target from the features.
y = df['Attrition_Flag'].copy()
X = df.drop(columns = 'Attrition_Flag')

#Xtrain, Xvalid, ytrain, yvalid = train_test_split(X, y, stratify = y, random_state = 2021)
#print(Xtrain.shape, ytrain.shape, Xvalid.shape, yvalid.shape)

In [None]:
def plot_confusion(mat):
    """Show ``mat`` as an annotated heatmap titled 'Confusion matrix'.

    Rows are labelled 'Actual' and columns 'Predicted'. The figure is displayed
    immediately; nothing is returned.
    """
    frame = pd.DataFrame(mat)
    plt.figure(figsize = (8, 4))
    sns.heatmap(frame, annot = True, annot_kws = {"size": 25}, cmap = 'PiYG', fmt = 'g')
    plt.title('Confusion matrix', y = 1.1, fontsize = 22)
    plt.xlabel('Predicted', fontsize = 18)
    plt.ylabel('Actual', fontsize = 18)
    plt.show()

In [None]:
# Count each class by its label. value_counts() is ordered by frequency, so
# indexing its values by position only works while class 1 is the rarer class.
num_pos_samples = (y == 1).sum()
num_neg_samples = (y == 0).sum()
# Negative-to-positive ratio, used as scale_pos_weight for XGBoost.
num_neg_samples / num_pos_samples

In [None]:
from xgboost import XGBClassifier
import xgboost as xgb

# Booster parameters shared by every xgb.train call in this notebook.
xgb_params = {
         'objective': 'binary:logistic',
         'lambda': 0.0030282073258141168, 
         'alpha': 0.01563845128469084,
         'colsample_bytree': 0.55,
         'subsample': 0.7,
         'learning_rate': 0.01,
         'max_depth': 9,
         # NOTE(review): 'random_state' and 'seed' (below) are both set, with different
         # values — confirm which one XGBoost honours and drop the other.
         'random_state': 2020, 
         # NOTE(review): 257 looks unusually large — verify it was tuned for this dataset.
         'min_child_weight': 257,
         'eval_metric': 'auc',
         'seed': 2021,
         # Up-weight the positive (churn) class by the negative/positive count ratio.
         'scale_pos_weight': num_neg_samples / num_pos_samples
         }

In [None]:
# Baseline: stratified 5-fold CV of XGBoost on the original data. Class imbalance
# is handled only through scale_pos_weight in xgb_params.
n_folds = 5
preds_xg = []
mean_recall = []

skf = StratifiedKFold(n_splits = n_folds)

for i, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold: {i + 1}")
    # split() yields row positions, so select positionally for X and y alike
    # (y[trn_idx] would look the values up as index labels).
    Xtrain, ytrain = X.iloc[trn_idx], y.iloc[trn_idx]
    Xvalid, yvalid = X.iloc[val_idx], y.iloc[val_idx]
    print(Xtrain.shape, ytrain.shape, Xvalid.shape, yvalid.shape)
    
    xg_train = xgb.DMatrix(Xtrain, label = ytrain)
    xg_valid = xgb.DMatrix(Xvalid, label = yvalid)

    # Up to 10000 rounds; stop once the eval metric has not improved for 100 rounds.
    xgboost_sim = xgb.train(xgb_params,
                              xg_train,
                              10000,
                              verbose_eval = 200,
                              evals = [(xg_train, 'train'), (xg_valid, 'valid')],
                              early_stopping_rounds = 100)

    valid_preds = xgboost_sim.predict(xg_valid)
    # Hard 0/1 labels at the 0.5 probability threshold.
    valid_labels = (valid_preds > 0.5).astype(int)
    # Recall of the churn class (label 1): this is the metric the task asks to
    # improve, and it is what the oversampling experiment reports, so the two
    # runs are directly comparable. (A macro average would blend in class-0 recall.)
    fold_recall = recall_score(yvalid, valid_labels)
    print('XGBOOST ROC_AUC_SCORE: ', roc_auc_score(yvalid, valid_preds))
    print('XGBOOST RECALL SCORE: ', fold_recall)
    mean_recall.append(fold_recall)
    conf_mat = confusion_matrix(yvalid, valid_labels)
    plot_confusion(conf_mat)
    
    preds_xg.append(valid_preds)
    print()
print(f"Mean Recall Score: {round(np.mean(mean_recall), 4)}")

# Managing Imbalanced Dataset
There are two ways to manage imbalanced dataset:

- OverSampling
- UnderSampling

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from collections import Counter

# OverSampling

__SMOTE__

In [None]:
# SMOTE experiment: stratified 5-fold CV where ONLY the training part of each
# fold is oversampled. Oversampling the whole dataset before splitting lets
# synthetic points interpolated from validation rows leak into training and
# scores the model on synthetic rows, which inflates the reported recall.
n_folds = 5
preds_xg = []
mean_recall = []
skf = StratifiedKFold(n_splits = n_folds)
smote = SMOTE(random_state = 2021)

print(f"Original Dataset class count: {Counter(y)}")

for i, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold: {i + 1}")
    print('OverSampling...')
    # Resample the training rows only; the validation rows stay untouched real data.
    Xtrain, ytrain = smote.fit_resample(X.iloc[trn_idx], y.iloc[trn_idx])
    Xvalid, yvalid = X.iloc[val_idx], y.iloc[val_idx]
    print(Xtrain.shape, ytrain.shape, Xvalid.shape, yvalid.shape)
    print(f"OverSampled Dataset class count: {Counter(ytrain)}")
    # NOTE(review): SMOTE interpolates the label-encoded categorical columns as if
    # they were continuous — consider SMOTENC; left unchanged here.
    # NOTE(review): xgb_params still carries scale_pos_weight although the training
    # fold is now balanced — confirm whether both corrections are intended.
    
    xg_train = xgb.DMatrix(Xtrain, label = ytrain)
    xg_valid = xgb.DMatrix(Xvalid, label = yvalid)

    # Up to 10000 rounds; stop once the eval metric has not improved for 100 rounds.
    xgboost = xgb.train(xgb_params,
                              xg_train,
                              10000,
                              verbose_eval = 200,
                              evals = [(xg_train, 'train'), (xg_valid, 'valid')],
                              early_stopping_rounds = 100)

    valid_preds = xgboost.predict(xg_valid)
    # Hard 0/1 labels at the 0.5 probability threshold.
    valid_labels = (valid_preds > 0.5).astype(int)
    # Recall of the churn class (label 1), measured on real validation rows.
    fold_recall = recall_score(yvalid, valid_labels)
    print('XGBOOST ROC_AUC_SCORE - OverSampled: ', roc_auc_score(yvalid, valid_preds))
    print('XGBOOST RECALL SCORE - OverSampled: ', fold_recall)
    mean_recall.append(fold_recall)
    conf_mat = confusion_matrix(yvalid, valid_labels)
    plot_confusion(conf_mat)
    
    preds_xg.append(valid_preds)
    print()
print(f"Mean Recall Score: {round(np.mean(mean_recall), 4)}")

# Feature Importance Using SHAP

In [None]:
import shap

shap.initjs()

In [None]:
# Explain xgboost_sim, i.e. the booster left over from the last fold of the baseline CV cell.
# NOTE(review): Xvalid is whichever validation frame the most recently executed CV
# cell left behind — confirm it is the data xgboost_sim was validated on.
explainer = shap.TreeExplainer(xgboost_sim)
shap_values = explainer.shap_values(Xvalid)

In [None]:
# Summary plot of the SHAP values computed above, over all rows of Xvalid.
shap.summary_plot(shap_values, Xvalid)

In [None]:
# Local explanation of one validation row. The SHAP values and the feature values
# must come from the SAME row; previously row 1's SHAP values were paired with
# row 0's features.
# Assumes shap_values is a single (rows x features) array, which is what
# TreeExplainer is expected to return for this binary booster — TODO confirm.
row = 0
shap.force_plot(explainer.expected_value, shap_values[row], Xvalid.iloc[row, :])

- Features in red color influence positively, i.e. drag the prediction value closer to 1, features in blue color - the opposite
- Bigger Total_Trans_Ct, Total_Trans_Amt leads to prediction towards '0' - which we can see from the plotly plots for the same
- Each arrow’s size represents the magnitude of the corresponding feature’s effect
- The “base value” marks the model’s average prediction over the training set

In [None]:
# Report the total run time since `start` as HH:MM:SS.
finish = time()
elapsed_seconds = finish - start
print(strftime("%H:%M:%S", gmtime(elapsed_seconds)))