In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This project presents a demographic analysis of bank customers who stop using credit card services and builds a model to predict which customers will stop using them. The resulting model achieves a recall of 95% and an accuracy of 93%.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import iplot
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV, GridSearchCV

from sklearn.metrics import recall_score, classification_report, accuracy_score


In [None]:
# Load the raw BankChurners table from the Kaggle input directory.
data = pd.read_csv(
    '../input/credit-card-customers/BankChurners.csv'
)

**DATA CLEANING**

From the information, we should drop last two columns.

In [None]:
# Drop the two trailing columns (positions 21 and 22).
# A slice of the column index is used and `data` is rebound instead of
# dropping in place by fixed positions: once the frame is down to 21 columns
# the slice is empty and nothing more is removed, so the cell can be re-run
# safely (the positional lookup [[21, 22]] would raise IndexError then).
data = data.drop(columns=data.columns[21:23])

In [None]:
# Print the column names, non-null counts and dtypes of `data`.
data.info()

There are 21 columns without missing values.

In [None]:
# Names of the categorical (object-dtype) columns, excluding Attrition_Flag.
col_obj = (
    data.select_dtypes(include=['object'])
    .columns
    .drop('Attrition_Flag')
    .tolist()
)
col_obj

In [None]:
# Names of the numeric (int64 / float64) columns, excluding CLIENTNUM.
col_num = (
    data.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('CLIENTNUM')
    .tolist()
)
col_num

# EDA

### Categorical Features

In [None]:
# Working copy of the full customer table.
df_all = data.copy()

# Customers whose Attrition_Flag is "Attrited Customer" (churned).
df_churned = data.loc[data['Attrition_Flag'].eq("Attrited Customer")]

# Customers whose Attrition_Flag is "Existing Customer" (not churned).
df_nonchurned = data.loc[data['Attrition_Flag'].eq("Existing Customer")]

In [None]:
def plot_pie(column, title=""):
    """Show a pie chart of the value counts of one column of `df_all`.

    Parameters
    ----------
    column : label of the `df_all` column to summarise.
    title : text drawn above the chart (empty by default).

    Every wedge is labelled with its category and with its percentage share
    formatted to two decimals. The figure is displayed immediately and
    nothing is returned.
    """
    counts = df_all[column].value_counts()
    plt.pie(counts, labels=counts.index, autopct='%1.2f%%')
    plt.title(title)
    plt.show()

**Attrition Flag**
* Attrition_Flag: Internal event (customer activity) variable - if the account is closed then 1(Attrited Customer) else 0(Existing Customer)

In [None]:
# Pie chart of the attrition split, followed by the total and per-class counts.
plot_pie(column="Attrition_Flag", title='Percentage of Existing and Attrited Customers')
attrition_flag = df_all['Attrition_Flag']
print('Number of Credit Card Service Customers: ', attrition_flag.count())
# 'Dengan rincian' is Indonesian for 'with the following breakdown'.
print('Dengan rincian: ')
print(attrition_flag.value_counts())

There are 16.07% of customers who have stopped credit card services and 83.93% of customers who still use credit card services.

**GENDER**
*  Gender: Demographic variable - M=Male, F=Female

In [None]:
plot_pie(column='Gender', title="All Population")

From All Customers, there are 52.91% with Female gender and 47.09% with Male gender, this figure is balanced.

In [None]:
# Customer counts per gender, split by attrition status. Each bar carries its
# raw count and, above it, its share of all rows in df_all.
plt.figure(figsize=(10, 10))
ax = sns.countplot(x='Gender', hue='Attrition_Flag', data=df_all)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, bar_height + 120,
            '{:1.2f}%'.format(bar_height / len(df_all) * 100),
            ha="center")
    ax.text(bar_center, bar_height + 3, str(bar_height), ha="center")

- There is no significant effect of Gender on Customer Attrition.
- For customers who are no longer using a credit card, the Female Gender is 2.3% greater than Male Gender.
- Comparison of the Male Gender Ratio for Attrition compared to Existing of 0.14.
- Comparison of the Female Gender Ratio for Attrition compared to Existing of 0.17.

**Education Level**
* Education_Level: Demographic variable - Educational Qualification of the account holder (example: high school, college graduate, etc.)

In [None]:
plot_pie(column='Education_Level', title="All Population")

In [None]:
# Customer counts per education level, split by attrition status. Each bar
# carries its raw count and, above it, its share of all rows in df_all.
plt.figure(figsize=(10, 10))
ax = sns.countplot(x='Education_Level', hue='Attrition_Flag', data=df_all)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, bar_height + 80,
            '{:1.2f}%'.format(bar_height / len(df_all) * 100),
            ha="center")
    ax.text(bar_center, bar_height + 20, str(bar_height), ha="center")

- There is no significant effect of Educational Level on Customer Attrition.
- Credit card customers with the highest number of Attrition at the Graduate Education level with a percentage of 4.81%.
- Comparison of Education Unknown Ratio for Attrition compared to Existing of 0.16.
- Comparison of High School Education Ratio for Attrition compared to Existing at 0.15.
- Comparison of the ratio of Education Graduate to Attrition compared to Existing of 0.15.
- Comparison of Education Uneducated Ratio for Attrition compared to Existing at 0.15.
- Comparison of Education College Ratio for Attrition compared to Existing at 0.15.
- Comparison of Education Post-Graduate Ratio for Attrition compared to Existing at 0.17.
- Comparison of Education Doctorate Ratio for Attrition compared to Existing of 0.21. **Highest**

**Marital Status**
* Marital_Status: Demographic variable - Married, Single, Divorced, Unknown

In [None]:
plot_pie(column='Marital_Status', title="All Population")

In [None]:
# Customer counts per marital status, split by attrition status. Each bar
# carries its raw count and, above it, its share of all rows in df_all.
plt.figure(figsize=(8, 10))
ax = sns.countplot(x='Marital_Status', hue='Attrition_Flag', data=df_all)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, bar_height + 120,
            '{:1.2f}%'.format(bar_height / len(df_all) * 100),
            ha="center")
    ax.text(bar_center, bar_height + 20, str(bar_height), ha="center")

- There is no significant effect of Marital Status on Customer Attrition; the graph shows that the proportion of Existing customers is greater than that of Attrited customers in every category.
- Credit card customers with the highest number of Attrition are Married, with a proportion of 7%.
- Comparison of the Married Marital Status Ratio for Attrition compared to Existing of 0.15.
- Comparison of Single Status Marital Ratio for Attrition compared to the existing 0.17.
- Comparison of the Ratio of Divorced Marital Status for Attrition compared to Existing at 0.16.
- Comparison of Unknown Marital Status Ratio for Attrition compared to Existing 0.17.

**Income Category**
* Income_Category: Demographic variable - Annual Income Category of the account holder (Less than \$40K, \$40K - \$60K, \$60K - \$80K, \$80K - \$120K, \$120K +, Unknown)

In [None]:
plot_pie(column='Income_Category', title="All Population")

In [None]:
# Customer counts per income category, split by attrition status. Each bar
# carries its raw count and, above it, its share of all rows in df_all.
plt.figure(figsize=(8, 10))
ax = sns.countplot(x='Income_Category', hue='Attrition_Flag', data=df_all)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, bar_height + 80,
            '{:1.2f}%'.format(bar_height / len(df_all) * 100),
            ha="center")
    ax.text(bar_center, bar_height + 20, str(bar_height), ha="center")

- There is no significant effect of Income Category on Customer Attrition.
- Credit card customers with the highest number of Attrition are in the Less than \$40k category, with a proportion of 6.04%.

- Comparison of Less than \$40k Category Income Ratio for Attrition compared to Existing of 0.17.
- Comparison of 40k-60k Category Income Ratio for Attrition compared to Existing at 0.15.
- Comparison of 60k-80k Category Income Ratio for Attrition compared to Existing at 0.13.
- Comparison of 80k-120k Category Income Ratio for Attrition compared to Existing at 0.15.
- Comparison of 120k + Category Income Ratio for Attrition compared to Existing at 0.17.

**CARD CATEGORY**
* Card_Category: Product Variable - Type of Card (Blue, Silver, Gold, Platinum)

In [None]:
plot_pie(column='Card_Category', title="All Population")

In [None]:
# Customer counts per card category, split by attrition status. Each bar
# carries its raw count and, above it, its share of all rows in df_all.
plt.figure(figsize=(8, 10))
ax = sns.countplot(x='Card_Category', hue='Attrition_Flag', data=df_all)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, bar_height + 170,
            '{:1.2f}%'.format(bar_height / len(df_all) * 100),
            ha="center")
    ax.text(bar_center, bar_height + 20, str(bar_height), ha="center")

- There is no significant effect from the Card Category on Customer Attrition.
- The credit card customer with the highest amount of Attrition on the Blue Card with a percentage of 15%.
- Comparison of Card Category Blue Ratio for Attrition compared to Existing of 0.16.
- Comparison of Card Category Silver Ratio for Attrition compared to Existing of 0.14.
- Comparison of Card Category Gold Ratio for Attrition compared to Existing at 0.18.
- Comparison of Card Category Platinum Ratio for Attrition compared to Existing of 0.25.

### Numerical Features

**Customer Age**
* Customer_Age: Demographic variable - Customer's Age in Years

In [None]:
# Number of customers at each age; every bar is labelled with its count.
plt.figure(figsize=(16, 10))
ax = sns.countplot(x='Customer_Age', data=df_all)
ax.set_title('Number of Age All Bank Customer')
ax.set_xlabel('Age All Customer')
for bar in ax.patches:
    count = bar.get_height()
    ax.annotate(str(count), xy=(bar.get_x() * 1.005, count * 1.005))

- Bank customers as a whole from around 26 - 73 years of age.
- The largest distribution of Bank Customers with ages 42 - 50.
- The highest number of Bank Customers was at the age of 44 years as many as 500 people and the lowest was at the age of 70 and 73 years as many as 1 person.

In [None]:
# Customer counts per age, split by attrition status. The title and x-label
# name both groups because the hue draws existing AND attrited customers.
plt.figure(figsize=(16, 10))
plt.title('Number of Existing and Attrited Customers by Age')
ax = sns.countplot(data=df_all, hue='Attrition_Flag', x='Customer_Age')
plt.xlabel('Customer Age')
for p in ax.patches:
    height = p.get_height()
    # Some ages have no customers in one of the two groups; depending on the
    # seaborn version such a bar can have a non-finite height. Skip it rather
    # than drawing a "nan" label at an undefined position.
    if not np.isfinite(height):
        continue
    ax.annotate(str(height), (p.get_x() * 1.005, height * 1.005))

- Existing customers as a whole from the age of 26 - 73 years.
- The largest distribution of existing customers, aged 44 - 53.
- The highest number of existing customers was at the age of 44 and 49 years as many as 416 people and the lowest was at the ages of 66, 68, 70, and 73 years as many as 1 person.

- Attrited customers as a whole from the age of 26 - 68 years.
- Distribution of most attrited Bank Customers aged 40 - 54.
- The highest number of Bank Customers at the age of 43 and 49 were 85 people and the lowest was at the ages of 28, 66, and 68 years at 1 person.

In [None]:
# Customer_Age distributions of existing vs. attrited customers on one axes.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm the installed version still provides it.
plt.figure(figsize=(16, 10))
for status in ('Existing Customer', 'Attrited Customer'):
    values = df_all.loc[df_all['Attrition_Flag'] == status, 'Customer_Age']
    sns.distplot(values, label=status, rug=True)
plt.legend()

Distribution of Bank Customers who tend to be attrited in the largest age range from 43 - 46 years and 48 - 57 years. There is no significant effect.

**DEPENDENT COUNT**
* Dependent_count: Demographic variable - Number of dependents

In [None]:
# Number of customers per dependent count; every bar is labelled with its count.
plt.figure(figsize=(8, 5))
ax = sns.countplot(x='Dependent_count', data=df_all)
ax.set_title('Number of Dependent by The Customer')
ax.set_xlabel('Dependent All Customer')
for bar in ax.patches:
    count = bar.get_height()
    ax.annotate(str(count), xy=(bar.get_x() * 1.005, count * 1.005))

- Distribution of Dependent Counts for all Customer Banks from 0 -5.
- The highest number is at 3 and the lowest is at 5.

In [None]:
# Customer counts per dependent count, split by attrition status. Each bar
# carries its raw count and, above it, its share of all rows in df_all.
plt.figure(figsize=(8, 10))
ax = sns.countplot(x='Dependent_count', hue='Attrition_Flag', data=df_all)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, bar_height + 70,
            '{:1.2f}%'.format(bar_height / len(df_all) * 100),
            ha="center")
    ax.text(bar_center, bar_height + 20, str(bar_height), ha="center")

- There is no significant effect of Dependent Count on Customer Attrition.
- The credit card customer with the highest amount of Attrition on Dependent Count 3 with a percentage of 4.76%.
- Comparison of Dependent Count 0 ratio for Attrition compared to Existing ratio of 0.14.
- Comparison of Dependent Ratio Count 1 for Attrition compared to Existing of 0.14.
- Comparison of Dependent Ratio Count 2 for Attrition compared to Existing of 0.15.
- Comparison of Dependent Count 3 Ratio for Attrition compared to Existing of 0.17.
- Comparison of Dependent Count 4 Ratio for Attrition compared to Existing of 0.16.
- Comparison of Dependent Count 5 ratio for Attrition compared to Existing ratio of 0.15.

**MONTH ON BOOK**
* Period of relationship with bank

In [None]:
# Number of customers for each Months_on_book value over the whole customer
# base. The title and x-label say "All Customers" because df_all is plotted
# here without any attrition filter or hue.
plt.figure(figsize=(16, 10))
plt.title('Number of All Customers by months holding the card')
ax = sns.countplot(data=df_all, x='Months_on_book')
plt.xlabel('Months holding the card (All Customers)')
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))

- Most customers use 36 months of usage.
- Customers with the lowest usage, namely 13 months, as many as 70 people.
- Customers with the longest usage, namely 56 months as many as 103 people.

In [None]:
# Customer counts per Months_on_book value, split by attrition status; every
# bar is labelled with its count.
plt.figure(figsize=(16, 10))
ax = sns.countplot(x='Months_on_book', hue='Attrition_Flag', data=df_all)
ax.set_title('Number of Customers holding the card for months')
ax.set_xlabel('All Customers holding the card for months')
for bar in ax.patches:
    count = bar.get_height()
    ax.annotate(str(count), xy=(bar.get_x() * 1.005, count * 1.005))

- Most existing customers use 36 months of usage.
- Existing customers with the lowest usage, namely 13 months as many as 63 people.
- Existing customers with the longest usage, namely 56 months as many as 86 people.

- Most attrited customers use 36 months of usage.
- Attrited Customers with the lowest usage, namely 13 months, as many as 7 people.
- Attrited Customers with the longest usage, which is 56 months, totaling 17 people.

In [None]:
# Months_on_book distributions of existing vs. attrited customers on one axes.
plt.figure(figsize=(16, 10))
for status in ('Existing Customer', 'Attrited Customer'):
    values = df_all.loc[df_all['Attrition_Flag'] == status, 'Months_on_book']
    sns.distplot(values, label=status, rug=True)
plt.legend()

There is no significant effect of Months on Book on Customer Attrition.

**Total Relationship Count**
* Total number of products held by the customer.

In [None]:
# Number of customers per Total_Relationship_Count value; every bar is
# labelled with its count.
plt.figure(figsize=(8, 5))
ax = sns.countplot(x='Total_Relationship_Count', data=df_all)
ax.set_title('Number of All Customers Relationship')
ax.set_xlabel('All Customers Relationship')
for bar in ax.patches:
    count = bar.get_height()
    ax.annotate(str(count), xy=(bar.get_x() * 1.005, count * 1.005))

- The highest number of customers using products, amounting to 3. In numbers 4, 5, and 6, the numbers were relatively similar.
- The lowest customer uses 1 product.

In [None]:
# Customer counts per Total_Relationship_Count, split by attrition status.
# Each bar carries its raw count and, above it, its share of all rows in df_all.
plt.figure(figsize=(8, 10))
ax = sns.countplot(x='Total_Relationship_Count', hue='Attrition_Flag', data=df_all)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, bar_height + 60,
            '{:1.2f}%'.format(bar_height / len(df_all) * 100),
            ha="center")
    ax.text(bar_center, bar_height + 15, str(bar_height), ha="center")

- There is no significant effect of the Total Relationship Count on Customer Attrition.
- Credit card customers with the highest number of Attrition on Total Relationship Count 3 with a percentage of 3.95%.
- Comparison of Count 1 Total Relationship Ratio for Attrition compared to Existing of 0.25.
- Comparison of Count 2 Total Relationship Ratio for Attrition compared to Existing of 0.27.
- Comparison of Total Relationship Count 3 ratio for Attrition compared to Existing ratio of 0.17.
- Comparison of Total Relationship Count 4 for Attrition compared to Existing ratio of 0.11.
- Comparison of Total Relationship Count 5 ratio for Attrition compared to Existing ratio of 0.12.
- Comparison of Total Relationship Count 6 for Attrition compared to Existing ratio of 0.10.

**Months_Inactive_12_mon**
* No. of months inactive in the last 12 months

In [None]:
# Number of customers per Months_Inactive_12_mon value; every bar is labelled
# with its count.
plt.figure(figsize=(8, 5))
ax = sns.countplot(x='Months_Inactive_12_mon', data=df_all)
ax.set_title('Number of Month Inactive All Customers in 12 Months')
ax.set_xlabel('Month Inactive All Customers in 12 Months')
for bar in ax.patches:
    count = bar.get_height()
    ax.annotate(str(count), xy=(bar.get_x() * 1.005, count * 1.005))

Customers with the highest number of months of inactivity were 3 in the last 12 months.

In [None]:
# Customer counts per Months_Inactive_12_mon, split by attrition status.
# Each bar carries its raw count and, above it, its share of all rows in df_all.
plt.figure(figsize=(8, 10))
ax = sns.countplot(x='Months_Inactive_12_mon', hue='Attrition_Flag', data=df_all)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, bar_height + 80,
            '{:1.2f}%'.format(bar_height / len(df_all) * 100),
            ha="center")
    ax.text(bar_center, bar_height + 15, str(bar_height), ha="center")

- There is no significant influence from Months Inactive 12 Months to Customer Attrition.
- Credit card customers with the highest amount of Attrition on Months Inactive 12 Months in 3 Months with a percentage of 8.16%.
- Comparison of the Ratio of Inactive 12 Months to 0 Months for Attrition compared to Existing of 0.51.
- Comparison of the Ratio of Inactive 12 Months in 1 Month for Attrition compared to Existing at 0.04.
- Comparison of the Ratio of Inactive 12 Months to 2 Months for Attrition compared to Existing at 0.15.
- Comparison of the Ratio of Inactive 12 Months to 3 Months for Attrition compared to Existing of 0.21.
- Comparison of the Ratio of Inactive 12 Months to 4 Months for Attrition compared to Existing of 0.29.
- Comparison of the Ratio of Inactive 12 Months to 5 Months for Attrition compared to Existing at 0.18.
- Comparison of the Ratio of Inactive 12 Months to 6 Months for Attrition compared to Existing at 0.15.

**Contacts_Count_12_mon**
* No. of Contacts in the last 12 months

In [None]:
# Number of customers per Contacts_Count_12_mon value; every bar is labelled
# with its count.
plt.figure(figsize=(8, 5))
ax = sns.countplot(x='Contacts_Count_12_mon', data=df_all)
ax.set_title('Number of Contacts Count All Customers in 12 Months')
ax.set_xlabel('Contacts Count All Customers in 12 Months')
for bar in ax.patches:
    count = bar.get_height()
    ax.annotate(str(count), xy=(bar.get_x() * 1.005, count * 1.005))

The highest number of customer contacts was 3 in the last 12 months.

In [None]:
# Customer counts per Contacts_Count_12_mon, split by attrition status.
# Each bar carries its raw count and, above it, its share of all rows in df_all.
plt.figure(figsize=(8, 10))
ax = sns.countplot(x='Contacts_Count_12_mon', hue='Attrition_Flag', data=df_all)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, bar_height + 80,
            '{:1.2f}%'.format(bar_height / len(df_all) * 100),
            ha="center")
    ax.text(bar_center, bar_height + 15, str(bar_height), ha="center")

- There is no significant impact from Contacts Count 12 Months on Customer Attrition.
- Credit card customers with the highest number of Attrition have a Contacts Count 12 Months of 3, with a percentage of 6.72%.
- Comparison of the Ratio of Contacts Count for 12 Months at 0 for Attrition compared to Existing at 0.17.
- Comparison of the Ratio of Contacts Count for 12 Months at 1 for Attrition compared to Existing at 0.07.
- Comparison of the Ratio of Contacts Count 12 Months at 2 for Attrition compared to Existing at 0.12.
- Comparison of the Ratio of Contacts Count 12 Months at 3 for Attrition compared to Existing at 0.20.
- Comparison of the Ratio of Contacts Count for 12 Months at 4 for Attrition compared to Existing at 0.22.
- Comparison of the Ratio of Contacts Count 12 Months at 5 for Attrition compared to Existing at 0.33.
- Comparison of the Ratio of Contacts Count for 12 Months at 6 for Attrition compared to Existing at 1.

**Credit_Limit**
* Credit Limit on the Credit Card

In [None]:
# 50-bin count histogram of Credit_Limit over all customers.
# `ax` receives plt.hist's return value, not an Axes object.
plt.figure(figsize=(16, 10))
ax = plt.hist(df_all['Credit_Limit'], density=False, bins=50)
plt.title('Number of Credit Limit All Customers')
plt.xlabel('Credit Limit All Customers')

The highest number of customers with a credit limit of around 1000, then the number significantly decreased to a limit of around 33000. Then the number increased quite rapidly on the credit limit of 33000 - 33500.

In [None]:
# Credit_Limit distributions of existing vs. attrited customers on one axes.
plt.figure(figsize=(16, 10))
for status in ('Existing Customer', 'Attrited Customer'):
    values = df_all.loc[df_all['Attrition_Flag'] == status, 'Credit_Limit']
    sns.distplot(values, label=status)
plt.legend()

- There is a tendency for attrited customers to have a credit limit of less than 1500.
- There is no significant effect of Credit Limit on Attrition Customers.

**Total_Revolving_Bal**
* Total Revolving Balance on the Credit Card

In [None]:
# 50-bin count histogram of Total_Revolving_Bal over all customers.
# `ax` receives plt.hist's return value, not an Axes object.
plt.figure(figsize=(16, 10))
ax = plt.hist(df_all['Total_Revolving_Bal'], density=False, bins=50)
plt.title('Number of Total_Revolving_Bal All Customers')
plt.xlabel('Total_Revolving_Bal All Customers')

The highest number of customers have a revolving balance in the range 0-100.

In [None]:
# Overlaid count histograms of Total_Revolving_Bal for existing and attrited
# customers. Each series is labelled and a legend is drawn so the two groups
# can be told apart; the title and x-label name both groups because both are
# plotted.
plt.figure(figsize=(16, 10))
plt.title('Number of Total_Revolving_Bal Existing and Attrited Customers')
plt.hist(df_nonchurned['Total_Revolving_Bal'], bins=50, density=False,
         label='Existing Customer')
plt.hist(df_churned['Total_Revolving_Bal'], bins=50, density=False,
         label='Attrited Customer')
plt.legend()
plt.xlabel('Total_Revolving_Bal Existing and Attrited Customers')

For both Existing Customers and Attrited Customers, the highest count is in the range 0-100.

In [None]:
# Total_Revolving_Bal distributions of existing vs. attrited customers.
plt.figure(figsize=(16, 10))
for status in ('Existing Customer', 'Attrited Customer'):
    values = df_all.loc[df_all['Attrition_Flag'] == status, 'Total_Revolving_Bal']
    sns.distplot(values, label=status)
plt.legend()

There is a tendency for attrited customers to have a revolving balance in the range 0 - 500.

**Avg_Open_To_Buy**
* Open to Buy Credit Line (Average of last 12 months)

In [None]:
# 50-bin count histogram of Avg_Open_To_Buy over all customers.
# `ax` receives plt.hist's return value, not an Axes object.
plt.figure(figsize=(16, 10))
ax = plt.hist(df_all['Avg_Open_To_Buy'], density=False, bins=50)
plt.title('Number of Avg_Open_To_Buy All Customers')
plt.xlabel('Avg_Open_To_Buy All Customers')

The number of Customers Open to Buy within 12 months has increased in the range of 0-2500. Then there was a significant decline to the range 32000, after that there was an increase back to the range 34500.

In [None]:
# Overlaid count histograms of Avg_Open_To_Buy for existing (yellow) and
# attrited customers. Each series is labelled and a legend is drawn so the
# two groups can be told apart; the x-label now names both groups, matching
# the title.
plt.figure(figsize=(16, 10))
plt.title('Number of Avg_Open_To_Buy Existing And Attrited Customers')
plt.hist(df_nonchurned['Avg_Open_To_Buy'], bins=50, density=False, color='y',
         label='Existing Customer')
plt.hist(df_churned['Avg_Open_To_Buy'], bins=50, density=False,
         label='Attrited Customer')
plt.legend()
plt.xlabel('Avg_Open_To_Buy Existing And Attrited Customers')

- Existing Customer distribution is no different from All Customers.
- Attrited Customer increases in the range 0-3500. Then there was a significant decline to the range 32000, after that there was an increase back to the range 34500.
- The number of Existing Customers is more than Attrited Customers.

In [None]:
# Avg_Open_To_Buy distributions of existing vs. attrited customers.
plt.figure(figsize=(16, 10))
for status in ('Existing Customer', 'Attrited Customer'):
    values = df_all.loc[df_all['Attrition_Flag'] == status, 'Avg_Open_To_Buy']
    sns.distplot(values, label=status)
plt.legend()

There is no significant effect of Avg Open to Buy with Customer Attrition.

**Total_Amt_Chng_Q4_Q1**
* Change in Transaction Amount (Q4 over Q1)

In [None]:
# 50-bin count histogram of Total_Amt_Chng_Q4_Q1 over all customers.
# `ax` receives plt.hist's return value, not an Axes object.
plt.figure(figsize=(16, 10))
ax = plt.hist(df_all['Total_Amt_Chng_Q4_Q1'], density=False, bins=50)
plt.title('Number of Total_Amt_Chng_Q4_Q1 All Customers')
plt.xlabel('Total_Amt_Chng_Q4_Q1 All Customers')

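The highest number of customers have a Change in Transaction Amount (Q4 over Q1) value between 0 and 0.75, after which the count decreases. There are no negative values. Note that if this feature is the ratio of the Q4 amount to the Q1 amount, as its description suggests, the absence of negative values does not by itself mean that transactions never decreased: a value below 1 would indicate a lower amount in Q4 than in Q1.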

In [None]:
# Overlaid count histograms of Total_Amt_Chng_Q4_Q1 for existing and attrited
# customers. Each series is labelled and a legend is drawn so the two groups
# can be told apart.
plt.figure(figsize=(16, 10))
plt.title('Number of Total_Amt_Chng_Q4_Q1 Existing and Attrited Customers')
plt.hist(df_nonchurned['Total_Amt_Chng_Q4_Q1'], bins=50, density=False,
         label='Existing Customer')
plt.hist(df_churned['Total_Amt_Chng_Q4_Q1'], bins=50, density=False,
         label='Attrited Customer')
plt.legend()
plt.xlabel('Total_Amt_Chng_Q4_Q1 Existing and Attrited Customers')

The distribution of Existing and Attrited has no significant difference with the distribution of the number of All Customers.

In [None]:
# Total_Amt_Chng_Q4_Q1 distributions of existing vs. attrited customers.
plt.figure(figsize=(16, 10))
for status in ('Existing Customer', 'Attrited Customer'):
    values = df_all.loc[df_all['Attrition_Flag'] == status, 'Total_Amt_Chng_Q4_Q1']
    sns.distplot(values, label=status)
plt.legend()

There is an effect of Change in Transaction Amount (Q4 over Q1) at a percentage of 0 - 0.5% and 1 - 1.2%.

**Total_Trans_Amt**
* Total Transaction Amount (Last 12 months)

In [None]:
# 50-bin count histogram of Total_Trans_Amt over all customers.
# `ax` receives plt.hist's return value, not an Axes object.
plt.figure(figsize=(16, 10))
ax = plt.hist(df_all['Total_Trans_Amt'], density=False, bins=50)
plt.title('Number of Total_Trans_Amt All Customers')
plt.xlabel('Total_Trans_Amt All Customers')

The distribution of the highest number of transactions is in the range 4000 - 4500.

In [None]:
# Overlaid count histograms of Total_Trans_Amt for existing and attrited
# customers. Each series is labelled and a legend is drawn so the two groups
# can be told apart; the title and x-label name both groups because both are
# plotted.
plt.figure(figsize=(16, 10))
plt.title('Number of Total_Trans_Amt Existing and Attrited Customers')
plt.hist(df_nonchurned['Total_Trans_Amt'], bins=50, density=False,
         label='Existing Customer')
plt.hist(df_churned['Total_Trans_Amt'], bins=50, density=False,
         label='Attrited Customer')
plt.legend()
plt.xlabel('Total_Trans_Amt Existing and Attrited Customers')

- The distribution of the number of transactions between existing customers and all customers is not much different.
- Distribution of the highest number of transactions on Attrited Customers in the amount of 2000 - 2500.

In [None]:
# Total_Trans_Amt distributions of existing vs. attrited customers.
plt.figure(figsize=(16, 10))
for status in ('Existing Customer', 'Attrited Customer'):
    values = df_all.loc[df_all['Attrition_Flag'] == status, 'Total_Trans_Amt']
    sns.distplot(values, label=status)
plt.legend()

There is a tendency for attrited customers to total transactions less than 2500 and in the range 7500 - 10500. However, the highest is less than 2500.

**Total_Trans_Ct**
* Total Transaction Count (Last 12 months)

In [None]:
# 50-bin count histogram of Total_Trans_Ct over all customers.
# `ax` receives plt.hist's return value, not an Axes object.
plt.figure(figsize=(16, 10))
ax = plt.hist(df_all['Total_Trans_Ct'], density=False, bins=50)
plt.title('Number of Total_Trans_Ct All Customers')
plt.xlabel('Total_Trans_Ct All Customers')

The highest total transactions of All Customers, on average, are 60-80 times.

In [None]:
# Overlaid count histograms of Total_Trans_Ct for existing and attrited
# customers. Each series is labelled and a legend is drawn so the two groups
# can be told apart.
plt.figure(figsize=(16, 10))
plt.title('Number of Total_Trans_Ct Customers')
plt.hist(df_nonchurned['Total_Trans_Ct'], bins=50, density=False,
         label='Existing Customer')
plt.hist(df_churned['Total_Trans_Ct'], bins=50, density=False,
         label='Attrited Customer')
plt.legend()
plt.xlabel('Total_Trans_Ct Customers')

- The distribution of the number of transactions between existing customers and all customers is not much different.
- Distribution of the highest number of transactions to Attrited Customers in the range 35 - 45.

In [None]:
# Total_Trans_Ct distributions of existing vs. attrited customers.
plt.figure(figsize=(16, 10))
for status in ('Existing Customer', 'Attrited Customer'):
    values = df_all.loc[df_all['Attrition_Flag'] == status, 'Total_Trans_Ct']
    sns.distplot(values, label=status)
plt.legend()

There is a tendency for attrited customers to have a total transaction count of less than 55.

**Total_Ct_Chng_Q4_Q1**
* Change in Transaction Count (Q4 over Q1)

In [None]:
# 50-bin count histogram of Total_Ct_Chng_Q4_Q1 over all customers.
# `ax` receives plt.hist's return value, not an Axes object.
plt.figure(figsize=(16, 10))
ax = plt.hist(df_all['Total_Ct_Chng_Q4_Q1'], density=False, bins=50)
plt.title('Number of Total_Ct_Chng_Q4_Q1 All Customers')
plt.xlabel('Total_Ct_Chng_Q4_Q1 All Customers')

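The highest number of customers have a Change in Transaction Count (Q4 over Q1) value between 0 and 0.75, after which the count decreases. There are no negative values. Note that if this feature is the ratio of the Q4 count to the Q1 count, as its description suggests, the absence of negative values does not by itself mean that transactions never decreased: a value below 1 would indicate fewer transactions in Q4 than in Q1.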

In [None]:
# Overlaid count histograms of Total_Ct_Chng_Q4_Q1 for existing and attrited
# customers. Each series is labelled and a legend is drawn so the two groups
# can be told apart; the title and x-label name both groups because both are
# plotted.
plt.figure(figsize=(16, 10))
plt.title('Number of Total_Ct_Chng_Q4_Q1 Existing and Attrited Customers')
plt.hist(df_nonchurned['Total_Ct_Chng_Q4_Q1'], bins=50, density=False,
         label='Existing Customer')
plt.hist(df_churned['Total_Ct_Chng_Q4_Q1'], bins=50, density=False,
         label='Attrited Customer')
plt.legend()
plt.xlabel('Total_Ct_Chng_Q4_Q1 Existing and Attrited Customers')
plt.ylabel('Count')

- The percentage distribution of Change in Transaction Count on Existing Customers and All Customers is not much different.
- The percentage distribution of Change in Transaction Count on the highest attrited customers is in the range 0.4 - 0.6.

In [None]:
# Total_Ct_Chng_Q4_Q1 distributions of existing vs. attrited customers.
plt.figure(figsize=(16, 10))
for status in ('Existing Customer', 'Attrited Customer'):
    values = df_all.loc[df_all['Attrition_Flag'] == status, 'Total_Ct_Chng_Q4_Q1']
    sns.distplot(values, label=status)
plt.legend()

There is a tendency for attrited customers to have a Change in Transaction Count value of less than 0.5.

**Avg_Utilization_Ratio**
* Average Card Utilization Ratio

In [None]:
# 50-bin count histogram of Avg_Utilization_Ratio over all customers.
# `ax` receives plt.hist's return value, not an Axes object.
plt.figure(figsize=(16, 10))
ax = plt.hist(df_all['Avg_Utilization_Ratio'], density=False, bins=50)
plt.title('Number of Avg_Utilization_Ratio All Customers')
plt.xlabel('Avg_Utilization_Ratio All Customers')
plt.ylabel('Count')

In [None]:
# Overlaid count histograms of Avg_Utilization_Ratio for existing and
# attrited customers. Each series is labelled and a legend is drawn so the
# two groups can be told apart.
plt.figure(figsize=(16, 10))
plt.title('Number of Avg_Utilization_Ratio')
plt.hist(df_nonchurned['Avg_Utilization_Ratio'], bins=50, density=False,
         label='Existing Customer')
plt.hist(df_churned['Avg_Utilization_Ratio'], bins=50, density=False,
         label='Attrited Customer')
plt.legend()
plt.xlabel('Avg_Utilization_Ratio Customers')
plt.ylabel('Count')

The number of Existing Customers in the Avg_Utilization_Ratio feature is higher than the Attrited Customers.

In [None]:
# Avg_Utilization_Ratio distributions of existing vs. attrited customers.
plt.figure(figsize=(16, 10))
for status in ('Existing Customer', 'Attrited Customer'):
    values = df_all.loc[df_all['Attrition_Flag'] == status, 'Avg_Utilization_Ratio']
    sns.distplot(values, label=status)
plt.legend()

Customers with an Average Card Utilization Ratio of around 0.1 tend towards Attrition.

# Analysis

In [None]:
# Copy of df_all with Attrition_Flag turned into an integer column:
# 'Existing Customer' -> 1, any other value -> 0.
# NOTE(review): this is the opposite direction of the Attrition_Flag
# description earlier in the notebook (1 = attrited) — confirm it is intended.
is_existing = df_all['Attrition_Flag'] == 'Existing Customer'
df_corr = df_all.copy()
df_corr['Attrition_Flag'] = np.where(is_existing, 1, 0)

In [None]:
# Numeric columns that enter the correlation matrix. CLIENTNUM is dropped
# here for consistency with col_num, which excludes it from the numeric
# features as well, so it does not add a meaningless row/column to the
# correlation heatmap.
# int32 is included because np.where can yield a 32-bit integer column on
# some platforms.
col_num_cor = list(
    df_corr.select_dtypes(['int64', 'float64', 'int32']).columns.drop('CLIENTNUM')
)
col_num_cor

In [None]:
# Spearman rank correlation between the selected numeric columns. The heatmap
# shows absolute values, so it conveys the strength of each correlation but
# not its sign.
corr_data = df_corr[col_num_cor].corr(method='spearman')

plt.figure(figsize=(15, 10))
sns.heatmap(
    corr_data.abs(),
    square=True,
    cmap='viridis',
    annot=True,
    fmt='.3f',
)
plt.show()

Among the numeric features, several have a stronger correlation with the Attrition Flag than the others (the heatmap shows absolute Spearman coefficients, so it indicates the strength of each correlation but not its sign), namely:
1. Total_Revolving_Bal
2. Total_Trans_Amt
3. Total_Trans_Ct
4. Total_Ct_Chng_Q4_Q1
5. Avg_Utilization_Ratio

# Summary

## Categorical Data

1. In Categorical feature, Attrition Customer is significantly lower than Existing Customer.
2. Highest attrition number from each feature:
    * GENDER
        - The number of customers with the highest attrition in Feature Gender is female with a greater percentage of 2.3%. The Gender Female Ratio is also greater than Existing, with a value of 0.17. This figure is greater than the Male Gender for Attrition with a value of 0.14. However, this figure does not affect.
    * EDUCATION LEVEL
        - The number of customers with the highest attrition at the Feature Education Level is Graduate with a percentage of 4.81%. For Education Level, the ratio of subscribers to Attrition is more or less the same from High School to College levels. And, slowly there is an increase in the ratio from postgraduate to doctoral level.
        - The increase in this ratio is because the numbers decrease when the Educational Level increases, so it cannot be said that a high ratio tends for greater Attrition. It needs to be seen from other aspects.
    * MARITAL STATUS
        - The number of customers with the highest attrition on the Feature Marital Status is Married with a percentage of 7% and Single at 6.6%. The ratio for Single is 0.16, Divorce and Unknown is 0.17. The ratio numbers for Single, Divorced, and Unknown is bigger than Married with a value of 0.15. So, the Single category has a bigger chance for Attrition, because the ratio is bigger than Married and with an amount that is not much different than Married.
        - The tendency for Attrition in Marital Status Single can be triggered because they do not have more dependents than Married and Divorced ones.
    * Income Category
        - The number of subscribers with the highest attrition in the Feature Income Category is Less than 40k with a percentage of 6.04%. The higher the income category, the lower the number of Customers.
        - The lowest attrition ratio in the Income Category is 60k-80k with a value of 0.18 compared to other categories.
        - The income category ratio of 120k + is the same as Less than 40k with a value of 0.17, this ratio is the highest compared to other categories. The ratio at 120k + is high because the income level is the highest and the number is the least, therefore the attrition ratio is high even though the number of attrited customers differs by 4.8% compared to Less than 40k.
    * Card Category
        - The number of subscribers with the highest Attrition in the feature Card Category is Blue, with a very large percentage of 15%, this figure is closer to the total of Attrition Customers of 16.07%.
        - In other words, the majority of customer attrition occurs among Blue card holders.

## Numeric Data

1. Total Revolving Balance
    - Difference between Credit Limit and Average Open to Buy. This means that the remaining debt from borrowing that has not been paid each month, in 12 months with a monthly average of less than 750 is more likely for Attrition.
    - I hypothesize that users who have a small Revolving Balance every month do not tend to use it according to the portion. Users with income levels according to their daily needs. So, users with stable income like this feel disadvantaged by using credit cards with interest rates, so they tend to quit or move to other banks with lower interest rates.
    
2. Total Transaction Amount
    - Attrition is high among customers whose total transaction amount over the last 12 months is below 2,500 dollars. These customers rarely use their credit cards, which makes them more likely to stop.

3. Total Transaction Count
    - High Customer Attrition on the number of transactions less than 50 times per customer in the last 12 months. These customers feel that they rarely use credit cards which makes them more likely to stop.
    
4. Change in Transaction Count (Q4 over Q1)
    - Change in Transaction Count is the ratio of the increase in the use of the number of credit cards compared to each quarter, the highest Customer Attrition with a value of less than 0.5. This means that there is no change in the number of credit card uses every quarter.

5. Average Utilization Ratio
    - This feature describes how heavily customers use their credit card. The highest attrition ratio is at values below 0.05, which suggests that low card usage can trigger attrition.
    

## General Summary and What Solution maybe that I can Help

From the analysis I have done, it can be concluded that the majority of users and the majority of Attrition are:
1. Level of education from high school to undergraduate (bachelor's degree).
2. Marital Status Married and Single.
3. Income Category Less than $40K.
4. Card Category Blue.
5. The highest age ranges from 37 to 59 years.
6. Number of dependents 2 and 3.
7. Highest card usage for 36 months.
8. Highest customers use 3 products.
9. Customers are not actively using the card for a maximum of 3 months.
10. And the lack of a card usage ratio in daily activities.

From the existing problems, we can take the solution to suppress the Attrition Customer number:
1. Expand the use of credit cards, especially for Category Blue cards from a wider variety of sectors, marketplaces, various merchants, property, health, education, transportation, and others. Create a promotion that can generate interest in using a credit card. In this way, the ratio of credit card usage automatically makes customers think again when they stop using it.
2. Make a requirement if the card is not used at all in 2 months, an administration fee will be charged.
3. Make the requirements again if you have subscribed to a credit card and if you stop later, the promos that the old customers get will not be valid again when registering again.

## Pre Processing

First we drop the Clientnum feature (the first column of the data set).

In [None]:
# Independent copy of the loaded data, so preprocessing does not modify `data`.
df = data.copy()

In [None]:
# Remove the first column of the loaded data (the client identifier).
# Rebuilding df from `data` instead of dropping in place keeps this cell
# idempotent: re-running it no longer raises KeyError for an already-removed column.
df = data.drop(columns=data.columns[0])

### Outliers Check

In [None]:
def plot_box(column):
    """Show a standalone box plot of `df_all[column]`, titled with the column name."""
    values = df_all[column]
    plt.boxplot(values)
    plt.title(column)
    plt.show()


# One figure per entry of col_num (defined earlier in the notebook;
# presumably the numeric feature names - confirm).
for numeric_col in col_num:
    plot_box(numeric_col)

There are significant outliers in the following features:
1. Months_on_book
2. Credit_Limit
3. Avg_Open_To_Buy
4. Total_Amt_Chng_Q4_Q1
5. Total_Trans_Amt
6. Total_Ct_Chng_Q4_Q1

Preprocessing Scheme
- OneHot: Gender, Education_Level, Marital_Status, Income_Category, Card_Category
- Min Max Scaler: Months_on_book, Credit_Limit, Avg_Open_To_Buy, Total_Amt_Chng_Q4_Q1, Total_Trans_Amt, Total_Ct_Chng_Q4_Q1 
- Passthrough: 'Customer_Age', 'Dependent_count', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon' ,'Total_Revolving_Bal', 'Total_Trans_Ct', 'Avg_Utilization_Ratio'

* Attrited Customers = 1
* Existing Customers = 0

* * 0 = Keep Use *
* * 1 = Off *

         - TN: Some customers are predicted to be Keep Use and in fact, it is Keep Use
         - TP: Some customers are predicted to be Off and in fact, it is Off
         - FP: Some customers are predicted to be Off, even though it's Keep Use
         - FN: Some customers are predicted to use Keep Used even though it is Off

Action:
* FP: The prediction is wrong, but the consequence is minor
* FN: The company loses due to the loss of customers

-> The error to minimize is FN, so the metric to optimize is recall

In [None]:
# Binary target: Attrited Customer -> 1, everything else -> 0.
# The labels are read from the untouched `data` frame (df is a row-for-row copy
# of it), so re-running this cell does not collapse every label to 0 the way
# re-encoding the already-numeric df column would.
df['Attrition_Flag'] = np.where(data['Attrition_Flag'] == 'Attrited Customer', 1, 0)

In [None]:
# Columns that are one-hot encoded (first level dropped to avoid a redundant dummy).
one_hot_columns = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
# Columns that are rescaled with MinMaxScaler.
min_max_columns = ['Months_on_book', 'Credit_Limit', 'Avg_Open_To_Buy',
                   'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Ct_Chng_Q4_Q1']

# Every column not listed above is passed through unchanged.
transformer = ColumnTransformer(
    [
        ('one_hot', OneHotEncoder(drop='first'), one_hot_columns),
        ('scale', MinMaxScaler(), min_max_columns),
    ],
    remainder='passthrough',
)

## Split Data

In [None]:
# Target vector and feature matrix (all columns except the target).
y = df['Attrition_Flag']
X = df.drop(columns='Attrition_Flag')

In [None]:
# 70/30 split, stratified on the target so both parts keep the same class ratio;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2020,
    stratify=y,
)

## Modeling Data

*Define Model*
- I use 4 models to predict:
    * Logistic Regression
    * Decision Tree Classifier
    * K-Nearest Neighbor
    * Random Forest

In [None]:
# Candidate classifiers. These instances are reused by every pipeline built
# below, so fitting any pipeline refits the shared estimator object.
logreg = LogisticRegression(solver='liblinear')
tree = DecisionTreeClassifier(random_state = 1010)  # fixed seed for reproducible trees
knn = KNeighborsClassifier(n_neighbors = 2)
rf = RandomForestClassifier(random_state = 1010)  # fixed seed for reproducible forests

## Cross Validation

In [None]:
# Baseline pipelines: the shared preprocessing step followed by one classifier each.
logreg_pipe = Pipeline(steps=[('transformer', transformer), ('logreg', logreg)])
tree_pipe = Pipeline(steps=[('transformer', transformer), ('tree', tree)])
knn_pipe = Pipeline(steps=[('transformer', transformer), ('knn', knn)])
rf_pipe = Pipeline(steps=[('transformer', transformer), ('rf', rf)])


def model_evaluation(model, metric):
    """Return the per-fold scores of `model` on the training split.

    Runs 5-fold stratified cross-validation on X_train / y_train and scores
    each fold with the scorer named by `metric`.
    """
    return cross_val_score(
        model, X_train, y_train, cv=StratifiedKFold(n_splits=5), scoring=metric
    )


baseline_pipes = [logreg_pipe, tree_pipe, knn_pipe, rf_pipe]

# Cross-validated recall on the training split, one score array per pipeline.
logreg_pipe_cv, tree_pipe_cv, knn_pipe_cv, rf_pipe_cv = [
    model_evaluation(pipe, 'recall') for pipe in baseline_pipes
]
baseline_cv_scores = [logreg_pipe_cv, tree_pipe_cv, knn_pipe_cv, rf_pipe_cv]

# Fit every pipeline on the full training split before scoring on the test split.
for model in baseline_pipes:
    model.fit(X_train, y_train)

score_mean = [fold_scores.mean() for fold_scores in baseline_cv_scores]
score_std = [fold_scores.std() for fold_scores in baseline_cv_scores]
score_recall_score = [recall_score(y_test, pipe.predict(X_test)) for pipe in baseline_pipes]
method_name = ['Logistic Regression', 'Decision Tree Classifier', 'KNN Classifier', 'Random Forest Classifier']

# One row per model: CV mean/std recall (train) and recall on the held-out test split.
cv_summary = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'recall score': score_recall_score,
})
cv_summary

From this method, we get the highest recall value on the Decision Tree Classifier and Random Forest Classifier.

In [None]:
# Per-class precision/recall/F1 of the baseline decision tree on the test split.
print(classification_report(y_test, tree_pipe.predict(X_test)))

In [None]:
# Per-class precision/recall/F1 of the baseline random forest on the test split.
print(classification_report(y_test, rf_pipe.predict(X_test)))

## Handling Imbalance

Because of the imbalanced data, now we try handling with Under Sampling and Over Sampling.

### Random Over Sampling

In [None]:
# Random over-sampling of the training data.
ros = RandomOverSampler(random_state = 1010)
# Standalone resample of the whole training split. The pipelines below do not
# use X_over / y_over: they run the sampler themselves when they are fitted.
X_over, y_over = ros.fit_resample(X_train, y_train)

# Preprocess -> over-sample -> classify. Inside cross-validation the sampler
# therefore only ever sees the training folds.
# NOTE: the classifier objects are the same instances used by the baseline
# pipelines, so fitting here also refits the estimators those pipelines hold.
logreg_pipe_over = Pipeline([('transformer', transformer), ('ros', ros), ('logreg', logreg)])
tree_pipe_over = Pipeline([('transformer', transformer), ('ros', ros), ('tree', tree)])
knn_pipe_over = Pipeline([('transformer', transformer), ('ros', ros), ('knn', knn)])
rf_pipe_over = Pipeline([('transformer', transformer), ('ros', ros), ('rf', rf)])

# model_evaluation is defined once in the Cross Validation cell; the identical
# re-definition that used to live here only shadowed it and has been removed.
logreg_over_cv = model_evaluation(logreg_pipe_over, 'recall')
tree_over_cv = model_evaluation(tree_pipe_over, 'recall')
knn_over_cv = model_evaluation(knn_pipe_over, 'recall')
rf_over_cv = model_evaluation(rf_pipe_over, 'recall')

# Fit on the full training split before scoring on the test split.
for model in [logreg_pipe_over, tree_pipe_over, knn_pipe_over, rf_pipe_over]:
    model.fit(X_train, y_train)

score_mean = [logreg_over_cv.mean(), tree_over_cv.mean(), knn_over_cv.mean(),
              rf_over_cv.mean()]
score_std = [logreg_over_cv.std(), tree_over_cv.std(), knn_over_cv.std(),
             rf_over_cv.std()]
score_recall_score = [recall_score(y_test, logreg_pipe_over.predict(X_test)),
            recall_score(y_test, tree_pipe_over.predict(X_test)),
            recall_score(y_test, knn_pipe_over.predict(X_test)),
            recall_score(y_test, rf_pipe_over.predict(X_test))]
method_name = ['Logistic Regression OverSampling', 'Decision Tree Classifier OverSampling',
              'KNN Classifier OverSampling', 'Random Forest Classifier OverSampling']

# One row per model: CV mean/std recall (train) and recall on the held-out test split.
over_summary = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'recall score': score_recall_score
})
over_summary

In [None]:
# Per-class precision/recall/F1 of the over-sampled decision tree on the test split.
print(classification_report(y_test, tree_pipe_over.predict(X_test)))

In [None]:
# Per-class precision/recall/F1 of the over-sampled random forest on the test split.
print(classification_report(y_test, rf_pipe_over.predict(X_test)))

### Random Under Sampling

In [None]:
# Random under-sampling of the training data.
rus = RandomUnderSampler(random_state = 1010)
# Standalone resample of the whole training split. The pipelines below do not
# use X_under / y_under: they run the sampler themselves when they are fitted.
X_under, y_under = rus.fit_resample(X_train, y_train)

# Preprocess -> under-sample -> classify. Inside cross-validation the sampler
# therefore only ever sees the training folds.
# NOTE: the classifier objects are the same instances used by the earlier
# pipelines, so fitting here also refits the estimators those pipelines hold.
logreg_pipe_under = Pipeline([('transformer', transformer), ('rus', rus), ('logreg', logreg)])
tree_pipe_under = Pipeline([('transformer', transformer), ('rus', rus), ('tree', tree)])
knn_pipe_under = Pipeline([('transformer', transformer), ('rus', rus), ('knn', knn)])
rf_pipe_under = Pipeline([('transformer', transformer), ('rus', rus), ('rf', rf)])

# model_evaluation is defined once in the Cross Validation cell; the identical
# re-definition that used to live here only shadowed it and has been removed.
logreg_under_cv = model_evaluation(logreg_pipe_under, 'recall')
tree_under_cv = model_evaluation(tree_pipe_under, 'recall')
knn_under_cv = model_evaluation(knn_pipe_under, 'recall')
rf_under_cv = model_evaluation(rf_pipe_under, 'recall')

# Fit on the full training split before scoring on the test split.
for model in [logreg_pipe_under, tree_pipe_under, knn_pipe_under, rf_pipe_under]:
    model.fit(X_train, y_train)

score_mean = [logreg_under_cv.mean(), tree_under_cv.mean(), knn_under_cv.mean(),
              rf_under_cv.mean()]
score_std = [logreg_under_cv.std(), tree_under_cv.std(), knn_under_cv.std(),
             rf_under_cv.std()]
score_recall_score = [recall_score(y_test, logreg_pipe_under.predict(X_test)),
            recall_score(y_test, tree_pipe_under.predict(X_test)),
            recall_score(y_test, knn_pipe_under.predict(X_test)),
            recall_score(y_test, rf_pipe_under.predict(X_test))]
method_name = ['Logistic Regression UnderSampling', 'Decision Tree Classifier UnderSampling',
              'KNN Classifier UnderSampling', 'Random Forest Classifier UnderSampling']

# One row per model: CV mean/std recall (train) and recall on the held-out test split.
under_summary = pd.DataFrame({
    'method': method_name,
    'mean score': score_mean,
    'std score': score_std,
    'recall score': score_recall_score
})
under_summary

The recall score of the Random Forest Classifier with under-sampling rises to 0.950820, up from 0.790984 before.

In [None]:
# Per-class precision/recall/F1 of the under-sampled decision tree on the test split.
print(classification_report(y_test, tree_pipe_under.predict(X_test)))

In [None]:
# Per-class precision/recall/F1 of the under-sampled random forest on the test split.
print(classification_report(y_test, rf_pipe_under.predict(X_test)))

## Hyperparameter Tuning

In [None]:
# Pipeline to tune: preprocessing -> random under-sampling -> random forest.
estimator = Pipeline([
    ('transformer', transformer),
    ('balancing', rus),
    ('model', rf)
])

# Search space for the random forest step ('model__' prefix addresses it).
# min_samples_split starts at 2: scikit-learn rejects the integer 1 (a split
# needs at least two samples), so every candidate using it failed to fit and
# was scored NaN, wasting one sixth of the grid.
hyperparam_space = {
    'model__max_depth': [2, 5, 7, 10],
    'model__min_samples_leaf': [1, 5, 10, 20, 50, 100],
    'model__min_samples_split': [2, 5, 10, 20, 50, 100],
    'model__criterion': ['gini', 'entropy']
}

# Same CV scheme as the earlier model comparison: 5 stratified folds.
skfold = StratifiedKFold(n_splits=5)

# Exhaustive search optimising recall, using all available cores.
grid_search = GridSearchCV(
    estimator,
    param_grid=hyperparam_space,
    cv=skfold,
    scoring='recall',
    n_jobs=-1
)

In [None]:
# Run the grid search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated recall and the parameter combination that achieved it.
print('best score', grid_search.best_score_)
print('best param', grid_search.best_params_)

In [None]:
print('Recall Score After Hyperparameter Tuning:')
# GridSearchCV was created with the default refit=True, so best_estimator_ is
# already fitted on the full training split; fitting it again was redundant.
y_predict = grid_search.best_estimator_.predict(X_test)
recall_score(y_test, y_predict)

In [None]:
# Test-set recall of the untuned under-sampled random forest, for comparison
# with the tuned model. Note that this overwrites y_predict.
y_predict = rf_pipe_under.predict(X_test)
recall_score(y_test, y_predict)

Recall score after tuning is lower than before tuning.

In [None]:
print('Accuracy Score:')
# Accuracy of the untuned under-sampled random forest (the model kept after the
# tuning comparison). Predicting here, rather than reusing y_predict, makes the
# result independent of which earlier cell last assigned that variable.
accuracy_score(y_test, rf_pipe_under.predict(X_test))