In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo
pyo.init_notebook_mode()
sns.set_style('whitegrid')
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix
#import scikitplot as skplt

plt.rc('figure',figsize=(18,9))
%pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

In [None]:
# Load the raw dataset; the relative path expects BankChurners.csv in the working directory.
df = pd.read_csv("BankChurners.csv")

In [None]:
# Preview the first five rows of the raw frame.
df.head(5)
# As the preview shows, the last two columns are not needed for the analysis.

In [None]:
# Dataset to use: every column of the raw frame except the last two.
# An explicit copy is taken because later cells add columns to `data`
# (Income_Numeric, Income_Binned); writing to a bare slice of `df` would raise
# SettingWithCopyWarning and leave it unclear which frame was modified.
data = df.iloc[:, :-2].copy()
# Show only the first rows instead of dumping the whole frame.
data.head()

# Exploratory Data Analysis

In [None]:
# Age Histogram

In [None]:
# Customer ages, taken from the Customer_Age column.
age = data["Customer_Age"]

In [None]:
# Bin edges in steps of 10: 20, 30, ..., 90 (the stop value 100 is excluded by range).
bins = range(20, 100, 10)

In [None]:
# Histogram of customer ages, drawn through the explicit Axes interface.
age_fig, age_ax = plt.subplots(figsize=(10, 6))
age_ax.hist(age, bins=bins, edgecolor='black', alpha=0.7)
age_ax.set_title('Age Distribution of Customers', fontsize=16)
age_ax.set_xlabel('Age', fontsize=14)
age_ax.set_ylabel('Frequency', fontsize=14)
age_ax.set_xticks(bins)  # one tick per bin edge
age_ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# As depicted, most clients fall in the 40 to 60 age range, and the 40-50 bracket contains the most clients

In [None]:
# Card distribution: count customers for every (card category, gender) pair,
# then pivot the gender level into columns so each row is one card category.
pair_counts = data.groupby(['Card_Category', 'Gender']).size()
card_distribution = pair_counts.unstack(level='Gender')

In [None]:
# Turn the counts into row-wise percentages: within each card category the
# male and female shares are expressed relative to that category's total.
per_card_totals = card_distribution.sum(axis=1)
card_gender = card_distribution.div(per_card_totals, axis=0) * 100

In [None]:
# Pie charts: one panel per card category showing its gender split.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))

# Card categories (rows) and gender labels (columns) of the percentage table
card_categories = card_gender.index
gender_labels = card_gender.columns

# Walk the flattened 2x2 grid alongside the categories; zip stops at the
# shorter sequence, so categories beyond the available panels are not drawn.
axes = axes.flatten()
for ax, card in zip(axes, card_categories):
    card_gender.loc[card].plot.pie(
        ax=ax,
        labels=gender_labels,
        autopct='%1.1f%%',
        startangle=90,
        colors=['skyblue', 'lightcoral'],
    )
    ax.set_ylabel('')
    ax.set_title(f"Card: {card}", fontsize=14)

# Adjust layout
plt.tight_layout()
plt.show()

#### Across all card types there is a slight difference in gender distribution: men mostly have Blue cards, but in all other card types there are more women

In [None]:
# Number of customers at each education level
counter = data['Education_Level'].value_counts()

edu_fig, edu_ax = plt.subplots(figsize=(10, 6))
counter.plot.bar(ax=edu_ax, color='skyblue', edgecolor='black', alpha=0.8)
edu_ax.set_title('Education Level Distribution', fontsize=16)
edu_ax.set_xlabel('Education Level', fontsize=14)
edu_ax.set_ylabel('Count', fontsize=14)
# Slant the category labels and right-align them under their bars.
plt.setp(edu_ax.get_xticklabels(), rotation=45, ha='right')
edu_ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

## As depicted, most of our clients have graduated and have at least some level of formal education.

In [None]:
# Marital status: each status as a percentage of the total count
counter = data['Marital_Status'].value_counts()
marital_status = counter.div(counter.sum()).mul(100)

# Only the Married and Single groups are plotted.
married_vs_single = marital_status.loc[['Married', 'Single']]
ms_fig, ms_ax = plt.subplots(figsize=(8, 5))
married_vs_single.plot.bar(ax=ms_ax, color=['skyblue', 'lightcoral'], edgecolor='black', alpha=0.8)
ms_ax.set_title('Percentage of Married vs Single Customers', fontsize=16)
ms_ax.set_xlabel('Marital Status', fontsize=14)
ms_ax.set_ylabel('Percentage', fontsize=14)
plt.setp(ms_ax.get_xticklabels(), rotation=0)  # keep the two labels horizontal
ms_ax.set_ylim(0, 100)
ms_ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


## The chart clearly shows that the dataset contains more Married than Single customers

In [None]:
# Lookup table from each Income_Category label to a single representative number.
# NOTE(review): the numbers look like rough bracket values in thousands of
# dollars — confirm. 'Unknown' maps to None, i.e. it gets no numeric value.
income_levels = {
    'Less than $40K': 20,
    '$40K - $60K': 50,
    '$60K - $80K': 70,
    '$80K - $120K': 100,
    '$120K +': 140,
    'Unknown': None
}

In [None]:
# Add a numeric income column by looking up each Income_Category label in income_levels.
data['Income_Numeric'] = data['Income_Category'].map(income_levels)

In [None]:
# Bin the numeric income levels into labelled ranges.
# The intervals are left-closed / right-open (right=False), so the final edge
# has to be open-ended: with a finite last edge of 140, the '$120K +' customers
# (mapped to exactly 140) fell outside [100, 140), became NaN and silently
# disappeared from the income chart.
# NOTE: now that this group is counted, every other bin's share is smaller
# than before; re-check any percentages quoted in the narrative after re-running.
bins = [0, 20, 40, 60, 80, 100, np.inf]
labels = ['<20', '20-40', '40-60', '60-80', '80-100', '100+']
data['Income_Binned'] = pd.cut(data['Income_Numeric'], bins=bins, labels=labels, right=False)


In [None]:
# Pie chart of the number of customers per income bin (sort=False leaves the counts unsorted)
bin_counts = data['Income_Binned'].value_counts(sort=False)
income_fig, income_ax = plt.subplots(figsize=(8, 8))
bin_counts.plot.pie(
    ax=income_ax,
    autopct='%1.1f%%',
    startangle=90,
    colors=plt.cm.Paired.colors,
    wedgeprops=dict(edgecolor='black'),
)
income_ax.set_title('Income Level Distribution', fontsize=16)
income_ax.set_ylabel('')  # Remove the y-axis label
plt.show()

## The largest share of our clients (roughly 40 percent of those plotted) falls into the 20-40 thousand income bin

# Different Card Categories

In [None]:
# Number of customers holding each card category
counter = data['Card_Category'].value_counts()
card_fig, card_ax = plt.subplots(figsize=(10, 6))
counter.plot.bar(ax=card_ax, color='lightblue', edgecolor='black', alpha=0.8)
card_ax.set_title('Number of Users by Card Category', fontsize=16)
card_ax.set_xlabel('Card Category', fontsize=14)
card_ax.set_ylabel('Number of Users', fontsize=14)
# Slant the category labels and right-align them under their bars.
plt.setp(card_ax.get_xticklabels(), rotation=45, ha='right')
card_ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

### As we can see, most people use Blue cards

In [None]:
# Number of customers per attrition status
counter = data['Attrition_Flag'].value_counts()

churn_fig, churn_ax = plt.subplots(figsize=(8, 8))
counter.plot.pie(
    ax=churn_ax,
    autopct='%1.1f%%',
    startangle=90,
    colors=['skyblue', 'lightcoral'],
    wedgeprops=dict(edgecolor='black'),
)
churn_ax.set_title('Customer Churn Distribution (Existing vs Churned)', fontsize=16)
churn_ax.set_ylabel('')  # Remove the y-axis label
plt.show()

### In this dataset, more customers are existing (staying) than churned

# Data Preprocessing

In [None]:
# Build the modelling frame from the raw data (all columns except the last two).
# Copy first: assigning columns on a bare slice of `df` raises
# SettingWithCopyWarning and makes it ambiguous which frame is being modified.
data_processed = df[df.columns[:-2]].copy()

# Encode the two binary columns as 0/1 (attrited customer = 1, female = 1).
data_processed['Attrition_Flag'] = data_processed['Attrition_Flag'].map({'Attrited Customer': 1, 'Existing Customer': 0})
data_processed['Gender'] = data_processed['Gender'].map({'F': 1, 'M': 0})

# One-hot encode the multi-level categorical columns, dropping the first level of each.
categorical_columns = ['Education_Level', 'Income_Category', 'Marital_Status', 'Card_Category']
data_processed = pd.get_dummies(data_processed, columns=categorical_columns, drop_first=True)

# Remove the client number column; reassign instead of mutating in place so the
# cell reads as a straight pipeline.
columns_to_drop = ['CLIENTNUM']
data_processed = data_processed.drop(columns=columns_to_drop)

In [None]:
# Display the encoded frame to inspect the result of the preprocessing cell.
data_processed

## Data Upsampling Using SMOTE

In [None]:
# Oversample with SMOTE so both Attrition_Flag classes are equally represented.
# A fixed seed makes the synthetic samples, and everything fitted on them,
# reproducible across kernel restarts; 42 matches the random_state already used
# for the train/test split and the random forest.
# NOTE(review): resampling happens before train_test_split, so synthetic rows
# interpolated from rows that later land in the test set can end up in the
# training set. Consider resampling the training split only.
over_sampler = SMOTE(random_state=42)
features = data_processed.drop(columns=['Attrition_Flag'])
target = data_processed['Attrition_Flag']
X, y = over_sampler.fit_resample(features, target)

In [None]:
# Recombine the resampled features and target into one new frame; the target
# is stored under the column name 'Churn'.
upsampled_df = X.assign(Churn=y)

In [None]:
# Split off the columns from position 15 up to (but excluding) the trailing
# 'Churn' column and remove them from upsampled_df.
# NOTE(review): presumably these are the one-hot encoded columns that the PCA
# step compresses — confirm the offset 15 against the actual column order.
pca_input_columns = upsampled_df.columns[15:-1]
data_from_smote = upsampled_df.loc[:, pca_input_columns].copy()
upsampled_df = upsampled_df.drop(columns=pca_input_columns)

# Principal Component Analysis

## We will use principal component analysis to reduce the dimensionality of the one-hot encoded categorical variables. This loses some of the variance, but using a couple of principal components instead of tens of one-hot encoded features will help us construct a better model.



In [None]:
# Number of principal components to keep
N_COMPONENTS = 4

# Seeded because scikit-learn may choose a randomized SVD solver depending on
# its version and the data shape; a fixed random_state keeps the components
# reproducible either way.
pca_model = PCA(n_components=N_COMPONENTS, random_state=42)

# Component scores for every resampled row
pc_matrix = pca_model.fit_transform(data_from_smote)

# Share of variance captured by each component, their total in percent, and the running total
evr = pca_model.explained_variance_ratio_
total_var = evr.sum() * 100
cumsum_evr = np.cumsum(evr)

trace1 = {
    "name": "individual explained variance",
    "type": "bar",
    'y': evr}
trace2 = {
    "name": "cumulative explained variance",
    "type": "scatter",
    'y': cumsum_evr}
# Named `traces`, not `data`: the EDA cells use `data` for the customer
# DataFrame, and overwriting it with a list of plot traces would break any of
# those cells when re-run after this one.
traces = [trace1, trace2]
layout = {
    "xaxis": {"title": "Principal components"},
    "yaxis": {"title": "Explained variance ratio"},
}
fig = go.Figure(data=traces, layout=layout)
fig.update_layout(title='Explained Variance Using {} Dimensions'.format(N_COMPONENTS))
fig.show()

In [None]:
# Attach the principal-component scores as columns PC-0 ... PC-(N_COMPONENTS - 1)
# alongside the remaining upsampled columns.
pc_column_names = [f'PC-{idx}' for idx in range(N_COMPONENTS)]
pc_scores = pd.DataFrame(pc_matrix, columns=pc_column_names)
usampled_df_with_pcs = pd.concat([upsampled_df, pc_scores], axis=1)
usampled_df_with_pcs

In [None]:
# Columns used as model inputs: three original features plus the four principal components.
X_features = ['Total_Trans_Ct','PC-3','PC-1','PC-0','PC-2','Total_Ct_Chng_Q4_Q1','Total_Relationship_Count']

In [None]:
# Feature matrix (the selected columns) and target (the 'Churn' column) for the classifier.
X = usampled_df_with_pcs[X_features]
y = usampled_df_with_pcs['Churn']

In [None]:
# Split into train and test parts; random_state fixes the shuffle so the split is repeatable.
train_x,test_x,train_y,test_y = train_test_split(X,y,random_state=42)

In [None]:
# Random forest classifier with a fixed seed for repeatable results.
rf_algo = RandomForestClassifier(random_state=42)
# Fit on the training features and the training labels. The target must be
# train_y: passing test_x here handed the model a feature matrix with a
# different number of rows instead of the labels.
rf_algo.fit(train_x, train_y)