<h1 align="center" style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">Bank Customer Churn Prediction using SVM</h1>

<h2 style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">1. Import Libraries and Load Dataset</h2>

In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 120)
pd.set_option("display.max_rows", 120)
pd.set_option('display.float_format', '{:.2f}'.format)

import warnings 
warnings.simplefilter(action='ignore', category=FutureWarning)
# warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv')

<h2 style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">2. Dataset Preview</h2>

In [3]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10002 non-null  int64  
 1   CustomerId       10002 non-null  int64  
 2   Surname          10002 non-null  object 
 3   CreditScore      10002 non-null  int64  
 4   Geography        10001 non-null  object 
 5   Gender           10002 non-null  object 
 6   Age              10001 non-null  float64
 7   Tenure           10002 non-null  int64  
 8   Balance          10002 non-null  float64
 9   NumOfProducts    10002 non-null  int64  
 10  HasCrCard        10001 non-null  float64
 11  IsActiveMember   10001 non-null  float64
 12  EstimatedSalary  10002 non-null  float64
 13  Exited           10002 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 1.1+ MB


In [5]:
data['Exited'].unique()

array([1, 0])

In [6]:
data.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10002.0,10002.0,10002.0,10001.0,10002.0,10002.0,10002.0,10001.0,10001.0,10002.0,10002.0
mean,5001.5,15690933.45,650.56,38.92,5.01,76491.11,1.53,0.71,0.51,100083.33,0.2
std,2887.47,71931.77,96.66,10.49,2.89,62393.47,0.58,0.46,0.5,57508.12,0.4
min,1.0,15565701.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2501.25,15628524.75,584.0,32.0,3.0,0.0,1.0,0.0,0.0,50983.75,0.0
50%,5001.5,15690732.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100185.24,0.0
75%,7501.75,15753225.5,718.0,44.0,7.0,127647.84,2.0,1.0,1.0,149383.65,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


<h2 style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">3. Data Wrangling</h2>

<h4 style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">3.1. Drop ID columns</h4>

In [7]:
data = data.drop(columns=['RowNumber', 'CustomerId'], axis=1)

<h4 style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">3.2. Drop NaNs</h4>

In [8]:
data = data.dropna()

<h4 style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">3.3. Change Column DataType</h4>

In [9]:
columns_to_convert = ['HasCrCard', 'IsActiveMember', 'Tenure']

for column in columns_to_convert:
    data[column] = data[column].astype('int').astype('object')

<h2 style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">4. Exploratory Data Analysis</h2>

In [10]:
# from fasteda import fast_eda
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
sns.set_style("darkgrid")

In [11]:
def custom_show(fig):
    fig.update_layout(title_x=0.5, title_y=0.9)
#     fig.show('png', width=1000, height=550)

In [12]:
plot_color = ['#FF6F61', '#008080']
sns.set_palette(['#008080', 'black'])

In [13]:
# !pip install -U kaleido

In [14]:
def custom_barplot(dataframe: pd.DataFrame, column: str, titles: list, labels: dict) -> go.Figure:
    """create a barplot for visualizing a column distribution and 
       its relationship between the target column 
       
       df: the data from which we intend to create the plot -> pandas DataFrame
       
       column: the x-axis column name -> str
       
       titles: A list of the plot and x-axis title respectively -> list
       
       label: a mapping of the x-label and y-label -> dict
    """
    df = pd.crosstab(dataframe[column], dataframe['Exited'])
    data_normalized = df.div(df.sum(1).astype('float'), axis=0).sort_values(by=0)

    fig = px.bar(data_normalized,
                 x=data_normalized.index,
                 y=[0, 1],
                 color_discrete_sequence=plot_color,
                 title=titles[0],
                 labels=labels,
                 width=800,
                 height=500)

    fig.update_layout(xaxis_title=titles[1],
                      yaxis_title='',
                      legend_title='Customer Churn?',
                      title_x=0.5)

    return fig

<h4 style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">4.1. Univariate Analysis</h4>

**Exited**

In [15]:
exited_count = data['Exited'].value_counts()
exited_count

Exited
0    7960
1    2038
Name: count, dtype: int64

In [16]:
fig = px.pie(
    exited_count, values=exited_count.values, names=['No', 'Yes'],
    title='Proportion of customer churn',
    color_discrete_sequence=plot_color,
)

fig.update_layout(legend_title_text='Customer churn?', title_x=0.3, title_y=0.95,)
custom_show(fig)

<h4 style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">4.2. Bivariate Analysis</h4>

**Age**

In [17]:
fig = px.histogram(data, x='Age', color='Exited', title='Age distribution', color_discrete_sequence=plot_color)
fig.update_layout(title_x=0.5, title_y=0.9, legend_title='Customer Churn')
custom_show(fig)

**Gender**

In [18]:
data['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [19]:
fig = px.histogram(data, x='Gender', color='Exited', 
                   color_discrete_sequence=plot_color,
                   title="Gender Churn", width=700, height=500)
fig.update_layout(title_x=0.5, legend_title='Customer Churn')
custom_show(fig)

**Tenure**

In [20]:
data['Tenure'].unique()

array([2, 1, 8, 4, 6, 3, 10, 5, 7, 9, 0], dtype=object)

In [21]:
titles = ['Proportion of customer churn by tenure', 'Tenure']
labels={'index': 'Tenure', 'value': 'Proportion'}

custom_barplot(data, 'Tenure', titles, labels)

There appears to be no correlation between `tenure` and `customer churn`. This column is best one-hot-encoded rather than ordinal encoded.

**NumOfProducts**

In [22]:
titles = ['Proportion of customer churn by number of products', 'NumOfProducts']
labels={'index': 'Number of Products', 'value': 'Proportion'}

custom_barplot(data, 'NumOfProducts', titles, labels)

There appears to be no correlation between `tenure` and `number of products`. Also, this column is best one-hot-encoded rather than ordinal encoded.

<h2 style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">5. Data Preparation and Preprocessing</h2>

In [23]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn import set_config
set_config(display='diagram')

<h4 style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">5.1. Ordinal Encode Gender Column</h4>

In [24]:
mapping = {'Male': 0, 'Female': 1}

In [25]:
data['Gender'] = data['Gender'].map(mapping)

<h4 style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">5.2. Data Preprocessing Pipeline</h4>

In [26]:
X = data.drop(columns=['Exited'], axis=1)
y = data['Exited']

In [27]:
num_cols = X.select_dtypes(include=np.number).columns.to_list()
cat_cols = X.select_dtypes(exclude=np.number).columns.to_list()

print(num_cols, end='\n\n')
print(cat_cols)

['CreditScore', 'Gender', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

['Surname', 'Geography', 'Tenure', 'HasCrCard', 'IsActiveMember']


In [28]:
num_pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler())
])

cat_pipeline = Pipeline(steps=[
    ('one_hot_enc', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [29]:
col_trans = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols),
    ],
    remainder='drop',
    n_jobs=-1
)

In [30]:
set_config(display='diagram')
display(col_trans)

In [31]:
col_trans.fit(X)

In [32]:
X_preprocessed = col_trans.transform(X)

In [33]:
X_preprocessed.shape, y.shape

((9998, 2956), (9998,))

<h4 style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">5.3. Partition Dataset into Training and Test Set</h4>

In [34]:
y = np.where(y == 0, -1, 1)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

<h2 style="background-color:#0b0504;color:white;border-radius: 8px; padding:12px">6. Support Vector Machine Classifier</h2>

In [36]:
class SVM:
    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        y_ = np.where(y <= 0, -1, 1)  # Ensure the labels are -1 and 1

        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    self.w -= self.learning_rate * (2 * self.lambda_param * self.w)
                else:
                    self.w -= self.learning_rate * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b -= self.learning_rate * y_[idx]

    def predict(self, X):
        linear_output = np.dot(X, self.w) - self.b
        return np.sign(linear_output)

def accuracy(y_true, y_pred):
#     return accuracy_score(y_true, y_pred)
    return np.mean(y_true == y_pred)

In [37]:
svm = SVM()
svm.fit(X_train, y_train)

In [38]:
predictions = svm.predict(X_test)

In [39]:
acc = accuracy(y_test, predictions)
print(f'Accuracy: {acc}')

Accuracy: 0.786
