In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Load Data

In [None]:
# Load the raw customer dataset from the CSV file located next to the notebook.
df = pd.read_csv("./Churn_Modelling.csv")

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
# Peek at the first ten rows of the raw data.
df.head(n=10)

EDA

In [None]:
# Summary statistics; called without arguments, describe() reports on the numeric columns.
df.describe()

In [None]:
# Count / unique / top / freq summary for the object-typed columns.
df.describe(include=['object'])

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# True when every row carries a distinct CustomerId (no duplicated customers).
len(df) == df['CustomerId'].nunique()

In [None]:
# Print the frequency table of each categorical / binary column in turn.
for column_name in ('Geography', 'Gender', 'HasCrCard', 'IsActiveMember'):
    print(df[column_name].value_counts())

In [None]:
# 2x2 grid of count plots, one panel per categorical / binary column.
# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))

for ax, column_name in zip(axes.flat,
                           ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']):
    sns.countplot(data=df, x=column_name, ax=ax)
    ax.set_title(f'Count of Customers by {column_name}')

plt.tight_layout()
plt.show()

In [None]:
# 50-bin histogram of every numeric feature, drawn on one shared figure.
df.hist(
    column=['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
            'EstimatedSalary'],
    bins=50,
    figsize=(10, 10),
)
plt.show()

In [None]:
# Keep only the int64 / float64 columns so that .corr() is computed on numeric data.
# NOTE(review): RowNumber and CustomerId have not been dropped from df at this point, so
# if they are numeric they also show up in the heatmap — confirm that is intended.
numeric_df = df.select_dtypes(include=['int64', 'float64'])
corr_matrix = numeric_df.corr()

# Annotated heatmap of the column-by-column correlation matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Pairwise scatter / distribution grid over the numeric features plus the Exited target.
sns.pairplot(
    data=df[['CreditScore', 'Age', 'Tenure', 'Balance',
             'NumOfProducts', 'EstimatedSalary', 'Exited']],
)
plt.show()

In [None]:
# Column roles used by the cells below.

# Label column.
target_var = ['Exited']

# Identifier columns that are dropped from the frame.
cols_to_remove = ['RowNumber', 'CustomerId']

# Numeric features.
num_feats = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
]

# Categorical / binary features.
cat_feats = [
    'Surname',
    'Geography',
    'Gender',
    'HasCrCard',
    'IsActiveMember',
]

In [None]:
# Target values; selecting with a list keeps this a 2-D (n_samples, 1) array.
y = df[target_var].values

# Remove the identifier columns. Reassigning (instead of inplace=True) and ignoring
# labels that are already gone keeps this cell safe to re-run: the in-place version
# raised KeyError on a second execution because the columns no longer existed.
df = df.drop(columns=cols_to_remove, errors='ignore')

# encoding

In [None]:
# Work on an independent (deep) copy so the encoding steps leave df untouched.
df_encoded = df.copy(deep=True)

Label Encoding for Gender

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the Gender classes first, then replace the column with their integer codes.
le = LabelEncoder().fit(df_encoded['Gender'])
df_encoded['Gender'] = le.transform(df_encoded['Gender'])

One Hot Encoding for Geography

In [None]:
# One-hot encode Geography starting from df_encoded, not the raw df: rebuilding from df
# silently threw away the Gender label encoding applied to df_encoded earlier.
# dtype=int emits 0/1 integer indicator columns directly, so no follow-up cast of
# hard-coded column names is needed.
# Note: Geography is consumed by this step, so re-running this cell on its own
# requires re-running the cells that build df_encoded first.
df_encoded = pd.get_dummies(df_encoded, columns=['Geography'], dtype=int)

In [None]:
# Inspect the first ten rows after encoding.
df_encoded.head(n=10)

Target encoding for Surname

In [None]:
# Number of rows per surname, then broadcast that count back onto every row.
freqs = df_encoded.groupby('Surname').size()
df_encoded['Surname_freq'] = df_encoded['Surname'].map(freqs)

In [None]:
# Mean of Exited (churn rate) within each surname group.
means = df_encoded.groupby('Surname')['Exited'].mean()
means.head(5)

In [None]:
# Overall mean of Exited across all rows.
global_mean = df_encoded.Exited.mean()
global_mean

In [None]:
# Attach each row's per-surname churn rate; rows whose surname has no entry in `means`
# fall back to the overall churn rate.
# The filled Series is assigned back explicitly: calling fillna(inplace=True) on a column
# selection is a chained in-place update, which pandas deprecates and which does not
# modify the parent frame under copy-on-write.
df_encoded['Surname_mean_churn'] = (
    df_encoded['Surname'].map(means).fillna(global_mean)
)

In [None]:
# Leave-one-out target encoding: the mean of Exited over the *other* rows that share
# the surname, i.e. (freq * mean - own Exited) / (freq - 1).
# A surname that occurs only once has no other rows, so the raw formula is 0 / 0 = NaN.
# The denominator is masked for those rows and they fall back to the global churn rate
# instead of propagating NaN into later steps.
df_encoded['Surname_enc'] = (
    (df_encoded.Surname_freq * df_encoded.Surname_mean_churn - df_encoded.Exited)
    / (df_encoded.Surname_freq - 1).where(df_encoded.Surname_freq > 1)
).fillna(global_mean)
df_encoded.head(10)

In [None]:
# First five rows of the encoded frame.
df_encoded.head(5)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows as the test set; y is flattened to 1-D for the split.
# NOTE(review): the frame being split is df, not df_encoded, so none of the encoded
# columns built on df_encoded are carried into the splits — confirm that the encodings
# are meant to be recomputed on the training split.
df_train_val, df_test, y_train_val, y_test = train_test_split(
    df, y.ravel(), test_size=0.1, random_state=42)

# Carve a validation set out of the remaining rows (12% of the non-test portion).
df_train, df_val, y_train, y_val = train_test_split(
    df_train_val, y_train_val, test_size=0.12, random_state=42)

In [None]:
# Distribution of CreditScore in the training split.
sns.violinplot(data=df_train, y='CreditScore')

In [None]:
# Distribution of Age in the training split.
sns.violinplot(data=df_train, y='Age')

In [None]:
# Distribution of Tenure in the training split.
sns.violinplot(data=df_train, y='Tenure')

In [None]:
# Distribution of Balance in the training split.
sns.violinplot(data=df_train, y='Balance')

In [None]:
# Histogram of NumOfProducts in the training split (no KDE overlay).
sns.histplot(df_train['NumOfProducts'], kde=False)

In [None]:
# Kernel density estimate of EstimatedSalary in the training split.
sns.kdeplot(df_train['EstimatedSalary'])

In [None]:
# Per-surname mean of Exited, computed from the training split only.
means = df_train.groupby('Surname')['Exited'].mean()
means.head(5)