# Exploratory Data Analysis (EDA) for Bank Churn Prediction

## 1. Load the dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw data from a CSV file. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('BankChurners.csv')

# Show the first rows (pandas' default preview size) to sanity-check the load
df.head()

## 2. Initial Data Inspection

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple
dataset_shape = df.shape
print(f"Shape of the dataset: {dataset_shape}")

In [None]:
# Print a concise summary of the frame: column names, dtypes, non-null counts
# and memory usage. `info()` writes to stdout and returns None, so the cell
# has no rich output of its own.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical
# columns; with default arguments non-numeric columns are left out.
df.describe()

In [None]:
# Count missing entries per column (a column of zeros means nothing is missing)
df.isna().sum()

The dataset contains the following columns: `RowNumber`, `CustomerId`, `Surname`, `CreditScore`, `Geography`, `Gender`, `Age`, `Tenure`, `Balance`, `NumOfProducts`, `HasCrCard`, `IsActiveMember`, `EstimatedSalary`, `Exited`. The target variable is `Exited`. `RowNumber`, `CustomerId`, and `Surname` are identifiers rather than customer attributes, so they are not useful for prediction and will be dropped.

In [None]:
# Identifier-like columns that are discarded before analysis
id_columns = ['RowNumber', 'CustomerId', 'Surname']

# `drop` returns a new frame, so the raw `df` is left untouched
df_cleaned = df.drop(columns=id_columns)
df_cleaned.head()

## 3. Target Variable Analysis

In [None]:
# Show how many customers fall into each class of the target 'Exited'
sns.set_theme(style="darkgrid")
ax = sns.countplot(data=df_cleaned, x='Exited')
ax.set_title('Distribution of Target Variable (Exited)')
plt.show()

# Class proportions (fractions that sum to 1) instead of raw counts
exited_share = df_cleaned['Exited'].value_counts(normalize=True)
print(exited_share)

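The target variable is imbalanced: only about 20% of customers have churned (`Exited = 1`), while roughly 80% stayed. Any downstream model should therefore be evaluated with metrics that account for class imbalance, not accuracy alone.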

## 4. Numerical Feature Analysis

In [None]:
# Collect every numeric column, then take the target out of the feature list
numeric_columns = df_cleaned.select_dtypes(include=np.number).columns
numerical_features = numeric_columns.tolist()
numerical_features.remove('Exited')  # the target is not a feature
print(f"Numerical features: {numerical_features}")

# One histogram per numerical feature, all drawn on a single figure
numeric_frame = df_cleaned[numerical_features]
numeric_frame.hist(bins=30, figsize=(15, 10))
plt.gcf().suptitle('Histograms of Numerical Features')
plt.show()

In [None]:
# Plot one boxplot per numerical feature, split by the target 'Exited'.
# The grid is sized from the number of features instead of a fixed 3x3, so the
# cell keeps working when there are more than nine numerical columns
# (plt.subplot(3, 3, 10) would raise a ValueError).
n_grid_cols = 3
n_grid_rows = max(1, -(-len(numerical_features) // n_grid_cols))  # ceiling division

# Keep the original 15x10 figure for three rows and scale the height otherwise.
# squeeze=False guarantees a 2-D axes array even for a single-row grid.
fig, axes = plt.subplots(
    n_grid_rows,
    n_grid_cols,
    figsize=(15, 10 * n_grid_rows / 3),
    squeeze=False,
)
grid_axes = axes.ravel()

for box_ax, col in zip(grid_axes, numerical_features):
    sns.boxplot(x='Exited', y=col, data=df_cleaned, ax=box_ax)

# Hide the panels of the grid that have no feature assigned to them
for unused_ax in grid_axes[len(numerical_features):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()

## 5. Categorical Feature Analysis

In [None]:
# Select categorical features (columns stored with the 'object' dtype)
categorical_features = df_cleaned.select_dtypes(include='object').columns.tolist()
print(f"Categorical features: {categorical_features}")

# Plot count plots for categorical features, with bars split by the target.
# The number of panels follows the feature list instead of a fixed 1x2 grid,
# so a third categorical column no longer makes plt.subplot raise a ValueError.
if categorical_features:
    n_panels = len(categorical_features)
    # 5 inches of width per panel reproduces the original 10x5 figure for two features.
    # squeeze=False keeps `axes` a 2-D array even when there is a single panel.
    fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5), squeeze=False)
    for count_ax, col in zip(axes.ravel(), categorical_features):
        sns.countplot(x=col, hue='Exited', data=df_cleaned, ax=count_ax)
    fig.tight_layout()
    plt.show()

## 6. Correlation Analysis

In [None]:
# Pairwise correlations between the numeric columns (non-numeric ones are skipped)
corr_matrix = df_cleaned.corr(numeric_only=True)

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

## 7. Business Insights
- **Age and Churn**: Older customers seem more likely to churn.
- **Balance and Churn**: Customers with a higher balance are more likely to churn. Customers with a zero balance have a lower churn rate.
- **Geography and Churn**: Customers from Germany have a significantly higher churn rate.
- **Gender and Churn**: Female customers have a slightly higher churn rate.
- **Number of Products**: Customers holding 3 or 4 products have a very high churn rate, suggesting that these larger product bundles may be a poor fit or a source of dissatisfaction for the customers who hold them.
- **Active Members**: As expected, non-active members are much more likely to churn.