# Bank Marketing Dataset - Exploratory Data Analysis
## Ensemble Sampling Model for Predictive Class Imbalance Classification

This notebook explores the Bank Marketing dataset from the UCI Machine Learning Repository.

In [None]:
# Standard-library imports
import os
import sys
import warnings

# Put ../src (relative to the current working directory) at the front of the
# module search path; this must happen before the local import below
sys.path.insert(0, os.path.join(os.getcwd(), '..', 'src'))

# Third-party analysis and plotting libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Local module, resolved through the search-path entry added above
from data_loader import BankMarketingDataLoader

# Plot appearance: seaborn 'whitegrid' style and a 12x6 default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Suppress every warning for the remainder of the session
warnings.filterwarnings(action='ignore')

## 1. Load the Dataset

In [None]:
# Create the loader, pointing it at the relative '../data' directory
loader = BankMarketingDataLoader(data_dir='../data')

# Read the dataset through the loader
df = loader.load_data()

# Report the frame's dimensions, then (after a blank line) its column names
print(
    f"Dataset shape: {df.shape}",
    f"\nColumn names: {df.columns.tolist()}",
    sep="\n",
)

## 2. Dataset Overview

In [None]:
# Preview the first 10 rows; as the last expression in the cell, the result
# is rendered as the cell output
df.head(10)

In [None]:
# Structural overview of the frame via info()
# (assuming df is a pandas DataFrame, this prints per-column dtypes and non-null counts)
df.info()

In [None]:
# Statistical summary via describe() with default arguments
# NOTE(review): with pandas defaults on a mixed-dtype DataFrame this summarises
# the numeric columns only; categorical columns are not included here
df.describe()

## 3. Class Imbalance Analysis

In [None]:
# Count how often each value of the target column 'y' occurs
target_counts = df['y'].value_counts()

print("Target Variable Distribution:")
print(target_counts)

# Ratio of 'no' rows to 'yes' rows, reported as N:1
imbalance_ratio = target_counts['no'] / target_counts['yes']
print(f"\nClass Imbalance Ratio: {imbalance_ratio:.2f}:1")

# One figure with two panels: counts on the left, proportions on the right
class_colors = ['#ff9999', '#66b3ff']
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes

# Left panel: bar chart of the class counts
target_counts.plot.bar(ax=bar_ax, color=class_colors)
bar_ax.set_title('Target Variable Distribution', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Subscribed Term Deposit', fontsize=12)
bar_ax.set_ylabel('Count', fontsize=12)
# Re-apply the existing tick labels without rotation so they read horizontally
bar_ax.set_xticklabels(bar_ax.get_xticklabels(), rotation=0)

# Right panel: pie chart of the same counts with percentage annotations
pie_ax.pie(
    target_counts,
    labels=target_counts.index,
    autopct='%1.1f%%',
    colors=class_colors,
    startangle=90,
)
pie_ax.set_title('Target Variable Proportion', fontsize=14, fontweight='bold')

fig.tight_layout()
plt.show()

## 4. Feature Analysis

In [None]:
# Categorical features: object-dtype columns, excluding the target 'y'.
# Filtering (instead of list.remove) avoids a ValueError when 'y' is not among
# the object-dtype columns.
categorical_cols = [
    col for col in df.select_dtypes(include=['object']).columns
    if col != 'y'
]

print(f"Categorical Features ({len(categorical_cols)}): {categorical_cols}")

# Numerical features: select every numeric dtype rather than only int64/float64,
# so columns stored with another width (e.g. int32, float32) are not silently
# left out. The target is excluded here as well so that both lists contain
# features only.
numerical_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col != 'y'
]
print(f"\nNumerical Features ({len(numerical_cols)}): {numerical_cols}")

In [None]:
# Distribution of numerical features.
# The grid is sized from the number of numerical columns so that every feature
# gets a panel (a fixed 3x3 grid drops any column past the ninth) and panels
# without a feature are hidden rather than shown as empty axes.
n_cols = 3
n_rows = max(1, (len(numerical_cols) + n_cols - 1) // n_cols)

# squeeze=False always returns a 2-D array of axes, so ravel() is valid for
# any grid shape, including a single row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4), squeeze=False)
axes = axes.ravel()

for idx, col in enumerate(numerical_cols):
    # Drop nulls before binning so a column with missing entries can still be drawn
    axes[idx].hist(df[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
    axes[idx].set_title(col, fontweight='bold')
    axes[idx].set_xlabel('Value')
    axes[idx].set_ylabel('Frequency')

# Hide extra subplots
for idx in range(len(numerical_cols), len(axes)):
    axes[idx].axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Bar chart of value counts for every categorical feature, three panels per row
n_cols = 3
n_rows = -(-len(categorical_cols) // n_cols)  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4))
axes = axes.ravel()

# Pair each feature with its own panel, in order
for ax, col in zip(axes, categorical_cols):
    level_counts = df[col].value_counts()
    level_counts.plot.bar(ax=ax, color='steelblue')
    ax.set_title(f'{col} Distribution', fontweight='bold')
    ax.set_xlabel('')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

# Switch off the panels left over at the end of the grid
for unused_ax in axes[len(categorical_cols):]:
    unused_ax.axis('off')

fig.tight_layout()
plt.show()

## 5. Correlation Analysis

In [None]:
# Run the loader's preprocessing to obtain the feature matrix and the target
X, y = loader.preprocess_data(df)

# Copy of the features with the target attached as an extra 'target' column
# (assumes X is a pandas DataFrame - TODO confirm against the loader)
data_for_corr = X.assign(target=y)

# Pairwise correlations between all columns, target included
correlation_matrix = data_for_corr.corr()

# Rank columns by absolute correlation with the target and keep the first 20;
# the 'target' column itself takes part in this ranking
abs_target_corr = correlation_matrix['target'].abs()
top_features = abs_target_corr.sort_values(ascending=False).head(20).index

# Heatmap restricted to the selected columns
plt.figure(figsize=(12, 10))
heatmap_data = correlation_matrix.loc[top_features, top_features]
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
plt.title('Correlation Matrix - Top 20 Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Correlation of each feature with the target, sorted from highest to lowest.
# NaN correlations are dropped first: sort_values places NaN last, so they
# would otherwise fall into the tail slice used for the negative chart.
target_corr = (
    correlation_matrix['target']
    .drop('target')
    .dropna()
    .sort_values(ascending=False)
)

# Split by sign so that each chart contains only what its title states,
# even when fewer than 15 features have that sign
top_positive = target_corr[target_corr > 0].head(15)
top_negative = target_corr[target_corr < 0].tail(15)

# Same chart layout for both subsets; only data, colour and title differ
for subset, bar_color, chart_title in (
    (top_positive, 'green', 'Top 15 Features Positively Correlated with Target'),
    (top_negative, 'red', 'Top 15 Features Negatively Correlated with Target'),
):
    if subset.empty:
        # Nothing with this sign: report it instead of attempting an empty chart
        print(f"{chart_title}: no features to plot")
        continue

    plt.figure(figsize=(10, 8))
    subset.plot(kind='barh', color=bar_color, alpha=0.7)
    plt.title(chart_title, fontsize=14, fontweight='bold')
    plt.xlabel('Correlation Coefficient')
    plt.tight_layout()
    plt.show()

## 6. Missing Values Analysis

In [None]:
# Null count per column, and the same figure as a percentage of all rows
missing_values = df.isnull().sum()
missing_percentage = missing_values / len(df) * 100

# Keep only the columns that contain nulls, ordered from most to fewest
missing_df = (
    pd.DataFrame({
        'Missing Count': missing_values,
        'Percentage': missing_percentage,
    })
    .loc[lambda frame: frame['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

# Print the table when any column has nulls, otherwise a confirmation message
if missing_df.empty:
    print("No missing values found in the dataset!")
else:
    print("Missing Values:")
    print(missing_df)

## 7. Numerical Feature Distributions by Target

In [None]:
# One box plot per numerical feature (first five only), grouped by the target 'y'
for col in numerical_cols[:5]:
    box_fig, box_ax = plt.subplots(figsize=(10, 4))

    # Draw the grouped box plot onto the axes created above
    df.boxplot(column=col, by='y', ax=box_ax)

    # Replace the axes title and blank out the figure-level title
    box_ax.set_title(f'{col} Distribution by Target', fontweight='bold')
    box_fig.suptitle('')

    box_ax.set_xlabel('Subscribed Term Deposit')
    box_ax.set_ylabel(col)
    box_fig.tight_layout()
    plt.show()

## 8. Key Findings

Based on the exploratory data analysis:

1. **Class Imbalance**: The dataset exhibits significant class imbalance, with the majority class (no subscription) being much larger than the minority class (subscription). This justifies the use of ensemble sampling techniques.

2. **Feature Types**: The dataset contains both numerical and categorical features that need to be preprocessed for machine learning models.

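3. **Data Quality**: The dataset appears to have no null (NaN) values, which simplifies preprocessing. Note that the check in Section 6 only detects null entries; placeholder categories such as `unknown`, if present in the categorical columns, are not counted as missing and may still need handling.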

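4. **Feature Importance**: Some features show a stronger correlation with the target variable than others (see Section 5); these are candidate predictors, although correlation alone does not establish how important a feature will be to a model.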

5. **Next Steps**: 
   - Apply various sampling techniques (SMOTE, random oversampling, etc.)
   - Train multiple classifiers
   - Evaluate using appropriate metrics for imbalanced classification (Precision, Recall, F1-Score, ROC-AUC)