<a href="https://colab.research.google.com/github/Sazul19/bank-marketing-prediction/blob/Data-preperation/notebooks/data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install ucimlrepo

## Import libraries

import pandas as pd
from ucimlrepo import fetch_ucirepo
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek






In [11]:


# fetch dataset
bank_marketing = fetch_ucirepo(id=222)

# data (as pandas dataframes)
X = bank_marketing.data.features
y = bank_marketing.data.targets
# Combine features (X) and target (y) into one DataFrame
df = X.copy()
df['y'] = y


#categorical columns
categorical_columns=X.select_dtypes(include=['object']).columns.tolist()
print(categorical_columns)

#numerical columns
numerical_columns=X.select_dtypes(include=['int64']).columns.tolist()
print(numerical_columns)



['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
['age', 'balance', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous']


#Handle Missing Values

In [12]:
#Handling missing values

#job
df['job'] = df['job'].fillna('unknown')

#education
df['education'] = df['education'].fillna('unknown')

#contact
df['contact'] = df['contact'].fillna('unknown')

#poutcome
df['poutcome'] = df['poutcome'].fillna('unknown')

#Drop Columns

In [14]:
from sklearn.preprocessing import StandardScaler
# Drop 'duration' for realistic modeling
df= df.drop(columns=['day_of_week','contact'])

# Process 'pdays'
df['pdays_flag'] = (df['pdays'] == -1).astype(int)
df['pdays'] = df['pdays'].replace(-1, 0)

scaler = StandardScaler()
df[['campaign', 'previous']] = scaler.fit_transform(df[['campaign', 'previous']])


#Outliers Handling

In [17]:

# Outlier thresholds
thresholds = {


    'campaign': {'lower': 0, 'upper': 6},
    'pdays': {'lower': -1, 'upper': -1},  # Special case
    'previous': {'lower': 0, 'upper': 0},  # Special case
}

# Function to handle outliers
def handle_outliers(df, thresholds, method="cap"):
    """
    Handles outliers by capping or removing based on thresholds.

    Args:
        df (pd.DataFrame): The dataset.
        thresholds (dict): Dictionary of thresholds for each column.
        method (str): "cap" or "remove".

    Returns:
        pd.DataFrame: Updated dataset.
        dict: Summary of outliers handled.
    """
    df = df.copy()
    summary = {}

    for column, bounds in thresholds.items():
        lower, upper = bounds['lower'], bounds['upper']
        if column in df.columns:
            if method == "cap":
                # Cap the outliers
                num_capped_lower = (df[column] < lower).sum()
                num_capped_upper = (df[column] > upper).sum()
                df[column] = np.where(df[column] < lower, lower, df[column])
                df[column] = np.where(df[column] > upper, upper, df[column])
                summary[column] = {'capped_lower': num_capped_lower, 'capped_upper': num_capped_upper}
            elif method == "remove":
                # Remove rows with outliers
                num_removed = ((df[column] < lower) | (df[column] > upper)).sum()
                df = df[(df[column] >= lower) & (df[column] <= upper)]
                summary[column] = {'removed': num_removed}
            else:
                raise ValueError(f"Invalid method '{method}'. Use 'cap' or 'remove'.")
    return df, summary


# Visualize before handling
print("Before Handling Outliers:")
print(df)

# Handle outliers (choose 'cap' or 'remove')
df_cleaned = handle_outliers(df, thresholds, method="cap")

# Visualize after handling
print("\nAfter Handling Outliers (Capping):")
print(df_cleaned)



# Visualize boxplots for numerical columns
numeric_cols = ['age', 'balance', 'duration', 'campaign']



Before Handling Outliers:
       age           job   marital  education default  balance housing loan  \
0       58    management   married   tertiary      no     2143     yes   no   
1       44    technician    single  secondary      no       29     yes   no   
2       33  entrepreneur   married  secondary      no        2     yes  yes   
3       47   blue-collar   married    unknown      no     1506     yes   no   
4       33       unknown    single    unknown      no        1      no   no   
...    ...           ...       ...        ...     ...      ...     ...  ...   
45206   51    technician   married   tertiary      no      825      no   no   
45207   71       retired  divorced    primary      no     1729      no   no   
45208   72       retired   married  secondary      no     5715      no   no   
45209   57   blue-collar   married  secondary      no      668      no   no   
45210   37  entrepreneur   married  secondary      no     2971      no   no   

      month  duration  ca

#Encoding Categorocal Variables


In [4]:

def encode_features(df):
    """
    Encodes categorical features in the DataFrame using different encoding strategies.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing categorical features.

    Returns:
        pd.DataFrame: The DataFrame with encoded features.
    """
    # 1. One-Hot Encoding for high/moderate cardinality nominal features
    one_hot_features = ['job', 'marital', 'education','contact', 'month','day_of_week' ,'poutcome','education']
    df_one_hot = pd.get_dummies(df[one_hot_features], drop_first=True)



    # 3. Binary Encoding for binary features
    binary_features = ['default', 'housing', 'loan']
    for feature in binary_features:
        df[feature + '_encoded'] = df[feature].map({'no': 0, 'yes': 1})

    # Combine all encoded features
    df_encoded = pd.concat([df.drop(columns=one_hot_features + binary_features), df_one_hot], axis=1)

    return df_encoded




# Call the function
df_encoded = encode_features(df)

# Display the final encoded dataset
print(df_encoded.head())

   age  balance  duration  campaign  pdays  previous   y  default_encoded  \
0   58     2143       261         1     -1         0  no                0   
1   44       29       151         1     -1         0  no                0   
2   33        2        76         1     -1         0  no                0   
3   47     1506        92         1     -1         0  no                0   
4   33        1       198         1     -1         0  no                0   

   housing_encoded  loan_encoded  ...  month_may  month_nov  month_oct  \
0                1             0  ...       True      False      False   
1                1             0  ...       True      False      False   
2                1             1  ...       True      False      False   
3                1             0  ...       True      False      False   
4                0             0  ...       True      False      False   

   month_sep  poutcome_other  poutcome_success  poutcome_unknown  \
0      False           F

#Splitting and Balancing

In [None]:

def split_features_and_target(df, target_column):
    """
    Splits the DataFrame into features and target variable.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        target_column (str): The name of the target column.

    Returns:
        X (pd.DataFrame): Features.
        y (pd.Series): Target variable.
    """
    X = df.drop(columns=[target_column])
    y = df[target_column].map({'no': 0, 'yes': 1})

    # Save the initial split data
    X.to_csv("X_initial.csv", index=False)
    y.to_csv("y_initial.csv", index=False)
    print("Initial features and target saved as CSV.")

    return X, y



In [None]:
def train_test_split_stratified(X, y, test_size=0.2, random_state=42):
    """
    Splits the data into training and testing sets using stratified sampling.

    Parameters:
        X (pd.DataFrame): Features.
        y (pd.Series): Target variable.
        test_size (float): Proportion of the dataset to include in the test split.
        random_state (int): Random seed for reproducibility.

    Returns:
        X_train, X_test, y_train, y_test: Split datasets.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)

    # Save train-test split data
    X_train.to_csv("X_train.csv", index=False)
    X_test.to_csv("X_test.csv", index=False)
    y_train.to_csv("y_train.csv", index=False)
    y_test.to_csv("y_test.csv", index=False)
    print("Train-test split data saved as CSV.")

    return X_train, X_test, y_train, y_test




In [None]:
def apply_smote_tomek(X_train, y_train):
    """
    Applies SMOTE + Tomek Links to the training data to handle class imbalance.

    Parameters:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training target variable.

    Returns:
        X_train_resampled, y_train_resampled: Resampled datasets.
    """
    smote_tomek = SMOTETomek(random_state=42)
    X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train, y_train)

    # Save resampled data
    X_train_resampled.to_csv("X_train_resampled.csv", index=False)
    y_train_resampled.to_csv("y_train_resampled.csv", index=False)
    print("Resampled training data saved as CSV.")

    return X_train_resampled, y_train_resampled

In [None]:

X, y = split_features_and_target(df_encoded, target_column="y")
X_train, X_test, y_train, y_test = train_test_split_stratified(X, y)
X_train_resampled, y_train_resampled = apply_smote_tomek(X_train, y_train)
