In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Preprocessing pipeline for the bank-marketing dataset.
#
# Reads the semicolon-separated CSV, splits it into train/test partitions and
# then fits every transformation (median imputation, one-hot encoding, scaling)
# on the training partition only before applying it to the test partition.
# The resulting feature matrices are written to CSV files in the working
# directory.

# Load the dataset. The location can be overridden through the
# BANK_MARKETING_CSV environment variable so the cell is not tied to a single
# machine; the original local path is kept as the default.
file_path = os.environ.get(
    "BANK_MARKETING_CSV",
    r"C:\Users\samia\OneDrive\Documents\GitHub\springboard\Project Proposal\bank+marketing\bank-additional\bank-additional\bank-additional-full.csv",
)

# Fail loudly instead of calling exit(): inside a notebook exit() shuts the
# kernel down rather than reporting a readable error for this cell.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

bank_additional_full = pd.read_csv(file_path, sep=';')  # Ensure correct separator

# Step 1: Check if there are any missing values in the dataset
print(bank_additional_full.isnull().sum())

# Step 2: Identify target and feature columns
target_column = "y"  # Update if needed
if target_column not in bank_additional_full.columns:
    raise KeyError("Error: Target column not found.")

X = bank_additional_full.drop(columns=[target_column])
y = bank_additional_full[target_column].map({'yes': 1, 'no': 0})  # Convert target to binary

# Series.map yields NaN for any label outside the mapping; surface that here
# instead of letting it propagate into the stratified split.
if y.isnull().any():
    raise ValueError(
        f"Target column '{target_column}' contains values other than 'yes'/'no'."
    )

# Step 3: Train-Test Split BEFORE any fitted transformation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 4: Handle missing values with medians learned from the training
# partition only, so no statistic of the test rows influences preprocessing.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Step 5: One-hot encode categorical columns.
categorical_columns = X_train.select_dtypes(include=['object']).columns

# The training partition defines the dummy layout (baseline level dropped).
X_train = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)

# The test partition is encoded WITHOUT drop_first and then aligned to the
# training layout. Dropping the first level independently on each partition
# can remove a different level in test whenever the training baseline is absent
# from the test rows, which silently mislabels those rows as the baseline.
X_test = pd.get_dummies(X_test, columns=categorical_columns)

# reindex adds training-only columns (filled with 0), discards levels that were
# never seen in training, and puts the columns in the training order.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Give every test column the dtype of its training counterpart so that columns
# added by reindex are stored and serialized the same way as in training.
X_test = X_test.astype(X_train.dtypes.to_dict())

# Step 6: Standardize Numeric Features (Fit on Train, then Apply to Test)
numeric_columns = X_train.select_dtypes(include=['int64', 'float64']).columns

if numeric_columns.empty:
    print("Warning: No numeric columns found.")
else:
    # Cast to float64 up front: the scaler returns floats, and writing floats
    # into int64 columns depends on silent upcasting that newer pandas
    # releases deprecate. astype also returns fresh frames, so the assignments
    # below never write into a view of the original data.
    float_dtypes = {col: 'float64' for col in numeric_columns}
    X_train = X_train.astype(float_dtypes)
    X_test = X_test.astype(float_dtypes)

    scaler = StandardScaler()
    X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
    X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])  # Apply same scaling

# Step 7: Verify Data
print("First 5 rows after preprocessing:\n", X_train.head())

# Step 8: Save Preprocessed Data (Optional)
# NOTE(review): only the feature matrices are written and the row index is
# dropped; y_train / y_test are not persisted by this cell. Confirm whether
# downstream consumers need the targets saved alongside these files.
X_train.to_csv("X_train_preprocessed.csv", index=False)
X_test.to_csv("X_test_preprocessed.csv", index=False)

print("Preprocessed data saved successfully.")


age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64
First 5 rows after preprocessing:
             age  duration  campaign     pdays  previous  emp.var.rate  \
25611  0.863739 -0.120196  0.522981  0.196584 -0.350127     -0.114858   
26010 -0.289722 -0.216732 -0.203688  0.196584  1.653813     -0.114858   
40194  3.651268  3.436173 -0.567023  0.196584 -0.350127     -1.133161   
297   -0.385843 -0.533368 -0.203688  0.196584 -0.350127      0.648868   
36344  1.824956  0.424264 -0.203688  0.196584 -0.350127     -1.896888   

       cons.price.idx  cons.conf.idx  euribor3m  nr.employed  ...  month_may  \
25611       -