<a href="https://colab.research.google.com/github/Senuth-Per/ML_Coursework_Banking_Systerm/blob/main/Ml_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Preprocessing pipeline for the bank-marketing dataset:
# load -> de-duplicate -> flag/impute "unknown" -> encode -> scale -> derive
# features -> split -> save.
#
# The imputer and the scaler are fitted on the TRAINING rows only and then
# applied to every row, so no statistic computed from the test rows leaks
# into the features the model is trained on.

# Load the dataset with the proper delimiter (semicolon)
data_full_additional = pd.read_csv('/content/drive/MyDrive/ML_CW/Colab Notebooks/bank-additional-full - Copy.csv', delimiter=';')

# Step 1: Remove duplicates
print(f"Initial dataset shape: {data_full_additional.shape}")
data_full_additional = data_full_additional.drop_duplicates()
print(f"Shape after removing duplicates: {data_full_additional.shape}")

# Decide train/test membership up front (by row position) so that the
# transformers below can be fitted on the training rows alone. No rows are
# added or removed after this point, so the positions stay valid. The shuffle
# is driven only by the row count and random_state, so this selects the same
# rows as splitting X/y with these settings at the end would.
row_positions = np.arange(len(data_full_additional))
train_pos, test_pos = train_test_split(
    row_positions, test_size=0.2, random_state=42
)

# Step 2: Handle "unknown" values in categorical features
categorical_cols = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome'
]

# Add binary flag columns to indicate "unknown" values.
# An exact comparison is used (not a substring/regex match) so that the
# decision to create a flag and the flag itself rely on the same mask.
for col in categorical_cols:
    unknown_mask = data_full_additional[col].eq('unknown')
    if unknown_mask.any():
        data_full_additional[f'is_unknown_{col}'] = unknown_mask.astype(int)

# Replace "unknown" with NaN for imputation
data_full_additional[categorical_cols] = data_full_additional[categorical_cols].replace('unknown', np.nan)

# Step 3: Handle missing values
# Impute categorical columns with the most frequent value, where "most
# frequent" is learned from the training rows only.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputer.fit(data_full_additional.iloc[train_pos][categorical_cols])
data_full_additional[categorical_cols] = cat_imputer.transform(data_full_additional[categorical_cols])

# Step 4: One-hot encoding for non-ordinal categorical variables
non_ordinal_cols = ['job', 'marital', 'contact', 'poutcome', 'month', 'day_of_week']
data_full_additional = pd.get_dummies(data_full_additional, columns=non_ordinal_cols, drop_first=True)

# Step 5: Map ordinal categorical variables
# Levels are ordered from least to most schooling; 'illiterate' is the lowest
# level, so the encoded value increases monotonically with education.
education_mapping = {
    'illiterate': 0, 'basic.4y': 1, 'basic.6y': 2, 'basic.9y': 3,
    'high.school': 4, 'professional.course': 5, 'university.degree': 6
}
default_mapping = {'no': 0, 'yes': 1}
housing_mapping = {'no': 0, 'yes': 1}
loan_mapping = {'no': 0, 'yes': 1}

data_full_additional['education'] = data_full_additional['education'].map(education_mapping)
data_full_additional['default'] = data_full_additional['default'].map(default_mapping)
data_full_additional['housing'] = data_full_additional['housing'].map(housing_mapping)
data_full_additional['loan'] = data_full_additional['loan'].map(loan_mapping)

# Step 6: Exclude the 'duration' feature for realistic predictive modeling
data_full_additional = data_full_additional.drop(columns=['duration'])

# Step 7: Scale numerical features
# Mean/variance are estimated on the training rows only, then applied to all
# rows (train and test alike).
numerical_cols = [
    'age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
    'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'
]
scaler = StandardScaler()
scaler.fit(data_full_additional.iloc[train_pos][numerical_cols])
data_full_additional[numerical_cols] = scaler.transform(data_full_additional[numerical_cols])

# Step 8: Target encoding
# Encode the target variable (y)
data_full_additional['y'] = data_full_additional['y'].map({'yes': 1, 'no': 0})

# Step 9: Add derived features from numerical data
# NOTE: these products are taken on the already-standardized columns.
data_full_additional['interaction_emp_cons'] = data_full_additional['emp.var.rate'] * data_full_additional['cons.price.idx']
data_full_additional['interaction_nr_employed_conf'] = data_full_additional['nr.employed'] * data_full_additional['cons.conf.idx']

# Step 10: Split features and target variable
X_full_additional = data_full_additional.drop(columns=['y'])
y_full_additional = data_full_additional['y']

# Step 11: Train-Test Split
# Materialize the split chosen before fitting the imputer and scaler.
X_train = X_full_additional.iloc[train_pos]
X_test = X_full_additional.iloc[test_pos]
y_train = y_full_additional.iloc[train_pos]
y_test = y_full_additional.iloc[test_pos]

# Save the entire preprocessed dataset
processed_data_path = '/content/drive/MyDrive/banking_system/pre_processed_data.csv'
data_full_additional.to_csv(processed_data_path, index=False)
print(f"Preprocessed dataset saved to {processed_data_path}")

# Step 12: Save processed data for model training if needed
X_train.to_csv('/content/drive/MyDrive/banking_system/processed_X_train.csv', index=False)
y_train.to_csv('/content/drive/MyDrive/banking_system/processed_y_train.csv', index=False)
X_test.to_csv('/content/drive/MyDrive/banking_system/processed_X_test.csv', index=False)
y_test.to_csv('/content/drive/MyDrive/banking_system/processed_y_test.csv', index=False)

# Display dataset shapes
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


Initial dataset shape: (41188, 21)
Shape after removing duplicates: (41176, 21)
Preprocessed dataset saved to /content/drive/MyDrive/banking_system/pre_processed_data.csv
Training data shape: (32940, 49)
Testing data shape: (8236, 49)
