In [1]:
import pandas as pd
import os

# Step 1: Location of the raw, semicolon-delimited source file
file_path = '/content/bank-additional-full.csv'  # Replace with your actual file path

# Step 2: Load, clean and re-save the dataset; any failure is reported, not raised
try:
    # Parser settings: ';' separated fields, first row is the header,
    # malformed lines are skipped, '"' is the quoting character.
    read_options = {
        'sep': ';',
        'header': 0,
        'on_bad_lines': 'skip',
        'quotechar': '"',
        'engine': 'python',
    }
    df = pd.read_csv(file_path, **read_options)

    # Step 3: Trim leading/trailing whitespace from every column name
    df.columns = [name.strip() for name in df.columns]

    # Step 4: In every text (object) column, drop stray double quotes and
    # trim surrounding whitespace from the values
    text_columns = df.select_dtypes(include=['object']).columns
    for text_col in text_columns:
        unquoted = df[text_col].str.replace('"', '', regex=False)
        df[text_col] = unquoted.str.strip()

    # Step 5: Write the cleaned data as a comma-separated file into the
    # current working directory (later cells read it back from there)
    output_file = os.path.join(os.getcwd(), 'bank_additional_full_dataset.csv')
    df.to_csv(output_file, index=False)

    print(f"Cleaned dataset saved as: {output_file}")

except Exception as err:
    print(f"An error occurred: {err}")


Cleaned dataset saved as: /content/bank_additional_full_dataset.csv


In [2]:
# Step 1: Revert to Compatible Versions of Libraries
!pip install numpy==1.26.4 pandas==2.2.2 tensorflow==2.17.1 scikit-learn matplotlib seaborn

# Step 2: Verify Installed Versions
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib

print(f"Numpy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"TensorFlow version: {tf.__version__}")
print(f"Matplotlib version: {matplotlib.__version__}")


# Step 3: Suppress Future Warnings
import warnings
warnings.filterwarnings("ignore")



Numpy version: 1.26.4
Pandas version: 2.2.2
TensorFlow version: 2.17.1
Matplotlib version: 3.8.0


## Inspect the numerical features

In [3]:
# Load the cleaned dataset produced by the first cell. That cell saves the file
# into the current working directory, so a relative path finds it wherever the
# notebook runs (a hard-coded '/content/...' path only matches when the working
# directory happens to be /content).
df = pd.read_csv('bank_additional_full_dataset.csv')

# Check data types
print("Data Types:\n", df.dtypes)

# Extract numerical columns (int64 and float64 dtypes only)
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
print("\nNumerical Features:\n", numerical_features)

# Summary statistics for numerical features
print("\nSummary Statistics for Numerical Features:\n", df[numerical_features].describe())

Data Types:
 age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

Numerical Features:
 Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')

Summary Statistics for Numerical Features:
                age      duration      campaign         pdays      previous  \
count  41188.00000  41188.000000  41188.000000  41188.000000  41188.000000   
mean      40.02406    258.285010    

# Inspect the categorical features

In [4]:
# Collect the names of all object/category typed columns
categorical_features = df.select_dtypes(include=['object', 'category']).columns
print("\nCategorical Features:\n", categorical_features)


# For each categorical column report its cardinality and its most common values
for feature in categorical_features:
    column = df[feature]
    top_counts = column.value_counts().head()  # five most frequent categories
    print(f"\nFeature: {feature}")
    print(f"Unique Values: {column.nunique()}")
    print(f"Most Frequent Categories:\n{top_counts}")


Categorical Features:
 Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome', 'y'],
      dtype='object')

Feature: job
Unique Values: 12
Most Frequent Categories:
job
admin.         10422
blue-collar     9254
technician      6743
services        3969
management      2924
Name: count, dtype: int64

Feature: marital
Unique Values: 4
Most Frequent Categories:
marital
married     24928
single      11568
divorced     4612
unknown        80
Name: count, dtype: int64

Feature: education
Unique Values: 8
Most Frequent Categories:
education
university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
Name: count, dtype: int64

Feature: default
Unique Values: 3
Most Frequent Categories:
default
no         32588
unknown     8597
yes            3
Name: count, dtype: int64

Feature: housing
Unique Values: 3
Most Frequent Categories:
housing
yes    

# Check Missing Values

In [5]:
import pandas as pd

# Step 1: Load the dataset
# The first cell writes this file into the current working directory, so it is
# read back with a relative path instead of a hard-coded '/content/...' location.
file_path = "bank_additional_full_dataset.csv"  # Update this with the path to your CSV file
df = pd.read_csv(file_path)

# Step 2: Check for missing values (count of nulls per column)
missing_values = df.isnull().sum()
print("\nMissing Values:\n", missing_values)


Missing Values:
 age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64


# Detect outliers in the numerical features

In [6]:
from scipy.stats import zscore

# Standardise every numerical column independently
z_scores = df[numerical_features].apply(zscore)

# Cut-off on the absolute Z-score (3 is a common choice)
threshold = 3

# Boolean frame: True where a value lies more than `threshold` deviations from its column mean
outliers = z_scores.abs().gt(threshold)

# Show, feature by feature, the rows flagged as outliers for that feature
for feature in numerical_features:
    flagged_rows = df.loc[outliers[feature]]
    print(f"\nOutliers for feature '{feature}':")
    print(flagged_rows)



Outliers for feature 'age':
       age      job   marital            education  default housing loan  \
27757   76  retired   married    university.degree       no      no  yes   
27780   73  retired   married    university.degree       no     yes   no   
27800   88  retired  divorced             basic.4y       no     yes   no   
27802   88  retired  divorced             basic.4y       no      no   no   
27805   88  retired  divorced             basic.4y       no     yes  yes   
...    ...      ...       ...                  ...      ...     ...  ...   
40986   84  retired  divorced             basic.4y  unknown     yes  yes   
40996   81  retired   married             basic.4y       no     yes   no   
41004   80  retired   married  professional.course       no     yes   no   
41183   73  retired   married  professional.course       no     yes   no   
41187   74  retired   married  professional.course       no     yes   no   

        contact month day_of_week  ...  campaign  pdays  p

# Remove outliers from the numerical features

In [7]:
import pandas as pd
from scipy.stats import zscore

# Assuming `df` is your dataframe and `numerical_features` contains the numerical columns.

# Function to remove outliers based on Z-scores
def remove_outliers(df, numerical_features, z_threshold=3):
    """Drop rows in which any numerical feature is a Z-score outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    numerical_features : list-like of column labels
        Columns of ``df`` to standardise and test.
    z_threshold : float, default 3
        A value is an outlier when the absolute value of its Z-score is
        strictly greater than this threshold.

    Returns
    -------
    df_cleaned : pandas.DataFrame
        Independent copy of ``df`` without the rows flagged in any feature.
    outliers_z : pandas.DataFrame
        Boolean frame (same index as ``df``, one column per feature) that is
        True where a value was flagged.
    """
    numeric = df[numerical_features]

    # Population Z-score (ddof=0, the same convention as scipy.stats.zscore).
    # pandas reductions skip NaN, so a missing value no longer turns the whole
    # column's Z-scores into NaN (which silently disabled detection for it).
    z_scores = (numeric - numeric.mean()) / numeric.std(ddof=0)

    # NaN Z-scores (missing values, constant columns) compare as False here,
    # so such cells are never treated as outliers.
    outliers_z = z_scores.abs() > z_threshold

    # Keep rows with no flagged feature; copy so that later edits to the
    # result cannot touch, or warn about, the original frame.
    df_cleaned = df[~outliers_z.any(axis=1)].copy()

    return df_cleaned, outliers_z

# Names of all float64/int64 columns, as a plain Python list
numerical_features = list(df.select_dtypes(include=['float64', 'int64']).columns)

# Apply the Z-score filter; also keep the boolean mask of flagged values
df_cleaned, outliers_z = remove_outliers(df, numerical_features)

# Report how many rows the filter dropped
print(f"Original DataFrame shape: {df.shape}")
print(f"Cleaned DataFrame shape after Z-score filtering: {df_cleaned.shape}")

# Persist the filtered data (relative path, i.e. the current working directory)
output_file = "outlier_remove_zscore.csv"
df_cleaned.to_csv(output_file, index=False)

# Preview the numerical columns of the filtered data
numeric_preview = df_cleaned[numerical_features].head()
print(numeric_preview)

# Tell the reader where the file went
print(f"Cleaned data after Z-score filtering saved as: {output_file}")

# Per-feature count of flagged values
outliers_count = outliers_z.sum()
print("Number of outliers detected in each feature:", outliers_count, sep="\n")


Original DataFrame shape: (41188, 21)
Cleaned DataFrame shape after Z-score filtering: (37314, 21)
   age  duration  campaign  pdays  previous  emp.var.rate  cons.price.idx  \
0   56       261         1    999         0           1.1          93.994   
1   57       149         1    999         0           1.1          93.994   
2   37       226         1    999         0           1.1          93.994   
3   40       151         1    999         0           1.1          93.994   
4   56       307         1    999         0           1.1          93.994   

   cons.conf.idx  euribor3m  nr.employed  
0          -36.4      4.857       5191.0  
1          -36.4      4.857       5191.0  
2          -36.4      4.857       5191.0  
3          -36.4      4.857       5191.0  
4          -36.4      4.857       5191.0  
Cleaned data after Z-score filtering saved as: outlier_remove_zscore.csv
Number of outliers detected in each feature:
age                369
duration           861
campaign        

# Handle the imbalanced dataset and apply one-hot encoding

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from collections import Counter

# Step 1: Load the cleaned dataset (after outlier removal)
# The outlier-removal cell writes this file with a relative path, so it is read
# back the same way instead of assuming the working directory is /content.
df_cleaned = pd.read_csv("outlier_remove_zscore.csv")

# Step 2: Check the class distribution before handling class imbalance
print("Class Distribution Before Handling:\n", df_cleaned['y'].value_counts())

# Step 3: Turn +/-inf into NaN, then drop every row that contains a NaN
df_cleaned = df_cleaned.replace([np.inf, -np.inf], np.nan).dropna()

# Step 4: Convert True/False values to 1/0 (for the entire DataFrame)
# DataFrame.map is the replacement for the deprecated DataFrame.applymap
# (requires pandas >= 2.1; this notebook pins pandas 2.2.2).
df_cleaned = df_cleaned.map(lambda x: int(x) if isinstance(x, bool) else x)

# Step 5: One-Hot Encode the categorical columns; drop_first removes the first
# level of each, so the yes/no target becomes the single column 'y_yes'
df_cleaned = pd.get_dummies(df_cleaned, drop_first=True)

# Step 6: Separate features (X) and target (y); the target is cast to 0/1 integers
X = df_cleaned.drop(columns=['y_yes'])
y = df_cleaned['y_yes'].astype(int)

# Option: Rename the target column back to 'y' if you prefer to keep this name
df_cleaned = df_cleaned.rename(columns={'y_yes': 'y'})

# Step 7: Apply SMOTE to balance the dataset (oversample the minority class)
# NOTE(review): SMOTE is fitted on the whole dataset here. If a train/test split
# is made later, resampling before the split lets synthetic points derived from
# test rows reach training - confirm this is intended.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Step 8: Check the class distribution after SMOTE
print("Class Distribution After SMOTE:\n", Counter(y_smote))

# Step 9: Save the balanced dataset (features plus target column 'y') as a CSV
balanced_dataset = pd.concat([pd.DataFrame(X_smote, columns=X.columns), pd.Series(y_smote, name='y')], axis=1)
output_path = "processed_Dataset.csv"
balanced_dataset.to_csv(output_path, index=False)
print(f"Balanced dataset saved to {output_path}")


Class Distribution Before Handling:
 y
no     34346
yes     2968
Name: count, dtype: int64
Class Distribution After SMOTE:
 Counter({0: 34346, 1: 34346})
Balanced dataset saved to processed_Dataset.csv


# Preprocessed Dataset

In [9]:
# Names of every bool-typed column in the frame
bool_columns = df_cleaned.select_dtypes(include='bool').columns

# Store those columns as integers (True -> 1, False -> 0)
bool_as_int = df_cleaned[bool_columns].astype(int)
df_cleaned[bool_columns] = bool_as_int

# Show the resulting dtypes to confirm the conversion
print(df_cleaned[bool_columns].dtypes)

# Write the processed frame to CSV without the index column
output_path = "Final_Processed_Data.csv"
df_cleaned.to_csv(output_path, index=False)

print(f"Processed dataset saved to {output_path}")


job_blue-collar                  int64
job_entrepreneur                 int64
job_housemaid                    int64
job_management                   int64
job_retired                      int64
job_self-employed                int64
job_services                     int64
job_student                      int64
job_technician                   int64
job_unemployed                   int64
job_unknown                      int64
marital_married                  int64
marital_single                   int64
marital_unknown                  int64
education_basic.6y               int64
education_basic.9y               int64
education_high.school            int64
education_illiterate             int64
education_professional.course    int64
education_university.degree      int64
education_unknown                int64
default_unknown                  int64
default_yes                      int64
housing_unknown                  int64
housing_yes                      int64
loan_unknown             