<a href="https://colab.research.google.com/github/ShabnaIlmi/Bank-Marketing-Random-Forest-and-Neural-Networks/blob/main/Coursework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [99]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer


In [100]:
# Reading the bank marketing CSV; its fields are separated by semicolons, not commas
data_full_additional = pd.read_csv(
    '/content/drive/MyDrive/Bank-Marketing-Random-Forest-and-Neural-Networks/bank-additional-full.csv',
    sep=';',
)


In [101]:
# Previewing the first five rows of the loaded dataset
data_full_additional.head(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


# **Data Cleaning**

Removing leading and trailing whitespace from the object-type (string) columns

In [102]:
# Trimming leading/trailing whitespace from every object-dtype column, one column at a time
object_columns = data_full_additional.select_dtypes(include='object').columns
for text_col in object_columns:
    data_full_additional[text_col] = data_full_additional[text_col].str.strip()

## **Removing Duplicates**

In [103]:
# Dropping fully duplicated rows and reporting the shape before and after
initial_shape = data_full_additional.shape
print(f"Initial dataset shape: {initial_shape}")

data_full_additional = data_full_additional.drop_duplicates()

deduplicated_shape = data_full_additional.shape
print(f"Shape after removing duplicates: {deduplicated_shape}")

Initial dataset shape: (41188, 21)
Shape after removing duplicates: (41176, 21)


In [104]:
# Display dataset information
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
print("Dataset Information:")
data_full_additional.info()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
Index: 41176 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41176 non-null  int64  
 1   job             41176 non-null  object 
 2   marital         41176 non-null  object 
 3   education       41176 non-null  object 
 4   default         41176 non-null  object 
 5   housing         41176 non-null  object 
 6   loan            41176 non-null  object 
 7   contact         41176 non-null  object 
 8   month           41176 non-null  object 
 9   day_of_week     41176 non-null  object 
 10  duration        41176 non-null  int64  
 11  campaign        41176 non-null  int64  
 12  pdays           41176 non-null  int64  
 13  previous        41176 non-null  int64  
 14  poutcome        41176 non-null  object 
 15  emp.var.rate    41176 non-null  float64
 16  cons.price.idx  41176 non-null  float64
 17  cons.conf.idx  

## **Handling Missing Data**

In [105]:
# Categorical feature columns that are inspected for the "unknown" placeholder value
categorical_cols = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'day_of_week',
    'poutcome',
]

In [106]:
# Counting 'unknown' placeholders per categorical column
# (element-wise comparison followed by a column-wise sum)
unknown_counts = (data_full_additional[categorical_cols] == 'unknown').sum()

# Keeping only the columns that actually contain 'unknown' values.
# The key is 'Count' with no leading space, so the summary column can be
# addressed by its natural name.
unknown_values = [
    {'Column': col, 'Count': count}
    for col, count in unknown_counts.items()
    if count > 0
]

# Converting the list into a pandas DataFrame; the explicit column names keep the
# frame well-formed (with a 'Column' column) even when no 'unknown' values are found
unknown_values_df = pd.DataFrame(unknown_values, columns=['Column', 'Count'])

# Displaying the DataFrame
print("\nColumns with 'unknown' values and their counts:")
print(unknown_values_df)


Columns with 'unknown' values and their counts:
      Column   Count
0        job     330
1    marital      80
2  education    1730
3    default    8596
4    housing     990
5       loan     990


The value "unknown" represents missing data in this dataset. Before converting these values to NaN, binary flag columns are added to record which entries were originally "unknown".

In [134]:
# Adding binary flag columns to indicate "unknown" values
# The exact-match mask is computed once and reused, so the decision to create a flag
# and the flag's contents rely on the same test. A substring check via str.contains
# could create an all-zero flag for values that merely contain "unknown".
for col in categorical_cols:
    unknown_mask = data_full_additional[col].eq('unknown')
    if unknown_mask.any():
        data_full_additional[f'is_unknown_{col}'] = unknown_mask.astype(int)

In [135]:
# Turning the "unknown" placeholder into NaN so it is treated as a missing value during imputation
data_full_additional[categorical_cols] = data_full_additional[categorical_cols].replace(
    to_replace='unknown',
    value=np.nan,
)

Filling the missing values in the marital, housing and loan columns with the mode of each column (since these columns contain comparatively few missing values)

In [138]:
# Columns whose missing entries are filled with the column's most frequent value
columns_to_handle_with_mode = ['marital', 'housing', 'loan']

# Impute each listed column with its own mode (first value returned by Series.mode())
for col in columns_to_handle_with_mode:
    column_values = data_full_additional[col]
    mode_value = column_values.mode()[0]
    data_full_additional[col] = column_values.fillna(mode_value)

In [139]:
# Verifying the changes and listing the columns that still contain missing values
# DataFrame.get falls back to an empty list when the summary has no 'Column' column
# (i.e. no 'unknown' values were found), so the loop is simply skipped in that case.
for col in unknown_values_df.get('Column', []):
    remaining_nan_count = data_full_additional[col].isnull().sum()
    # Report any column with at least one NaN left (a threshold of "> 1" would
    # silently skip a column that has exactly one missing value)
    if remaining_nan_count > 0:
        print(f"{col} : {remaining_nan_count}")

In [None]:
# Advanced encoding for categorical variables
# Integer codes assigned to each categorical value

# Jobs are numbered 1..11 in the listed order; 'unknown' is coded 0
job_mapping = {
    job: code
    for code, job in enumerate(
        [
            'admin.', 'blue-collar', 'entrepreneur', 'housemaid',
            'management', 'retired', 'self-employed', 'services',
            'student', 'technician', 'unemployed',
        ],
        start=1,
    )
}
job_mapping['unknown'] = 0

marital_mapping = dict(married=1, single=2, divorced=3, unknown=0)

# Education levels are numbered 1..7 in the listed order; 'unknown' is coded 0
education_mapping = {
    level: code
    for code, level in enumerate(
        [
            'basic.4y', 'basic.6y', 'basic.9y', 'high.school',
            'illiterate', 'professional.course', 'university.degree',
        ],
        start=1,
    )
}
education_mapping['unknown'] = 0

# Binary yes/no features
default_mapping = dict(no=0, yes=1)
housing_mapping = dict(no=0, yes=1)
loan_mapping = dict(no=0, yes=1)

contact_mapping = dict(cellular=1, telephone=2)

# Months are coded with their calendar number (jan=1 ... dec=12)
month_mapping = {
    month: number
    for number, month in enumerate(
        ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
         'jul', 'aug', 'sep', 'oct', 'nov', 'dec'],
        start=1,
    )
}

# Weekdays are coded mon=1 ... fri=5
day_of_week_mapping = {
    day: number
    for number, day in enumerate(['mon', 'tue', 'wed', 'thu', 'fri'], start=1)
}

poutcome_mapping = dict(failure=0, success=1, nonexistent=2)