<a href="https://colab.research.google.com/github/TharinsaMudalige/Simple_Classification_for_Bank_Marketing_Dataset/blob/main/Machine_Learning_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
file_path = '/content/drive/MyDrive/ML Coursework/bank-additional-full.csv'

# Loading the CSV file into a Pandas DataFrame
data = pd.read_csv(file_path, delimiter=';')
# Displaying the first few rows
print(data.head())

   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.conf.idx  euribor3m  nr.employed

In [4]:
print("Initial dataset shape:", data.shape)

Initial dataset shape: (41188, 21)


Data Preprocessing

1.   Handling "unknown values"

In [5]:
# Columns containing unknown values
columns = ['job', 'marital', 'education', 'default', 'housing', 'loan']

# Handling "unknown" values
for column in columns:
    unknown_count = data[column].value_counts().get('unknown', 0)
    print(f'"unknown" count in {column} column: {unknown_count}')

    if column in ['default', 'education']:  # Taking "unknown" as a separate category for these 2 columns
        data[column] = data[column].replace('unknown', 'unknown')
    else:  # Replacing "unknown" with the mode for other columns
        data[column] = data[column].replace('unknown', data[column].mode()[0])

"unknown" count in job column: 330
"unknown" count in marital column: 80
"unknown" count in education column: 1731
"unknown" count in default column: 8597
"unknown" count in housing column: 990
"unknown" count in loan column: 990


2. Handling the value of pdays

In [6]:
# Replacing 999 with -1 as "not contacted"
data['pdays'] = data['pdays'].replace(999, -1)

3. Handling duplicate rows

In [7]:
# Removing duplicate rows
data = data.drop_duplicates()

In [8]:
print("Dataset shape after remvoving duplicates:", data.shape)

Dataset shape after remvoving duplicates: (41176, 21)


4. Encoding categorical variables

In [None]:
file_path = '/content/drive/My Drive/ML Coursework/bank_additional_full_cleaned.csv'
data = pd.read_csv(file_path)

# Displaying initial information about the dataset
print("Dataset before encoding:")
print(data.info())

Dataset before encoding:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41176 entries, 0 to 41175
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41176 non-null  int64  
 1   job             41176 non-null  object 
 2   marital         41176 non-null  object 
 3   education       41176 non-null  object 
 4   default         41176 non-null  object 
 5   housing         41176 non-null  object 
 6   loan            41176 non-null  object 
 7   contact         41176 non-null  object 
 8   month           41176 non-null  object 
 9   day_of_week     41176 non-null  object 
 10  duration        41176 non-null  int64  
 11  campaign        41176 non-null  int64  
 12  pdays           41176 non-null  int64  
 13  previous        41176 non-null  int64  
 14  poutcome        41176 non-null  object 
 15  emp.var.rate    41176 non-null  float64
 16  cons.price.idx  41176 non-null  float64
 17  cons.c

In [None]:
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan',
                       'contact', 'month', 'day_of_week', 'poutcome']

# Performing one-hot encoding for all categorical variables/features
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Displaying information after encoding
print("\nDataset after encoding:")
print(data_encoded.info())


Dataset after encoding:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41176 entries, 0 to 41175
Data columns (total 50 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            41176 non-null  int64  
 1   duration                       41176 non-null  int64  
 2   campaign                       41176 non-null  int64  
 3   pdays                          41176 non-null  int64  
 4   previous                       41176 non-null  int64  
 5   emp.var.rate                   41176 non-null  float64
 6   cons.price.idx                 41176 non-null  float64
 7   cons.conf.idx                  41176 non-null  float64
 8   euribor3m                      41176 non-null  float64
 9   nr.employed                    41176 non-null  float64
 10  y                              41176 non-null  object 
 11  job_blue-collar                41176 non-null  bool   
 12  job_entrepreneur     

In [None]:
# Saving the encoded dataset
data_encoded.to_csv('/content/drive/My Drive/ML Coursework/bank_additional_full_encoded.csv', index=False)

5. Standardization (Z-Score Normalization)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
file_path = '/content/drive/My Drive/ML Coursework/bank_additional_full_encoded.csv'
data = pd.read_csv(file_path)

# Separating the features and the target variable
X = data.drop('y', axis=1)  # Dropping the target variable
y = data['y']  # Extracting the target variable


numerical_columns = ['age', 'campaign', 'pdays', 'previous',
                     'emp.var.rate', 'cons.price.idx',
                     'cons.conf.idx', 'euribor3m', 'nr.employed']

# Initializing the StandardScaler to standardize features to have a mean of 0 and standard deviation of 1
scaler = StandardScaler()

# Creating a copy of the dataset
X_standardized = X.copy()
X_standardized[numerical_columns] = scaler.fit_transform(X[numerical_columns])

# Adding the target variable back
X_standardized['y'] = y

In [None]:
# Saving the dataset
X_standardized.to_csv('/content/drive/My Drive/ML Coursework/bank_additional_full_standardized.csv', index=False)

6. Mapping yes and no as 0 and 1

In [None]:
file_path = '/content/drive/My Drive/ML Coursework/bank_additional_full_standardized.csv'
data = pd.read_csv(file_path)

# Binary map the target variable
data['y'] = data['y'].map({'yes': 1, 'no': 0})  # Convert 'yes' to 1 and 'no' to 0

In [None]:
# Saving the dataset
data.to_csv('/content/drive/My Drive/ML Coursework/bank_additional_full_binary_mapped.csv', index=False)

7. Removing the "duration" feature

In [None]:
file_path = '/content/drive/My Drive/ML Coursework/bank_additional_full_binary_mapped.csv'
data = pd.read_csv(file_path)

# Removing the 'duration' feature as it directly correlates with the target variable
data = data.drop('duration', axis=1)

In [None]:
# Saving the dataset
data.to_csv('/content/drive/My Drive/ML Coursework/bank_additional_full_final.csv', index=False)

8. Handling class imbalance

In [None]:
# Checking whether imbalance exists

file_path = '/content/drive/My Drive/ML Coursework/bank_additional_full_final.csv'
data = pd.read_csv(file_path)

# Checking the class distribution for the target variable 'y'
class_distribution = data['y'].value_counts()
print("Class Distribution:")
print(class_distribution)

# Calculating the percentage of imbalance
percentage_no = (class_distribution[0] / len(data)) * 100
percentage_yes = (class_distribution[1] / len(data)) * 100
print(f"\nPercentage of 'no': {percentage_no:.2f}%")
print(f"Percentage of 'yes': {percentage_yes:.2f}%")

Class Distribution:
y
0    36537
1     4639
Name: count, dtype: int64

Percentage of 'no': 88.73%
Percentage of 'yes': 11.27%


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from collections import Counter

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd

# Load the dataset
file_path = '/content/drive/My Drive/ML Coursework/bank_additional_full_final.csv'
data = pd.read_csv(file_path)

# Separate features (X) and target (y)
X = data.drop('y', axis=1)
y = data['y']

# Apply SMOTE to the entire dataset
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Check the class distribution after SMOTE
print("\nClass distribution after SMOTE:")
print(Counter(y_smote))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, stratify=y_smote, random_state=42)

# Confirm the class distribution in the training set
print("\nClass distribution in the training set after splitting:")
print(Counter(y_train))
print(Counter(y_test))





Class distribution after SMOTE:
Counter({0: 36537, 1: 36537})

Class distribution in the training set after splitting:
Counter({1: 29230, 0: 29229})
Counter({0: 7308, 1: 7307})


In [None]:
file_path = '/content/drive/My Drive/ML Coursework/bank_additional_full_final.csv'
data = pd.read_csv(file_path)

X = data.drop('y', axis=1)
y = data['y']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Applying SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Checking class distribution after applying SMOTE
print("\nClass distribution after handling imbalance:")
print(Counter(y_train_smote))




Class distribution after handling imbalance:
Counter({0: 29229, 1: 29229})


Building the neural network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Building the neural network
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(512, activation='relu'),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [None]:
# Compiling the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Training the model
history = model.fit(X_train, y_train,
                    validation_split=0.2,  # 20% of training data as validation set
                    epochs=50,
                    batch_size=32,
                    verbose=1)

Epoch 1/50
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 9ms/step - accuracy: 0.7368 - loss: 0.5268 - val_accuracy: 0.7919 - val_loss: 0.4458
Epoch 2/50
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 8ms/step - accuracy: 0.7945 - loss: 0.4404 - val_accuracy: 0.8130 - val_loss: 0.4014
Epoch 3/50
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - accuracy: 0.8077 - loss: 0.4103 - val_accuracy: 0.8228 - val_loss: 0.3837
Epoch 4/50
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - accuracy: 0.8137 - loss: 0.3963 - val_accuracy: 0.8237 - val_loss: 0.3847
Epoch 5/50
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 8ms/step - accuracy: 0.8250 - loss: 0.3776 - val_accuracy: 0.8352 - val_loss: 0.3689
Epoch 6/50
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - accuracy: 0.8354 - loss: 0.3581 - val_accuracy: 0.8400 - val_loss: 0.3527
Epoch 7/50

In [None]:
# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)

# Print the testing accuracy
print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")


[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8932 - loss: 0.2759
Testing Accuracy: 89.21%


In [None]:
# Adjust the threshold
y_pred_prob = model.predict(X_test)
y_pred_adjusted = (y_pred_prob > 0.5).astype(int)  # Adjust threshold to 0.3

# Recalculate metrics
from sklearn.metrics import classification_report, confusion_matrix
print("\nClassification Report (Adjusted Threshold):")
print(classification_report(y_test, y_pred_adjusted))

print("\nConfusion Matrix (Adjusted Threshold):")
print(confusion_matrix(y_test, y_pred_adjusted))


[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

Classification Report (Adjusted Threshold):
              precision    recall  f1-score   support

           0       0.90      0.89      0.89      7308
           1       0.89      0.90      0.89      7307

    accuracy                           0.89     14615
   macro avg       0.89      0.89      0.89     14615
weighted avg       0.89      0.89      0.89     14615


Confusion Matrix (Adjusted Threshold):
[[6487  821]
 [ 756 6551]]
