<a href="https://colab.research.google.com/github/TharinsaMudalige/Simple_Classification_for_Bank_Marketing_Dataset/blob/main/Machine_Learning_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
# Location of the raw bank-marketing CSV on the mounted Drive
file_path = '/content/drive/MyDrive/ML Coursework/bank-additional-full.csv'

# The file uses ';' as its field separator, so it has to be passed explicitly
data = pd.read_csv(file_path, sep=';')

# Preview the first rows to confirm the columns were parsed as expected
print(data.head())

   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.conf.idx  euribor3m  nr.employed

In [4]:
# Report (rows, columns) of the freshly loaded data as a baseline for the
# preprocessing steps that follow.
print("Initial dataset shape:", data.shape)

Initial dataset shape: (41188, 21)


Data Preprocessing

1. Handling "unknown" values

In [5]:
# Categorical columns in which missing information is recorded as the literal
# string "unknown"
columns = ['job', 'marital', 'education', 'default', 'housing', 'loan']

# For these columns "unknown" is kept as a category of its own (no imputation)
keep_unknown_as_category = {'default', 'education'}

for column in columns:
    is_unknown = data[column] == 'unknown'
    unknown_count = int(is_unknown.sum())
    print(f'"unknown" count in {column} column: {unknown_count}')

    if column in keep_unknown_as_category:
        # Nothing to rewrite: the placeholder itself is the category label
        continue

    # Take the mode over the *known* values only, so that "unknown" can never
    # be selected as its own replacement when it is the most frequent value
    known_values = data.loc[~is_unknown, column]
    if known_values.empty:
        # Every value is "unknown"; there is nothing sensible to impute with
        continue

    data[column] = data[column].replace('unknown', known_values.mode()[0])

"unknown" count in job column: 330
"unknown" count in marital column: 80
"unknown" count in education column: 1731
"unknown" count in default column: 8597
"unknown" count in housing column: 990
"unknown" count in loan column: 990


2. Handling the value of pdays

In [6]:
# In 'pdays' the value 999 is a placeholder meaning "not contacted";
# recode it to -1
data['pdays'] = data['pdays'].replace({999: -1})

3. Handling duplicate rows

In [7]:
# Drop rows that are exact copies of an earlier row, keeping the first
# occurrence of each; the original index labels are retained.
data = data.drop_duplicates(keep='first')

In [8]:
# Report the shape again to show how many rows the de-duplication removed
print("Dataset shape after removing duplicates:", data.shape)

Dataset shape after remvoving duplicates: (41176, 21)


4. Encoding categorical variables

In [9]:
# Displaying initial information about the dataset
print("Dataset before encoding:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()

Dataset before encoding:
<class 'pandas.core.frame.DataFrame'>
Index: 41176 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41176 non-null  int64  
 1   job             41176 non-null  object 
 2   marital         41176 non-null  object 
 3   education       41176 non-null  object 
 4   default         41176 non-null  object 
 5   housing         41176 non-null  object 
 6   loan            41176 non-null  object 
 7   contact         41176 non-null  object 
 8   month           41176 non-null  object 
 9   day_of_week     41176 non-null  object 
 10  duration        41176 non-null  int64  
 11  campaign        41176 non-null  int64  
 12  pdays           41176 non-null  int64  
 13  previous        41176 non-null  int64  
 14  poutcome        41176 non-null  object 
 15  emp.var.rate    41176 non-null  float64
 16  cons.price.idx  41176 non-null  float64
 17  cons.conf.i

In [10]:
# Categorical features to encode. The target 'y' is not listed here, so it is
# passed through unchanged.
columns = ['job', 'marital', 'education', 'default', 'housing', 'loan',
           'contact', 'month', 'day_of_week', 'poutcome']

# One-hot encode every listed feature. drop_first=True omits the first level of
# each feature, so a feature with k categories yields k-1 indicator columns.
data_encoded = pd.get_dummies(data, columns=columns, drop_first=True)

# Displaying information after encoding
print("\nDataset after encoding:")
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the trailing "None" that print(data_encoded.info()) would emit.
data_encoded.info()


Dataset after encoding:
<class 'pandas.core.frame.DataFrame'>
Index: 41176 entries, 0 to 41187
Data columns (total 50 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            41176 non-null  int64  
 1   duration                       41176 non-null  int64  
 2   campaign                       41176 non-null  int64  
 3   pdays                          41176 non-null  int64  
 4   previous                       41176 non-null  int64  
 5   emp.var.rate                   41176 non-null  float64
 6   cons.price.idx                 41176 non-null  float64
 7   cons.conf.idx                  41176 non-null  float64
 8   euribor3m                      41176 non-null  float64
 9   nr.employed                    41176 non-null  float64
 10  y                              41176 non-null  object 
 11  job_blue-collar                41176 non-null  bool   
 12  job_entrepreneur          

5. Standardization (Z-Score Normalization)

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Columns to be z-scored; every column not listed here is left as it is
numerical_columns = [
    'age', 'campaign', 'pdays', 'previous',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
    'euribor3m', 'nr.employed',
]

# Work on a copy so that data_encoded keeps the unscaled values
X_standardized = data_encoded.copy()

# StandardScaler rescales each column to mean 0 and standard deviation 1.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in this notebook, so test rows contribute to the
# scaling statistics. Consider fitting on the training portion only.
scaler = StandardScaler()
X_standardized[numerical_columns] = scaler.fit_transform(X_standardized[numerical_columns])

6. Mapping yes and no to 1 and 0

In [13]:
# Encode the target as integers: 'yes' -> 1, 'no' -> 0.
# Series.map yields NaN for values that are not keys of the mapping, so
# re-running this cell on already-mapped data would blank the column.
target_mapping = {'yes': 1, 'no': 0}
X_standardized['y'] = X_standardized['y'].map(target_mapping)

# Show the distinct values now present in the target column
print("After mapping:", X_standardized['y'].unique())

After mapping: [0 1]


7. Removing the "duration" feature

In [14]:
# Exclude 'duration' from the feature set.
# NOTE(review): presumably dropped because call duration is only known once the
# call has ended — confirm the intended rationale and state it in the markdown.
X_standardized = X_standardized.drop('duration', axis=1)

# The column count should now be one lower than before
print("Dataset shape after removing 'duration':", X_standardized.shape)

Dataset shape after removing 'duration': (41176, 49)


8. Handling class imbalance

In [15]:
# Inspect how the two target classes are distributed

# Absolute number of rows per class (0 = 'no', 1 = 'yes')
class_distribution = X_standardized['y'].value_counts()
print("Class Distribution:")
print(class_distribution)

# Express each class as a share of all rows; .loc looks the count up by class label
total_rows = len(X_standardized)
percentage_of_no = class_distribution.loc[0] / total_rows * 100
percentage_of_yes = class_distribution.loc[1] / total_rows * 100
print(f"\nPercentage of 'no': {percentage_of_no:.2f}%")
print(f"Percentage of 'yes': {percentage_of_yes:.2f}%")

Class Distribution:
y
0    36537
1     4639
Name: count, dtype: int64

Percentage of 'no': 88.73%
Percentage of 'yes': 11.27%


In [16]:
from imblearn.over_sampling import SMOTE
from collections import Counter

In [17]:
# Split the frame into the target column and the remaining feature columns
y = X_standardized['y']
X = X_standardized.drop(columns=['y'])

# Oversample with SMOTE; random_state fixes the seed so the resampling is
# repeatable.
# NOTE(review): resampling is applied to the whole dataset, before the
# train/test split done later in this notebook, so the test set will contain
# synthetic rows. Consider splitting first and resampling only the training part.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Per-class row counts after resampling
print("\nClass distribution after SMOTE:")
print(Counter(y_smote))




Class distribution after SMOTE:
Counter({0: 36537, 1: 36537})


In [18]:
# Re-attach the resampled target to the resampled features as column 'y'
# (assign returns a new frame, leaving X_smote itself unchanged)
smote_balanced_data = X_smote.assign(y=y_smote)

# Persist the preprocessed dataset without the index column.
# NOTE(review): this path is spelled 'My Drive' while the raw CSV is read from
# 'MyDrive' — confirm both resolve to the same Drive folder. This also rebinds
# file_path, which previously pointed at the raw CSV.
file_path = '/content/drive/My Drive/ML Coursework/bank_additional_full_preprocessed.csv'
smote_balanced_data.to_csv(file_path, index=False)

9. Splitting the dataset

In [19]:
from sklearn.model_selection import train_test_split

In [22]:
# Read back the preprocessed, SMOTE-balanced dataset saved earlier
file_path = '/content/drive/My Drive/ML Coursework/bank_additional_full_preprocessed.csv'
data_smote = pd.read_csv(file_path)

# Target column versus all remaining feature columns
y = data_smote['y']
X = data_smote.drop(columns=['y'])

# 80/20 split, stratified on the target so both parts keep the same class
# ratio; random_state makes the split repeatable.
# NOTE(review): the data was oversampled before this split, so the test part
# includes synthetic rows and is class-balanced, unlike the original data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Per-class row counts in each part
for heading, target in (("Training set:", y_train), ("Testing set:", y_test)):
    print(f"\n{heading}")
    print(Counter(target))


Training set:
Counter({1: 29230, 0: 29229})

Testing set:
Counter({0: 7308, 1: 7307})


Building the neural network

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input

In [None]:
# Feed-forward binary classifier: three ReLU blocks (each followed by 30%
# dropout), one further ReLU layer, and a single sigmoid output unit.
n_features = X_train.shape[1]  # one input per feature column

model = Sequential()
model.add(Input(shape=(n_features,)))
for units in (512, 256, 128):
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Configure training: binary cross-entropy loss (matching the single sigmoid
# output), the Adam optimizer with its default settings, and accuracy as the
# reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Fit the network on the training split; 20% of it is held out for the
# per-epoch validation metrics and the returned History is kept in `history`.
# NOTE(review): the saved output under this cell shows "Epoch 1/50", which does
# not match epochs=100 here — re-run the cell or reconcile the setting.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=100,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/50
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 9ms/step - accuracy: 0.7368 - loss: 0.5268 - val_accuracy: 0.7919 - val_loss: 0.4458
Epoch 2/50
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 8ms/step - accuracy: 0.7945 - loss: 0.4404 - val_accuracy: 0.8130 - val_loss: 0.4014
Epoch 3/50
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - accuracy: 0.8077 - loss: 0.4103 - val_accuracy: 0.8228 - val_loss: 0.3837
Epoch 4/50
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - accuracy: 0.8137 - loss: 0.3963 - val_accuracy: 0.8237 - val_loss: 0.3847
Epoch 5/50
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 8ms/step - accuracy: 0.8250 - loss: 0.3776 - val_accuracy: 0.8352 - val_loss: 0.3689
Epoch 6/50
[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - accuracy: 0.8354 - loss: 0.3581 - val_accuracy: 0.8400 - val_loss: 0.3527
Epoch 7/50

Evaluating the neural network model

In [None]:
# Score the trained model on the held-out split; evaluate() returns the loss
# followed by the compiled metric (accuracy).
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)

# Accuracy is reported as a percentage, loss to four decimals
accuracy_pct = test_accuracy * 100
print(f"\nTest Accuracy: {accuracy_pct:.2f}%")
print(f"Test Loss: {test_loss:.4f}")