# Load Data

In [102]:
import pandas as pd
RS = 42
data = pd.read_csv('./bankDataset/bank-full.csv', sep=';')

# Explore Data

In [103]:
print(data.head())
print(data.tail())

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  
       age           job   marital  education default  balance housing loan  \
45206   51    technician   marri

In [104]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB
None


In [105]:
print(data.nunique())

age            77
job            12
marital         3
education       4
default         2
balance      7168
housing         2
loan            2
contact         3
day            31
month          12
duration     1573
campaign       48
pdays         559
previous       41
poutcome        4
y               2
dtype: int64


In [106]:
print("Check for missing values:\n", data.isnull().sum())

Check for missing values:
 age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [107]:
duplicates = data[data.duplicated()]

if not duplicates.empty:
    print("Duplicates found in the dataset:")
    print(duplicates)
else:
    print("No duplicates found in the dataset.")

No duplicates found in the dataset.


# Data Preprocessing

In [108]:
poutcome_unknown_count = (data['poutcome'] == 'unknown').sum()
poutcome_other_count = (data['poutcome'] == 'other').sum()

poutcome_count = len(data['poutcome'])

percentage_unknown = poutcome_unknown_count / poutcome_count
percentage_other = poutcome_other_count / poutcome_count

print("Precentage of unknowns in poutcome", percentage_unknown)
print("Precentage of others in poutcome", percentage_other)

data = data.drop(columns=['poutcome'])

Precentage of unknowns in poutcome 0.8174780473778506
Precentage of others in poutcome 0.040698060206586895


In [109]:
unknown_perc_cols = (data == "unknown").mean() * 100
col_with_highest_perc_id = unknown_perc_cols.idxmax()
col_with_highest_perc_val = unknown_perc_cols.max()

print("Precentage of unknown per column:\n\n", unknown_perc_cols, '\n')
print("Column with the highest precentage of unknown:", col_with_highest_perc_id, 'value:', col_with_highest_perc_val,'\n')

unknown_perc_rows = (data == "unknown").mean(axis=1) * 100
row_with_highest_perc_id = unknown_perc_rows.idxmax()
row_with_highest_perc_val = unknown_perc_rows.max()

print("Precentage of unknown per row:", unknown_perc_rows, '\n')
print("Row with the highest precentage of unknown:", row_with_highest_perc_id, 'value:', row_with_highest_perc_val,'\n')

data = data.drop(row_with_highest_perc_id)

Precentage of unknown per column:

 age           0.000000
job           0.637013
marital       0.000000
education     4.107407
default       0.000000
balance       0.000000
housing       0.000000
loan          0.000000
contact      28.798301
day           0.000000
month         0.000000
duration      0.000000
campaign      0.000000
pdays         0.000000
previous      0.000000
y             0.000000
dtype: float64 

Column with the highest precentage of unknown: contact value: 28.798301298356595 

Precentage of unknown per row: 0         6.25
1         6.25
2         6.25
3        12.50
4        18.75
         ...  
45206     0.00
45207     0.00
45208     0.00
45209     0.00
45210     0.00
Length: 45211, dtype: float64 

Row with the highest precentage of unknown: 4 value: 18.75 



In [110]:
data.boxplot()

<Axes: >

In [111]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = le.fit_transform(data[column])

# Supervised Model

In [112]:
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

RS = 56

X = data.drop('y', axis=1)
y = data['y']

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=RS)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=RS)

# SMOTE for handling class imbalance
smote = SMOTE(random_state=RS)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Random Forest with hyperparameter tuning
rf = RandomForestClassifier(random_state=RS)

# Define a grid of hyperparameters to search
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Use GridSearchCV for finding the best hyperparameters and fitting the model
grid_search = GridSearchCV(rf, param_grid, cv=2, scoring='accuracy')
grid_search.fit(X_res, y_res)

# Use the best estimator found
best_rf = grid_search.best_estimator_

# Predict on the validation and test sets
y_val_pred = best_rf.predict(X_val)
y_test_pred = best_rf.predict(X_test)

# Evaluate the model
print("Validation Accuracy: ", accuracy_score(y_val, y_val_pred))
print("Test Accuracy: ", accuracy_score(y_test, y_test_pred))
print("Confusion matrix: ", confusion_matrix(y_test, y_test_pred))

Validation Accuracy:  0.8804011207786462
Test Accuracy:  0.8842524329106458
Confusion matrix:  [[5493  516]
 [ 269  504]]


# Unlabeled dataset creation

In [113]:
import numpy as np
rng = np.random.RandomState(RS)

random_unlabeled_points = rng.rand(data['y'].shape[0]) < 0.8
unlabel_dataset = data

unlabel_dataset['y'][random_unlabeled_points] = -1

# Self-training SSL

In [114]:
from sklearn.semi_supervised import SelfTrainingClassifier

rf = RandomForestClassifier()
self_training_model = SelfTrainingClassifier(rf)
self_training_model.fit(unlabel_dataset.drop('y', axis=1), unlabel_dataset['y'])

y_unlabeled_test_pred = self_training_model.predict(X_test)

# Evaluate the model
print("Test Accuracy: ", accuracy_score(y_test, y_unlabeled_test_pred))
print("Confusion matrix: ", confusion_matrix(y_test, y_unlabeled_test_pred))

Test Accuracy:  0.9127101150103214
Confusion matrix:  [[5953   56]
 [ 536  237]]


# Co-training SSL

In [115]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Split the data into labeled and unlabeled sets
labeled_data = unlabel_dataset[unlabel_dataset['y'] != -1]
unlabeled_data = unlabel_dataset[unlabel_dataset['y'] == -1]

# Split labeled data into training and test sets
X_labeled = labeled_data.drop('y', axis=1)
y_labeled = labeled_data['y']

X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.3)

# Split features for co-training
set_of_features_1 = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan']
set_of_features_2 = ['contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous']

X_train_1 = X_train[set_of_features_1]
X_train_2 = X_train[set_of_features_2]

y_train_1 = y_train
y_train_2 = y_train

X_test_1 = X_test[set_of_features_1]
X_test_2 = X_test[set_of_features_2]

# Initialize the classifiers
rf_clf1 = RandomForestClassifier()
xgb_clf2 = XGBClassifier()

# Train both classifiers on the labeled data
rf_clf1.fit(X_train_1, y_train)
xgb_clf2.fit(X_train_2, y_train)

# Co-training loop
n_iterations = 10  # Number of iterations
n_labels = 10     # Number of samples to label per iteration

for _ in range(n_iterations):
    # Use rf_clf1 to label data and add to xgb_clf2's training set
    if len(unlabeled_data) > 0:
        unlabeled_X1 = unlabeled_data[set_of_features_1]
        pred_y1 = rf_clf1.predict(unlabeled_X1)
        
        # Select n_labels instances rf_clf1 is most confident about
        pred_proba1 = rf_clf1.predict_proba(unlabeled_X1).max(axis=1)
        confident_indices1 = np.argsort(pred_proba1)[-n_labels:]
        add_to_train2_X = unlabeled_data.iloc[confident_indices1][set_of_features_2]
        add_to_train2_y = pred_y1[confident_indices1]

        # Update the training set for xgb_clf2
        X_train_2 = pd.concat([X_train_2, add_to_train2_X])
        y_train_2 = pd.concat([y_train_2, pd.Series(add_to_train2_y)]) 
        
        # Remove labeled instances from the unlabeled data
        unlabeled_data = unlabeled_data.drop(unlabeled_data.index[confident_indices1])

        # Retrain xgb_clf2
        xgb_clf2.fit(X_train_2, y_train_2)

    # Use xgb_clf2 to label data and add to rf_clf1's training set
    if len(unlabeled_data) > 0:
        unlabeled_X2 = unlabeled_data[set_of_features_2]
        pred_y2 = xgb_clf2.predict(unlabeled_X2)
        
        # Select n_labels instances xgb_clf2 is most confident about
        pred_proba2 = xgb_clf2.predict_proba(unlabeled_X2).max(axis=1)
        confident_indices2 = np.argsort(pred_proba2)[-n_labels:]
        add_to_train1_X = unlabeled_data.iloc[confident_indices2][set_of_features_1]
        add_to_train1_y = pred_y2[confident_indices2]

        # Update the training set for rf_clf1
        X_train_1 = pd.concat([X_train_1, add_to_train1_X])
        y_train_1 = pd.concat([y_train_1, pd.Series(add_to_train1_y)])
        
        # Remove labeled instances from the unlabeled data
        unlabeled_data = unlabeled_data.drop(unlabeled_data.index[confident_indices2])

        # Retrain rf_clf1
        rf_clf1.fit(X_train_1, y_train_1)

rf_clf1_pred = rf_clf1.predict(X_test_1)
print("clf 1 Test Accuracy: ", accuracy_score(y_test, rf_clf1_pred))
print("clf 1 Confusion matrix: ", confusion_matrix(y_test, rf_clf1_pred))


xgb_clf2_pred = xgb_clf2.predict(X_test_2)
print("clf 2 Test Accuracy: ", accuracy_score(y_test, xgb_clf2_pred))
print("clf 2 Confusion matrix: ", confusion_matrix(y_test, xgb_clf2_pred))

clf 1 Test Accuracy:  0.8755506607929515
clf 1 Confusion matrix:  [[2349   60]
 [ 279   36]]
clf 2 Test Accuracy:  0.9019823788546255
clf 2 Confusion matrix:  [[2328   81]
 [ 186  129]]
