In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

RS = 42
data = pd.read_csv('./bankDataset/bank-full.csv', sep=';')

# Check for missing values
print(data.isnull().sum())

for column in data.select_dtypes(include=['object']).columns:
    data[column] = LabelEncoder().fit_transform(data[column])

print(data.head())
print(data.info())

X = data.drop('y', axis=1)
y = data['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RS)

# Apply undersampling to handle class imbalance
under_sampler = RandomUnderSampler(random_state=RS)
X_res, y_res = under_sampler.fit_resample(X_train, y_train)

rf = RandomForestClassifier(random_state=RS)

# Define a grid of hyperparameters to search
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(rf, param_grid, cv=2, scoring='accuracy')
grid_search.fit(X_res, y_res)


# Use the best estimator found
best_rf = grid_search.best_estimator_
# Predict on the validation and test sets
y_test_pred = best_rf.predict(X_test)

# Evaluate the model
print("Test Accuracy: ", accuracy_score(y_test, y_test_pred))
print("Confusion matrix: ", confusion_matrix(y_test, y_test_pred))

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64
   age  job  marital  education  default  balance  housing  loan  contact  \
0   58    4        1          2        0     2143        1     0        2   
1   44    9        2          1        0       29        1     0        2   
2   33    2        1          1        0        2        1     1        2   
3   47    1        1          3        0     1506        1     0        2   
4   33   11        2          3        0        1        0     0        2   

   day  month  duration  campaign  pdays  previous  poutcome  y  
0    5      8       261         1     -1         0         3  0  
1    5      8       151         1     -1         0         3  0  
2    5      8        76         1     -1         0         3  0  
3    

In [23]:
import numpy as np
from sklearn.semi_supervised import SelfTrainingClassifier

rng = np.random.RandomState(RS)

random_unlabeled_points = rng.rand(data['y'].shape[0]) < 0.99
unlabel_dataset = data

unlabel_dataset['y'][random_unlabeled_points] = -1

rf = RandomForestClassifier()
self_training_model = SelfTrainingClassifier(rf)
self_training_model.fit(unlabel_dataset.drop('y', axis=1), unlabel_dataset['y'])

y_unlabeled_test_pred = self_training_model.predict(X_test)

# Evaluate the model
print("Test Accuracy: ", accuracy_score(y_test, y_unlabeled_test_pred))
print("Confusion matrix: ", confusion_matrix(y_test, y_unlabeled_test_pred))

45211 45211
Test Accuracy:  0.8870539663815984
Confusion matrix:  [[11651   315]
 [ 1217   381]]


In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.utils import shuffle
import numpy as np


# Split the data into labeled and unlabeled sets
labeled_data = unlabel_dataset[unlabel_dataset['y'] != -1]
unlabeled_data = unlabel_dataset[unlabel_dataset['y'] == -1]

# Split labeled data into training and test sets
X_labeled = labeled_data.drop('y', axis=1)
y_labeled = labeled_data['y']
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2)

# Split features for co-training
set_of_features_1 = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan']
set_of_features_2 = ['contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome']

X_train_1 = X_train[set_of_features_1]
X_train_2 = X_train[set_of_features_2]
X_test_1 = X_test[set_of_features_1]
X_test_2 = X_test[set_of_features_2]

# Initialize the classifiers
clf1 = RandomForestClassifier()
clf2 = XGBClassifier()

# Train both classifiers on the labeled data
clf1.fit(X_train_1, y_train)
clf2.fit(X_train_2, y_train)

# Co-training loop
n_iterations = 10  # Number of iterations
n_labels = 10     # Number of samples to label per iteration

for _ in range(n_iterations):
    # Use clf1 to label data and add to clf2's training set
    if len(unlabeled_data) > 0:
        unlabeled_X1 = unlabeled_data[set_of_features_1]
        pred_y1 = clf1.predict(unlabeled_X1)
        
        # Select n_labels instances clf1 is most confident about
        pred_proba1 = clf1.predict_proba(unlabeled_X1).max(axis=1)
        confident_indices1 = np.argsort(pred_proba1)[-n_labels:]
        add_to_train2 = unlabeled_data.iloc[confident_indices1]

        # Update the training set for clf2
        X_train_2 = pd.concat([X_train_2, add_to_train2[set_of_features_2]])
        y_train_2 = pd.concat([y_train, add_to_train2['y']])
        
        # Remove labeled instances from the unlabeled data
        unlabeled_data = unlabeled_data.drop(add_to_train2.index)

        # Retrain clf2
        clf2.fit(X_train_2, y_train_2)

    # Use clf2 to label data and add to clf1's training set
    if len(unlabeled_data) > 0:
        unlabeled_X2 = unlabeled_data[set_of_features_2]
        pred_y2 = clf2.predict(unlabeled_X2)
        
        # Select n_labels instances clf2 is most confident about
        pred_proba2 = clf2.predict_proba(unlabeled_X2).max(axis=1)
        confident_indices2 = np.argsort(pred_proba2)[-n_labels:]
        add_to_train1 = unlabeled_data.iloc[confident_indices2]

        # Update the training set for clf1
        X_train_1 = pd.concat([X_train_1, add_to_train1[set_of_features_1]])
        y_train_1 = pd.concat([y_train, add_to_train1['y']])
        
        # Remove labeled instances from the unlabeled data
        unlabeled_data = unlabeled_data.drop(add_to_train1.index)

        # Retrain clf1
        clf1.fit(X_train_1, y_train_1)


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [-1  0  1]

In [24]:
data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')