In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [3]:
df = pd.read_csv("./bankDataset/bank-full.csv",sep = ";")

In [4]:
#Encode data using LabelEncoder
for column in df.select_dtypes(include=['object']).columns:
    df[column] = LabelEncoder().fit_transform(df[column])


In [5]:
X = df.drop("y",axis = 1)
y = df["y"]

In [6]:
X_train,X_test,y_train_true,y_test_true = train_test_split(X,y,test_size=0.2, random_state=42)

In [7]:
#Division of into labeled and unlabeled
X_label,y_label = X_train[:400], y_train_true[:400]
X_unlabel = X_train[400:]

## SEMI-SUPERVISED ENSEMBLE

In [8]:
tree_clf = DecisionTreeClassifier(random_state = 42)
svm_clf = SVC(probability=True, random_state=42)

tree_clf.fit(X_label,y_label)
svm_clf.fit(X_label,y_label)

# Pseudo-labeling on unlabeled data
pseudo_labels = np.array([tree_clf.predict(X_unlabel), svm_clf.predict(X_unlabel)])
confidence = np.max([tree_clf.predict_proba(X_unlabel), svm_clf.predict_proba(X_unlabel)], axis=0)
threshold = 0.75  # You can adjust this threshold
high_confidence = np.max(confidence, axis=1) > threshold
X_pseudo_labeled = X_unlabel[high_confidence]
y_pseudo_labeled = np.round(np.mean(pseudo_labels[:, high_confidence], axis=0)).astype(int)

# Combine into an ensemble and retrain
ensemble_clf = VotingClassifier(estimators=[('dt', tree_clf), ('svm', svm_clf)], voting='soft')

# Combine labeled and pseudo-labeled data
X_combined = np.vstack((X_label, X_pseudo_labeled))
y_combined = np.hstack((y_label, y_pseudo_labeled))

ensemble_clf.fit(X_combined, y_combined)

# Evaluate the model
y_pred = ensemble_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test_true, y_pred))



Accuracy: 0.8791330310737587


## UNSUPERVISED PRE-TRAINED

In [20]:
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
import tensorflow as tf


In [27]:
rbm = BernoulliRBM(n_components=100, learning_rate=0.1, batch_size=10, n_iter =10, verbose = True, random_state=42)

rbm.fit(X_unlabel)

X_label_rbm = rbm.transform(X_label)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
])

model.compile(optimizer = "adam", loss = "mse",metrics = ["accuracy"])
model.fit(X_label_rbm, y_label, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_label_rbm, y_label)
print("Accuracy:", accuracy)



[BernoulliRBM] Iteration 1, pseudo-likelihood = -1449854150.61, time = 0.37s




[BernoulliRBM] Iteration 2, pseudo-likelihood = -2899872175.87, time = 0.48s




[BernoulliRBM] Iteration 3, pseudo-likelihood = -4349890201.33, time = 0.44s




[BernoulliRBM] Iteration 4, pseudo-likelihood = -5799908224.11, time = 0.43s




[BernoulliRBM] Iteration 5, pseudo-likelihood = -7249926249.90, time = 0.43s




[BernoulliRBM] Iteration 6, pseudo-likelihood = -8699944271.87, time = 0.43s




[BernoulliRBM] Iteration 7, pseudo-likelihood = -10149962297.19, time = 0.44s




[BernoulliRBM] Iteration 8, pseudo-likelihood = -11599980320.71, time = 0.47s




[BernoulliRBM] Iteration 9, pseudo-likelihood = -13049998346.06, time = 0.44s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -14500016369.73, time = 0.43s
Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.8924999833106995
