In [1]:
# Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from tabulate import tabulate
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import LabelEncoder
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from loguru import logger

In [2]:
TEST_PERCENTAGE = 0.2
SEED = [24, 42, 206, 602, 412, 214, 754, 457, 2023, 3202]
UNLABELED_PERCENTAGE = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9]
MODELS = [
    GaussianNB(var_smoothing=0.001),
    DecisionTreeClassifier(criterion='gini', max_depth=7, min_samples_leaf=1, min_samples_split=2),
    KNeighborsClassifier(metric='euclidean', n_neighbors=3, weights='uniform'),
    RandomForestClassifier(criterion='gini', max_depth=9, min_samples_leaf=1, min_samples_split=2, n_estimators=100),
]

In [3]:
def encode_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Encode dataframe using LabelEncoder"""
    labelencoder = LabelEncoder()

    for col in df.columns:
        df[col] = labelencoder.fit_transform(df[col])

    return df


def get_metrics():
    metrics = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
        'roc': roc_auc_score
    }
    return metrics


def get_metrics_df(y_true, y_pred) -> pd.DataFrame:
    metrics = get_metrics()
    df = pd.DataFrame()
    for metric_name, metric in metrics.items():
        df[metric_name] = [metric(y_true, y_pred)]

    return df

In [4]:
import warnings

# Filter out the specific warning messages
warnings.filterwarnings('ignore', message='X has feature names, but GaussianNB was fitted without feature names')
warnings.filterwarnings('ignore', message='X has feature names, but DecisionTreeClassifier was fitted without feature names')
warnings.filterwarnings('ignore', message='X has feature names, but KNeighborsClassifier was fitted without feature names')
warnings.filterwarnings('ignore', message='X has feature names, but RandomForestClassifier was fitted without feature names')

In [5]:
df = pd.read_csv('../../datasets/mushrooms.csv')
df = encode_dataframe(df)


X = df.drop('class', axis=1)
y = df['class']

scaler = RobustScaler()

In [6]:
exec_counter = 0
results = []

for model in MODELS:
    logger.info(f"Running model: {model.__class__.__name__}")
    for value in UNLABELED_PERCENTAGE:
        logger.info(f"Unlabeled percentage: {value}")
        for seed in SEED:
            exec_counter += 1
            logger.info(f"Execution: {exec_counter}")
            logger.info(f"Seed: {seed}")
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=value, random_state=seed)
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            logger.debug(f'X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}')

            X_train, X_unlabeled, y_train, y_unlabeled = train_test_split(X_train, y_train, test_size=value, random_state=seed)
            logger.debug(f'X_train: {X_train.shape}, X_unlabeled: {X_unlabeled.shape}, y_train: {y_train.shape}, y_unlabeled: {y_unlabeled.shape}')

            model.fit(X_train, y_train)

            # Predict on unlabeled data
            y_unlabeled_predicted = model.predict(X_unlabeled)
            y_unlabeled_predicted_proba = model.predict_proba(X_unlabeled)

            # Concat the predicted values with the original data
            X_train = np.concatenate((X_train, X_unlabeled))
            y_train = np.concatenate((y_train, y_unlabeled_predicted))
            logger.info('New data added to training set')
            logger.debug(f'X_train: {X_train.shape}, y_train: {y_train.shape}')

            # Train the model with the new data
            model.fit(X_train, y_train)

            # Predict on test set
            y_pred = model.predict(X_test)
            y_pred_proba = model.predict_proba(X_test)

            # Metrics
            metrics = get_metrics_df(y_test, y_pred)

            accuracy = metrics["accuracy"].values[0]
            precision = metrics["precision"].values[0]
            recall = metrics["recall"].values[0]
            f1 = metrics["f1"].values[0]
            roc = metrics["roc"].values[0]

            # Results
            results.append({"rows_seed": seed, "model": type(model).__name__, "unlabeled": value, "acc": accuracy, "precision": precision, "recall": recall, "f1": f1, "roc": roc})

            results_rows_seed = []
            results_models = []
            results_unlabeled = []
            results_acc = []
            results_precision = []
            results_recall = []
            results_f1 = []
            results_roc = []

            for result in results:
                results_rows_seed.append(result.get('rows_seed'))
                results_models.append(result.get('model'))
                results_unlabeled.append(result.get('unlabeled'))
                results_acc.append(result.get('acc'))
                results_precision.append(result.get('precision'))
                results_recall.append(result.get('recall'))
                results_f1.append(result.get('f1'))
                results_roc.append(result.get('roc'))


            results_df = pd.DataFrame({'rows_seed': results_rows_seed, 'model': results_models, 'unlabeled': results_unlabeled, 'acc': results_acc, 'precision': results_precision, 'recall': results_recall, 'f1': results_f1, 'roc': results_roc})

[32m2023-10-23 22:15:47.412[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mRunning model: GaussianNB[0m
[32m2023-10-23 22:15:47.413[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mUnlabeled percentage: 0.1[0m
[32m2023-10-23 22:15:47.414[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mExecution: 1[0m
[32m2023-10-23 22:15:47.414[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mSeed: 24[0m
[32m2023-10-23 22:15:47.430[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [34m[1mX_train: (7311, 22), y_train: (7311,), X_test: (813, 22), y_test: (813,)[0m
[32m2023-10-23 22:15:47.432[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [34m[1mX_train: (6579, 22), X_unlabeled: (732, 22), y_train: (6579,), y_unlabeled: (732,)[0m
[32m2023-10-23 22:15:47.435[0m | [1mINFO    [0m | [36m__main__[0m:[36m<mo

In [7]:
# mean and standard deviation for all metrics
results_df_with_mean_std = results_df.groupby(['model', 'unlabeled']).agg({'acc': ['mean', 'std'], 'f1': ['mean', 'std'], 'precision': ['mean', 'std'], 'recall': ['mean', 'std'], 'roc': ['mean', 'std']})
results_df_with_mean_std = results_df_with_mean_std.reset_index(level=1)
results_df_with_mean_std.columns = ["_".join(col) for col in results_df_with_mean_std.columns.values]
results_df_with_mean_std.rename(columns={'unlabeled_': 'unlabeled'}, inplace=True)
results_df_with_mean_std.reset_index(inplace=True)
results_df_with_mean_std.columns

Index(['model', 'unlabeled', 'acc_mean', 'acc_std', 'f1_mean', 'f1_std',
       'precision_mean', 'precision_std', 'recall_mean', 'recall_std',
       'roc_mean', 'roc_std'],
      dtype='object')

In [8]:
# Metrics with tabulate
from tabulate import tabulate

print(tabulate(results_df_with_mean_std, headers='keys', tablefmt='psql', showindex=False))

+------------------------+-------------+------------+-------------+-----------+-------------+------------------+-----------------+---------------+--------------+------------+-------------+
| model                  |   unlabeled |   acc_mean |     acc_std |   f1_mean |      f1_std |   precision_mean |   precision_std |   recall_mean |   recall_std |   roc_mean |     roc_std |
|------------------------+-------------+------------+-------------+-----------+-------------+------------------+-----------------+---------------+--------------+------------+-------------|
| DecisionTreeClassifier |        0.1  |   1        | 0           |  1        | 0           |         1        |     0           |      1        |  0           |   1        | 0           |
| DecisionTreeClassifier |        0.2  |   1        | 0           |  1        | 0           |         1        |     0           |      1        |  0           |   1        | 0           |
| DecisionTreeClassifier |        0.3  |   0.999467 | 0