In [7]:
##################
# Libraries
##################
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC, SVC
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from loguru import logger

In [2]:
##################
# Constants
##################
# Seed for every stochastic component that is not driven by ROWS_SEED:
# the train/test split and the tree-based estimators below.
MODEL_SEED = 42
# Seeds used to pick which training labels are hidden; the pipeline is run once per seed.
ROWS_SEED = [24, 42, 206, 602, 412, 214, 754, 457, 2023, 3202]
# Fractions of the training labels that are hidden in each experiment.
SIZE_OF_UNLABELLED_DATA = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9]
# Base estimators handed to SelfTrainingClassifier.
# The tree-based models are randomised, so they are pinned to MODEL_SEED;
# without it two runs of the same (model, fraction, rows_seed) cell can differ.
MODELS = [
    GaussianNB(var_smoothing=0.657933224657568),
    DecisionTreeClassifier(
        criterion='entropy',
        max_depth=7,
        min_samples_leaf=6,
        min_samples_split=7,
        random_state=MODEL_SEED,
    ),
    KNeighborsClassifier(metric='euclidean', weights='distance'),
    RandomForestClassifier(
        criterion='gini',
        max_depth=15,
        min_samples_leaf=2,
        min_samples_split=2,
        n_estimators=100,
        random_state=MODEL_SEED,
    ),
]

In [10]:
##################
# Functions
##################
def encode_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a label-encoded copy of ``df``.

    Each column is encoded independently with its own ``LabelEncoder`` fit,
    so every column ends up holding integer codes.

    The input frame is left untouched: encoding happens on a copy, which
    keeps the raw data available and makes the calling cell safe to re-run.

    NOTE(review): numeric columns are encoded as well, which replaces their
    values with integer codes — confirm this is intended for continuous features.

    Args:
        df: Frame whose columns should all be label-encoded.

    Returns:
        A new DataFrame with the same columns and index as ``df``.
    """
    encoded = df.copy()

    for col in encoded.columns:
        # A fresh encoder per column: the fitted classes are never reused.
        encoded[col] = LabelEncoder().fit_transform(encoded[col])

    return encoded

def get_metrics() -> dict:
    """Return the scoring functions reported by the experiments.

    Returns:
        Dict mapping a short metric name to the scikit-learn callable that
        computes it; each callable is invoked as ``metric(y_true, y_pred)``.
    """
    return dict(
        accuracy=accuracy_score,
        precision=precision_score,
        recall=recall_score,
        f1=f1_score,
        roc=roc_auc_score,
    )


def get_metrics_df(y_true, y_pred) -> pd.DataFrame:
    """Score one set of predictions with every metric from ``get_metrics``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, passed unchanged to every metric.

    Returns:
        A single-row DataFrame with one column per metric name.
    """
    scores = {
        name: [score_fn(y_true, y_pred)]
        for name, score_fn in get_metrics().items()
    }
    return pd.DataFrame(scores)

def run_pipeline(X_train, X_test, y_train, y_test, size_of_unlabelled_data=0.5, model=None, rows_seed=42) -> dict:
    """Run one self-training experiment and score it on the test split.

    A seeded random subset of the training labels is replaced by ``-1``,
    the features are scaled with a ``RobustScaler`` fitted on the training
    split only, and ``model`` is wrapped in a ``SelfTrainingClassifier``
    that is fitted on the partially labelled data.

    Args:
        X_train, X_test: Feature frames; ``X_train.index`` is used to pick
            the rows whose labels get hidden.
        y_train, y_test: Label series aligned with the feature frames.
        size_of_unlabelled_data: Fraction of training rows to unlabel.
        model: Base estimator to wrap; required.
        rows_seed: Seed of the RNG that selects the unlabelled rows.

    Returns:
        Dict with keys ``rows_seed``, ``model`` (estimator class name),
        ``unlabeled``, ``acc``, ``precision``, ``recall``, ``f1`` and ``roc``.
        All scores, including ``roc``, are computed from the hard labels
        returned by ``predict``.

    Raises:
        ValueError: If ``model`` is None.
    """
    if model is None:
        raise ValueError("Model is None")

    # Private copies: the label masking below must not leak to the caller.
    X_train, X_test = X_train.copy(), X_test.copy()
    y_train, y_test = y_train.copy(), y_test.copy()

    # Hide the labels of a reproducible random subset of training rows.
    n_hidden = int(len(X_train) * size_of_unlabelled_data)
    hidden_rows = np.random.RandomState(rows_seed).choice(X_train.index, size=n_hidden, replace=False)
    y_train.loc[hidden_rows] = -1

    # Scaling statistics come from the training split only.
    scaler = RobustScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    classifier = SelfTrainingClassifier(model, verbose=False)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # Single-row frame of scores; pull that row out once and read from it.
    scores = get_metrics_df(y_test, y_pred).iloc[0]

    return {
        "rows_seed": rows_seed,
        "model": type(model).__name__,
        "unlabeled": size_of_unlabelled_data,
        "acc": scores["accuracy"],
        "precision": scores["precision"],
        "recall": scores["recall"],
        "f1": scores["f1"],
        "roc": scores["roc"],
    }

def run_pipeline_self(X_train, X_test, y_train, y_test, models=None, unlabelled_sizes=None, rows_seeds=None) -> pd.DataFrame:
    """Run ``run_pipeline`` over a grid of models, unlabelled fractions and seeds.

    Args:
        X_train, X_test, y_train, y_test: Train/test split forwarded unchanged
            to every ``run_pipeline`` call.
        models: Base estimators to evaluate; defaults to ``MODELS``.
        unlabelled_sizes: Fractions of training labels to hide; defaults to
            ``SIZE_OF_UNLABELLED_DATA``.
        rows_seeds: Seeds for choosing the hidden rows; defaults to ``ROWS_SEED``.

    Returns:
        DataFrame with one row per run and the columns ``rows_seed``,
        ``model``, ``unlabeled``, ``acc``, ``precision``, ``recall``, ``f1``
        and ``roc``. The columns are present even when the grid is empty.
    """
    # Resolve defaults at call time so edits to the module constants are honoured.
    models = MODELS if models is None else models
    unlabelled_sizes = SIZE_OF_UNLABELLED_DATA if unlabelled_sizes is None else unlabelled_sizes
    rows_seeds = ROWS_SEED if rows_seeds is None else rows_seeds

    result_columns = ['rows_seed', 'model', 'unlabeled', 'acc', 'precision', 'recall', 'f1', 'roc']
    run_counter = 0
    results = []

    for model in models:
        for value in unlabelled_sizes:
            logger.info(f'Model: {model}')
            logger.info(f'Size of unlabelled data: {value}')
            for row_seed in rows_seeds:
                run_counter += 1
                logger.info(f"%{run_counter} - Running pipeline for row_seed: {row_seed} and size_of_unlabelled_data: {value} and model: {model}")
                results.append(
                    run_pipeline(
                        X_train=X_train,
                        X_test=X_test,
                        y_train=y_train,
                        y_test=y_test,
                        size_of_unlabelled_data=value,
                        model=model,
                        rows_seed=row_seed,
                    )
                )
            print('\n')

    # Each result is already a flat record, so build the frame in one step;
    # ``columns`` fixes the column order and keeps the schema for an empty grid.
    return pd.DataFrame(results, columns=result_columns)

    

In [4]:
# Load the dataset and label-encode every column.
# NOTE(review): the encoder is applied to all columns, features included — confirm this is intended.
df = encode_dataframe(pd.read_csv('../../datasets/diabetes.csv'))

# 'Outcome' is the target; every other column is a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 20% of the rows for testing, with a fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=MODEL_SEED,
)

In [11]:
# Run the full grid (every model x unlabelled fraction x row seed); one result row per run.
results_df = run_pipeline_self(X_train, X_test, y_train, y_test)

[32m2023-10-22 10:50:46.789[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m75[0m - [1mModel: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:46.790[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m76[0m - [1mSize of unlabelled data: 0.1[0m
[32m2023-10-22 10:50:46.791[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%1 - Running pipeline for row_seed: 24 and size_of_unlabelled_data: 0.1 and model: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:46.803[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%2 - Running pipeline for row_seed: 42 and size_of_unlabelled_data: 0.1 and model: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:46.816[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%3 - Running pipeline for row_seed: 206 and size_of_unlabelled_da







[32m2023-10-22 10:50:47.122[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%27 - Running pipeline for row_seed: 754 and size_of_unlabelled_data: 0.3 and model: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:47.135[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%28 - Running pipeline for row_seed: 457 and size_of_unlabelled_data: 0.3 and model: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:47.148[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%29 - Running pipeline for row_seed: 2023 and size_of_unlabelled_data: 0.3 and model: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:47.161[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%30 - Running pipeline for row_seed: 3202 and size_of_unlabelled_data: 0.3 and model: GaussianNB(var_smoothing=0.657933224657568)[0







[32m2023-10-22 10:50:47.383[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%44 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.5 and model: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:47.410[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%45 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.5 and model: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:47.432[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%46 - Running pipeline for row_seed: 214 and size_of_unlabelled_data: 0.5 and model: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:47.446[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%47 - Running pipeline for row_seed: 754 and size_of_unlabelled_data: 0.5 and model: GaussianNB(var_smoothing=0.657933224657568)[0m








[32m2023-10-22 10:50:47.718[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%65 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.7 and model: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:47.737[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%66 - Running pipeline for row_seed: 214 and size_of_unlabelled_data: 0.7 and model: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:47.756[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%67 - Running pipeline for row_seed: 754 and size_of_unlabelled_data: 0.7 and model: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:47.774[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%68 - Running pipeline for row_seed: 457 and size_of_unlabelled_data: 0.7 and model: GaussianNB(var_smoothing=0.657933224657568)[0m








[32m2023-10-22 10:50:48.026[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%86 - Running pipeline for row_seed: 214 and size_of_unlabelled_data: 0.85 and model: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:48.042[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%87 - Running pipeline for row_seed: 754 and size_of_unlabelled_data: 0.85 and model: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:48.057[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%88 - Running pipeline for row_seed: 457 and size_of_unlabelled_data: 0.85 and model: GaussianNB(var_smoothing=0.657933224657568)[0m
[32m2023-10-22 10:50:48.075[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%89 - Running pipeline for row_seed: 2023 and size_of_unlabelled_data: 0.85 and model: GaussianNB(var_smoothing=0.657933224657568)







[32m2023-10-22 10:50:48.321[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%105 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.1 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-22 10:50:48.340[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%106 - Running pipeline for row_seed: 214 and size_of_unlabelled_data: 0.1 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-22 10:50:48.356[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%107 - Running pipeline for row_seed: 754 and size_of_unlabelled_data: 0.1 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-22 







[32m2023-10-22 10:50:48.641[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%123 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.3 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-22 10:50:48.658[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%124 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.3 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-22 10:50:48.677[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%125 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.3 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-22 







[32m2023-10-22 10:50:48.988[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%143 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.5 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-22 10:50:49.009[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%144 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.5 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-22 10:50:49.026[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%145 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.5 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-22 







[32m2023-10-22 10:50:49.341[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%163 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.7 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-22 10:50:49.356[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%164 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.7 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-22 10:50:49.377[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%165 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.7 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-22 







[32m2023-10-22 10:50:49.682[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%184 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.85 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-22 10:50:49.701[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%185 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.85 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-22 10:50:49.717[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%186 - Running pipeline for row_seed: 214 and size_of_unlabelled_data: 0.85 and model: DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       min_samples_split=7)[0m
[32m2023-10-







[32m2023-10-22 10:50:49.998[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%206 - Running pipeline for row_seed: 214 and size_of_unlabelled_data: 0.1 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:50.011[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%207 - Running pipeline for row_seed: 754 and size_of_unlabelled_data: 0.1 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:50.026[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%208 - Running pipeline for row_seed: 457 and size_of_unlabelled_data: 0.1 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:50.040[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%209 - Running pipeline for row_seed: 2023 and size_of_unlabelled_data: 0.1 and







[32m2023-10-22 10:50:50.275[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%224 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.3 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:50.290[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%225 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.3 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:50.309[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%226 - Running pipeline for row_seed: 214 and size_of_unlabelled_data: 0.3 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:50.328[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%227 - Running pipeline for row_seed: 754 and size_of_unlabelled_data: 0.3 and 







[32m2023-10-22 10:50:50.610[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%243 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.5 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:50.629[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%244 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.5 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:50.651[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%245 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.5 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:50.668[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%246 - Running pipeline for row_seed: 214 and size_of_unlabelled_data: 0.5 and 







[32m2023-10-22 10:50:50.975[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%262 - Running pipeline for row_seed: 42 and size_of_unlabelled_data: 0.7 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:50.995[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%263 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.7 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:51.014[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%264 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.7 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:51.032[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%265 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.7 and m





[32m2023-10-22 10:50:51.358[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m75[0m - [1mModel: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:51.359[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m76[0m - [1mSize of unlabelled data: 0.85[0m
[32m2023-10-22 10:50:51.359[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%281 - Running pipeline for row_seed: 24 and size_of_unlabelled_data: 0.85 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:51.384[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%282 - Running pipeline for row_seed: 42 and size_of_unlabelled_data: 0.85 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:51.404[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%283 - 





[32m2023-10-22 10:50:51.563[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m75[0m - [1mModel: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:51.564[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m76[0m - [1mSize of unlabelled data: 0.9[0m
[32m2023-10-22 10:50:51.564[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%291 - Running pipeline for row_seed: 24 and size_of_unlabelled_data: 0.9 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:51.583[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%292 - Running pipeline for row_seed: 42 and size_of_unlabelled_data: 0.9 and model: KNeighborsClassifier(metric='euclidean', weights='distance')[0m
[32m2023-10-22 10:50:51.602[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%293 - Run







[32m2023-10-22 10:50:52.265[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%302 - Running pipeline for row_seed: 42 and size_of_unlabelled_data: 0.1 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:50:52.955[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%303 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.1 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:50:53.556[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%304 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.1 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:50:54.047[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%305 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.1 and model: Random





[32m2023-10-22 10:50:58.520[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%312 - Running pipeline for row_seed: 42 and size_of_unlabelled_data: 0.2 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:50:59.579[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%313 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.2 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:00.259[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%314 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.2 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:00.851[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%315 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.2 and model: Random





[32m2023-10-22 10:51:05.638[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%322 - Running pipeline for row_seed: 42 and size_of_unlabelled_data: 0.3 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:06.730[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%323 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.3 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:07.688[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%324 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.3 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:08.811[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%325 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.3 and model: Random





[32m2023-10-22 10:51:15.056[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%332 - Running pipeline for row_seed: 42 and size_of_unlabelled_data: 0.4 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:16.071[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%333 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.4 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:17.072[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%334 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.4 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:18.066[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%335 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.4 and model: Random





[32m2023-10-22 10:51:24.546[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%342 - Running pipeline for row_seed: 42 and size_of_unlabelled_data: 0.5 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:25.431[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%343 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.5 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:26.237[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%344 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.5 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:27.212[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%345 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.5 and model: Random





[32m2023-10-22 10:51:33.853[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%352 - Running pipeline for row_seed: 42 and size_of_unlabelled_data: 0.6 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:34.806[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%353 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.6 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:35.766[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%354 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.6 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:36.721[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%355 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.6 and model: Random





[32m2023-10-22 10:51:43.211[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%362 - Running pipeline for row_seed: 42 and size_of_unlabelled_data: 0.7 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:44.157[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%363 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.7 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:45.103[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%364 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.7 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:45.873[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%365 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.7 and model: Random





[32m2023-10-22 10:51:52.483[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%372 - Running pipeline for row_seed: 42 and size_of_unlabelled_data: 0.8 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:53.404[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%373 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.8 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:54.324[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%374 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.8 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:51:55.256[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%375 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.8 and model: Random





[32m2023-10-22 10:52:01.631[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%382 - Running pipeline for row_seed: 42 and size_of_unlabelled_data: 0.85 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:52:02.529[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%383 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.85 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:52:03.444[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%384 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.85 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:52:04.385[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%385 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.85 and model: Ra





[32m2023-10-22 10:52:10.873[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%392 - Running pipeline for row_seed: 42 and size_of_unlabelled_data: 0.9 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:52:11.788[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%393 - Running pipeline for row_seed: 206 and size_of_unlabelled_data: 0.9 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:52:12.696[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%394 - Running pipeline for row_seed: 602 and size_of_unlabelled_data: 0.9 and model: RandomForestClassifier(max_depth=15, min_samples_leaf=2)[0m
[32m2023-10-22 10:52:13.596[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_pipeline_self[0m:[36m79[0m - [1m%395 - Running pipeline for row_seed: 412 and size_of_unlabelled_data: 0.9 and model: Random





In [12]:
# Mean and standard deviation of every metric across the row seeds,
# one row per (model, unlabelled fraction) pair.
results_df_with_mean_std = (
    results_df
    .groupby(['model', 'unlabeled'])[['acc', 'f1', 'precision', 'recall', 'roc']]
    .agg(['mean', 'std'])
)
# Flatten the (metric, statistic) column pairs into names such as "acc_mean".
results_df_with_mean_std.columns = [f"{metric}_{stat}" for metric, stat in results_df_with_mean_std.columns]
# Turn both group keys back into ordinary 'model' and 'unlabeled' columns.
results_df_with_mean_std = results_df_with_mean_std.reset_index()
results_df_with_mean_std.columns

Index(['model', 'unlabeled', 'acc_mean', 'acc_std', 'f1_mean', 'f1_std',
       'precision_mean', 'precision_std', 'recall_mean', 'recall_std',
       'roc_mean', 'roc_std'],
      dtype='object')

In [8]:
import plotly.express as px

# One scatter per metric: mean on the y-axis, standard deviation as error bars,
# one colour per model.
figs = [
    px.scatter(
        results_df_with_mean_std,
        x='unlabeled',
        y=f'{metric}_mean',
        error_y=f'{metric}_std',
        color='model',
        title=title,
    )
    for metric, title in [
        ('acc', 'Accuracy vs. Unlabeled Data'),
        ('f1', 'F1 Score vs. Unlabeled Data'),
        ('precision', 'Precision vs. Unlabeled Data'),
        ('recall', 'Recall vs. Unlabeled Data'),
        ('roc', 'ROC AUC vs. Unlabeled Data'),
    ]
]
# Keep the individual figure names bound, in the same order as before.
fig1, fig2, fig3, fig4, fig5 = figs

for fig in figs:
    # Shared layout: axis/legend titles, fixed canvas size, unified hover.
    fig.update_layout(
        xaxis_title="% of Unlabeled Data in Dataset",
        yaxis_title="Metric Value",
        legend_title="Model",
        autosize=False,
        width=1200,
        height=500,
        hovermode="x unified",
    )
    # NOTE(review): the traces are created without a `text` column, so this
    # text template may have no visible effect — confirm whether point labels are wanted.
    fig.update_traces(textposition='top center', texttemplate='%{y:.3f}')
    fig.show()


In [15]:
# Metrics with tabulate
from tabulate import tabulate

# Render the aggregated metrics as a psql-style text table, using the column
# names as headers and omitting the row index.
summary_table = tabulate(
    results_df_with_mean_std,
    headers='keys',
    tablefmt='psql',
    showindex=False,
)
print(summary_table)

+------------------------+-------------+------------+------------+-----------+-----------+------------------+-----------------+---------------+--------------+------------+------------+
| model                  |   unlabeled |   acc_mean |    acc_std |   f1_mean |    f1_std |   precision_mean |   precision_std |   recall_mean |   recall_std |   roc_mean |    roc_std |
|------------------------+-------------+------------+------------+-----------+-----------+------------------+-----------------+---------------+--------------+------------+------------|
| DecisionTreeClassifier |        0.1  |   0.732468 | 0.0310363  |  0.612157 | 0.0593319 |         0.635283 |       0.0519972 |      0.598182 |    0.0963808 |   0.702626 | 0.0399787  |
| DecisionTreeClassifier |        0.2  |   0.722727 | 0.0181225  |  0.612012 | 0.0442131 |         0.610771 |       0.0217363 |      0.618182 |    0.0808586 |   0.699495 | 0.0295079  |
| DecisionTreeClassifier |        0.3  |   0.708442 | 0.0367902  |  0.58444