# Assessment of semi-supervised learning

## Libraries

In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)

from sklearn.metrics import f1_score
import xgboost as xgb
import sys

In [2]:
sys.path.append('../src')

In [3]:
from pre_processing import assign_data_set, ohe_features
from ml import BaselineClf, SelfTrainingClf, LabelPropagationClf, LabelSpreadingClf
from visualizing import plot_scores
from defs import VAL_SCORE, TEST_SCORE
from tuning import tune_param, get_best_score

## Parameters

In [4]:
# Path to the raw data set. Forward slashes are used because they work on
# Windows as well as POSIX (the original backslash form only resolved on
# Windows) and match the '../src' path appended to sys.path above.
DATA_FILE = '../res/diabetes_prediction_dataset.csv'

# columns to be one-hot encoded
ENCODE_COLS = ['gender', 'smoking_history']
# column to be predicted
TARGET_COL = 'diabetes'

# classifier evaluation metric
SCORE_FCN = f1_score

# classifier class handed to every model wrapper below
CLF = xgb.XGBClassifier
# label sizes swept by every experiment; the largest value (80000) equals the
# TRAIN split size reported by the data-split cell further down
LABEL_SIZES = [20, 30, 100, 200, 300, 500, 1000, 3000, 10000, 30000, 80000]

## Data

source: https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset

In [5]:
# load the raw data set and display its number of rows
df = pd.read_csv(DATA_FILE)
df.shape[0]

100000

In [6]:
# assign every row to the train, val or test set ...
split_df = assign_data_set(df)
# ... and display how many rows ended up in each set
data_set_counts = split_df.index.value_counts()
data_set_counts

data_set
TRAIN    80000
VAL      10000
TEST     10000
Name: count, dtype: int64

In [7]:
# one-hot encode the categorical columns and preview the first five rows
feature_df = ohe_features(split_df, ENCODE_COLS)
feature_df.head(n=5)

Unnamed: 0_level_0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
TRAIN,52.0,0,0,27.32,4.8,140,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
TRAIN,56.0,0,0,27.32,4.8,100,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
TRAIN,22.0,0,0,37.16,6.6,85,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
TRAIN,49.0,0,0,43.83,5.0,160,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
TRAIN,10.0,0,0,14.18,4.0,155,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Baseline

In [8]:
# fit and score a fresh baseline classifier for every label size
baseline_scores = [
    BaselineClf(feature_df, CLF, SCORE_FCN, TARGET_COL).fit_score(label_size)
    for label_size in LABEL_SIZES
]

# collect the per-label-size results in one table and display it
baseline_scores_df = pd.DataFrame(baseline_scores)
baseline_scores_df

Unnamed: 0,model,label_size,train_score,val_score,test_score,fit_time,score_time,model_size
0,Baseline,20,0.0,0.0,0.0,1.040111,0.075344,4528
1,Baseline,30,0.8,0.353153,0.363316,0.066939,0.091664,4528
2,Baseline,100,0.923077,0.325088,0.318339,0.087873,0.107772,4528
3,Baseline,200,1.0,0.563821,0.56792,0.141114,0.151521,4528
4,Baseline,300,1.0,0.673961,0.704997,0.168477,0.137172,4528
5,Baseline,500,1.0,0.720167,0.750164,0.154422,0.137087,4528
6,Baseline,1000,1.0,0.790945,0.792714,0.37167,0.137672,4528
7,Baseline,3000,0.996169,0.793377,0.800255,0.367755,0.17255,4528
8,Baseline,10000,0.956258,0.785526,0.817204,0.698859,0.1585,4528
9,Baseline,30000,0.867883,0.80429,0.825048,1.528784,0.180472,4528


In [9]:
# validation score of the baseline over the label sizes
fig = plot_scores(baseline_scores_df, 'label_size', VAL_SCORE, 'model')
fig.show()

# Self-Training

In [10]:
# same label-size sweep as the baseline, using the self-training wrapper
# without any extra parameters
st_scores = [
    SelfTrainingClf(feature_df, CLF, SCORE_FCN, TARGET_COL).fit_score(label_size)
    for label_size in LABEL_SIZES
]

st_base_scores_df = pd.DataFrame(st_scores)


y contains no unlabeled samples



In [11]:
# baseline vs. self-training (default parameters), validation score
plot_df = pd.concat([baseline_scores_df, st_base_scores_df])
fig = plot_scores(plot_df, 'label_size', VAL_SCORE, 'model')
fig.show()

## Tune Threshold

In [12]:
# candidate values for the self-training `threshold` parameter
param_tune = dict(threshold=[0.5, 0.7, 0.9, 0.97, 0.99, 0.997, 0.999])

# evaluate every candidate for every label size
st_tune_scores_df = tune_param(
    SelfTrainingClf, CLF, feature_df, SCORE_FCN, TARGET_COL, param_tune,
    LABEL_SIZES, n_workers=-1,
)

Tuning threshold


In [13]:
# validation score per threshold, one line per label size
fig = plot_scores(st_tune_scores_df, 'threshold', VAL_SCORE, 'label_size',
                  xaxis_type='reverselog')
fig.show()

In [14]:
# Reduce the threshold tuning results to the best parameter of each training
# (label) size. The selection itself happens inside get_best_score --
# presumably on the validation score; confirm in tuning.py.
st_best_scores_df = get_best_score(st_tune_scores_df)
# display the selected rows
st_best_scores_df

Unnamed: 0,model,label_size,threshold,criterion,k_best,train_score,val_score,test_score,fit_time,score_time,model_size,param_tuned
0,SelfTraining,20,0.5,threshold,10,0.0,0.0,0.0,8.416786,0.399004,1289360,threshold
1,SelfTraining,30,0.997,threshold,10,0.8,0.353153,0.363316,0.647321,0.334318,1289024,threshold
2,SelfTraining,100,0.99,threshold,10,0.923077,0.454616,0.466667,77.474369,0.266649,1289024,threshold
3,SelfTraining,200,0.999,threshold,10,0.965517,0.678815,0.695398,106.723874,0.255449,1288928,threshold
4,SelfTraining,300,0.97,threshold,10,0.96,0.736842,0.75838,219.559588,0.490069,1288928,threshold
5,SelfTraining,500,0.99,threshold,10,0.948718,0.802221,0.822591,206.433846,0.517473,1288928,threshold
6,SelfTraining,1000,0.99,threshold,10,0.956044,0.805805,0.823762,227.301527,0.579633,1288928,threshold
7,SelfTraining,3000,0.9,threshold,10,0.865801,0.802787,0.821716,247.629238,0.381853,1288928,threshold
8,SelfTraining,10000,0.9,threshold,10,0.85543,0.803841,0.824387,257.628644,0.494835,1288928,threshold
9,SelfTraining,30000,0.997,threshold,10,0.869314,0.807822,0.827943,206.581921,0.588046,1288928,threshold


In [15]:
# test score of the baseline, self-training with default parameters and
# self-training with tuned threshold; the model column is overwritten so each
# variant gets its own label in the plot
combine_df = pd.concat([
    baseline_scores_df,
    st_base_scores_df.assign(model='ST Default'),
    st_best_scores_df.assign(model='ST Threshold Tuned'),
])

fig = plot_scores(combine_df, 'label_size', TEST_SCORE, 'model', xaxis_type='log')
fig.show()

## Tune k_best

In [16]:
# switch the self-training criterion to k_best and sweep k_best itself
param_tune = dict(criterion=['k_best'], k_best=[1, 3, 10, 30, 100, 300, 1000])

st_tune_kb_scores_df = tune_param(
    SelfTrainingClf, CLF, feature_df, SCORE_FCN, TARGET_COL, param_tune,
    LABEL_SIZES, n_workers=-1,
)

Tuning k_best


In [17]:
# Test score per k_best value, one line per label size. The score column is
# referenced through the shared TEST_SCORE constant (as in every other plot of
# this notebook) instead of a hard-coded column name.
plot_scores(st_tune_kb_scores_df, 'k_best', TEST_SCORE, 'label_size').show()

In [18]:
# Reduce the k_best tuning results to the best row per label size (selection
# is done by get_best_score -- presumably on the validation score; confirm in
# tuning.py).
st_kb_best_df = get_best_score(st_tune_kb_scores_df)

In [19]:
# test score of the baseline and all self-training variants: default
# parameters, tuned threshold and tuned k_best
combine_df = pd.concat([
    baseline_scores_df,
    st_base_scores_df.assign(model='ST Default'),
    st_best_scores_df.assign(model='ST Thres Tuned'),
    st_kb_best_df.assign(model='ST KB Tuned'),
])

fig = plot_scores(combine_df, 'label_size', TEST_SCORE, 'model', xaxis_type='log')
fig.show()

# Label Propagation

In [20]:
# label-size sweep with the label propagation wrapper, no extra parameters
lp_base_res = [
    LabelPropagationClf(feature_df, CLF, SCORE_FCN, TARGET_COL).fit_score(label_size)
    for label_size in LABEL_SIZES
]
lp_base_df = pd.DataFrame(lp_base_res)


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide



In [21]:
# baseline vs. label propagation (default parameters), test score
plot_df = pd.concat([baseline_scores_df, lp_base_df])
fig = plot_scores(plot_df, 'label_size', TEST_SCORE, 'model')
fig.show()

In [22]:
# keep the label size fixed and vary only rbf_size to see its effect on the
# reported scores and timings
label_size = 30000
lp_time_res = [
    LabelPropagationClf(
        feature_df, CLF, SCORE_FCN, TARGET_COL, rbf_size=rbf_size
    ).fit_score(label_size)
    for rbf_size in (100, 300, 1000, 3000, 10000, 30000)
]
lp_time_df = pd.DataFrame(lp_time_res)
lp_time_df


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide



Unnamed: 0,model,label_size,kernel,gamma,n_neighbors,rbf_size,train_score,val_score,test_score,fit_time,score_time,model_size
0,LabelPropagation,30000,rbf,20.0,7,100,0.511076,0.261174,0.295054,3.168059,0.164802,21280
1,LabelPropagation,30000,rbf,20.0,7,300,0.570969,0.343972,0.37659,3.633974,0.168086,50080
2,LabelPropagation,30000,rbf,20.0,7,1000,0.701046,0.545016,0.576953,5.730567,0.184309,150880
3,LabelPropagation,30000,rbf,20.0,7,3000,0.768863,0.65155,0.686204,9.981,0.164383,438880
4,LabelPropagation,30000,rbf,20.0,7,10000,0.797091,0.703894,0.737063,52.162258,0.259149,1446880
5,LabelPropagation,30000,rbf,20.0,7,30000,0.810634,0.742075,0.76776,399.553809,0.297265,4326880


In [23]:
# fit time of label propagation as a function of rbf_size
fig = plot_scores(lp_time_df, 'rbf_size', 'fit_time', 'model')
# set y-axis to log scale (the fit times in the table above range from a few
# seconds to several hundred seconds)
fig.update_yaxes(type='log')
fig.show()

## Tune RBF kernel

In [24]:
# RBF kernel: sweep gamma and rbf_size
param_tune = {
    'kernel': ['rbf'],
    'gamma': [0.01, 0.03, 0.1, 0.3, 1, 3],
    'rbf_size': [100, 300, 1000, 3000, 10000],
}
lp_rbf_tune_df = tune_param(
    LabelPropagationClf, CLF, feature_df, SCORE_FCN, TARGET_COL, param_tune,
    LABEL_SIZES, n_workers=1,
)

Tuning gamma



invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid val

Tuning rbf_size



invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid val

In [25]:
# test score per gamma value, restricted to the rows of the gamma sweep
plot_df = lp_rbf_tune_df.query('param_tuned == "gamma"')
fig = plot_scores(plot_df, 'gamma', TEST_SCORE, 'label_size')
fig.show()

In [26]:
# test score per rbf_size value, restricted to the rows of the rbf_size sweep
plot_df = lp_rbf_tune_df.query('param_tuned == "rbf_size"')
fig = plot_scores(plot_df, 'rbf_size', TEST_SCORE, 'label_size')
fig.show()

In [27]:
# reported model size per rbf_size value, same rows as the rbf_size sweep
plot_df = lp_rbf_tune_df.query('param_tuned == "rbf_size"')
fig = plot_scores(plot_df, 'rbf_size', 'model_size', 'label_size')
fig.show()

In [28]:
# Reduce the RBF tuning results to the best parameter of each training (label)
# size. The selection itself happens inside get_best_score -- presumably on
# the validation score; confirm in tuning.py.
lp_rbf_best_df = get_best_score(lp_rbf_tune_df)

## Tune KNN kernel

In [29]:
# KNN kernel: sweep the number of neighbors
param_tune = {
    'kernel': ['knn'],
    'n_neighbors': [1, 3, 10, 30, 100, 300],
}
lp_knn_tune_df = tune_param(
    LabelPropagationClf, CLF, feature_df, SCORE_FCN, TARGET_COL, param_tune,
    LABEL_SIZES, n_workers=-1,
)

Tuning n_neighbors


In [30]:
# test score per n_neighbors value, one line per label size
fig = plot_scores(lp_knn_tune_df, 'n_neighbors', TEST_SCORE, 'label_size')
fig.show()

In [31]:
# Reduce the KNN tuning results to the best row per label size (selection is
# done by get_best_score -- presumably on the validation score; confirm in
# tuning.py).
lp_knn_best_df = get_best_score(lp_knn_tune_df)

In [32]:
# test score of the baseline and all label propagation variants: default
# parameters, tuned RBF kernel and tuned KNN kernel
combine_df = pd.concat([
    baseline_scores_df,
    lp_base_df.assign(model='LP Base'),
    lp_rbf_best_df.assign(model='LP RBF Tuned'),
    lp_knn_best_df.assign(model='LP KNN Tuned'),
])
fig = plot_scores(combine_df, 'label_size', TEST_SCORE, 'model')
fig.show()

# Label Spreading

In [33]:
# label-size sweep with the label spreading wrapper, no extra parameters
ls_base_res = [
    LabelSpreadingClf(feature_df, CLF, SCORE_FCN, TARGET_COL).fit_score(label_size)
    for label_size in LABEL_SIZES
]
ls_base_df = pd.DataFrame(ls_base_res)


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide



In [34]:
# baseline vs. label propagation vs. label spreading (default parameters)
plot_df = pd.concat([baseline_scores_df, lp_base_df, ls_base_df])
fig = plot_scores(plot_df, 'label_size', TEST_SCORE, 'model')
fig.show()

## Tune RBF kernel

In [35]:
# RBF kernel: sweep gamma, alpha and rbf_size
param_tune = {
    'kernel': ['rbf'],
    'gamma': [0.01, 0.03, 0.1, 0.3, 1, 3],
    'alpha': [0.1, 0.3, 0.5, 0.7, 0.9],
    'rbf_size': [100, 300, 1000, 3000, 10000],
}
ls_rbf_tune_df = tune_param(
    LabelSpreadingClf, CLF, feature_df, SCORE_FCN, TARGET_COL, param_tune,
    LABEL_SIZES, n_workers=1,
)

Tuning gamma



invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid val

Tuning alpha



invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid val

Tuning rbf_size



invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


max_iter=30 was reached without convergence.


invalid value encountered in divide


max_iter=30 was reached without convergence.


invalid value encountered in divide


max_iter=30 was reached without convergence.


invalid value encountered in divide


max_iter=30 was reached without convergence.


invalid value encountered in divide


max_iter=30 w

In [36]:
# test score per gamma value, restricted to the rows of the gamma sweep
plot_df = ls_rbf_tune_df.query('param_tuned=="gamma"')
fig = plot_scores(plot_df, 'gamma', TEST_SCORE, 'label_size')
fig.show()

In [37]:
# test score per alpha value (linear x-axis), rows of the alpha sweep only
plot_df = ls_rbf_tune_df.query('param_tuned=="alpha"')
fig = plot_scores(plot_df, 'alpha', TEST_SCORE, 'label_size', xaxis_type='linear')
fig.show()

In [38]:
# test score per rbf_size value, restricted to the rows of the rbf_size sweep
plot_df = ls_rbf_tune_df.query('param_tuned=="rbf_size"')
fig = plot_scores(plot_df, 'rbf_size', TEST_SCORE, 'label_size')
fig.show()

In [39]:
# Reduce the RBF tuning results to the best row per label size (selection is
# done by get_best_score -- presumably on the validation score; confirm in
# tuning.py).
ls_rbf_best_df = get_best_score(ls_rbf_tune_df)

In [40]:
# test score of the baseline, label spreading with default parameters and
# label spreading with tuned RBF kernel
combine_df = pd.concat([
    baseline_scores_df,
    ls_base_df.assign(model='LS Base'),
    ls_rbf_best_df.assign(model='LS RBF Tuned'),
])
fig = plot_scores(combine_df, 'label_size', TEST_SCORE, 'model')
fig.show()

## Tune KNN kernel

In [41]:
# KNN kernel: sweep the number of neighbors and alpha
param_tune = {
    'kernel': ['knn'],
    'n_neighbors': [1, 3, 10, 30, 100, 300],
    'alpha': [0.1, 0.3, 0.5, 0.7, 0.9],
}
ls_knn_tune_df = tune_param(
    LabelSpreadingClf, CLF, feature_df, SCORE_FCN, TARGET_COL, param_tune,
    LABEL_SIZES, n_workers=-1,
)

Tuning n_neighbors
Tuning alpha


In [42]:
# test score per n_neighbors value, rows of the n_neighbors sweep only
plot_df = ls_knn_tune_df.query('param_tuned=="n_neighbors"')
fig = plot_scores(plot_df, 'n_neighbors', TEST_SCORE, 'label_size')
fig.show()

In [43]:
# test score per alpha value (linear x-axis), rows of the alpha sweep only
plot_df = ls_knn_tune_df.query('param_tuned=="alpha"')
fig = plot_scores(plot_df, 'alpha', TEST_SCORE, 'label_size', xaxis_type='linear')
fig.show()

In [44]:
# Reduce the KNN tuning results to the best row per label size (selection is
# done by get_best_score -- presumably on the validation score; confirm in
# tuning.py).
ls_knn_best_df = get_best_score(ls_knn_tune_df)

In [45]:
# test score of the baseline and all label spreading variants: default
# parameters, tuned RBF kernel and tuned KNN kernel
combine_df = pd.concat([
    baseline_scores_df,
    ls_base_df.assign(model='LS Base'),
    ls_rbf_best_df.assign(model='LS RBF Tuned'),
    ls_knn_best_df.assign(model='LS KNN Tuned'),
])
fig = plot_scores(combine_df, 'label_size', TEST_SCORE, 'model')
fig.show()

# Compare tuned Models

In [46]:
# For each semi-supervised method, pool the best rows of its tuning runs and
# let get_best_score pick the overall best per label size (original note:
# "get best val scores of each model").
st_best_df, lp_best_df, ls_best_df = (
    get_best_score(pd.concat(candidates, ignore_index=True))
    for candidates in (
        [st_best_scores_df, st_kb_best_df],
        [lp_rbf_best_df, lp_knn_best_df],
        [ls_rbf_best_df, ls_knn_best_df],
    )
)

# stack the baseline and the three tuned methods into one table
combine_best_df = pd.concat(
    [baseline_scores_df, st_best_df, lp_best_df, ls_best_df],
    ignore_index=True,
)

In [47]:
# final comparison: test score of the baseline and each tuned method
fig = plot_scores(combine_best_df, 'label_size', TEST_SCORE, 'model')
fig.show()