In [1]:
import os.path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from docopt import docopt
from joblib import load as joblib_load

from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_curve, precision_score, recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [58]:
# Load the per-model test F1 scores; the first CSV column becomes the index
# (model keys), leaving a single `test_f1_score` column.
test_scores_sheet_path = 'results/test_scores/test_f1_scores.csv'
test_scores_df = pd.read_csv(
    filepath_or_buffer=test_scores_sheet_path,
    index_col=0,
)
# Quick structural check: row count, column dtype, null counts.
test_scores_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, svc to knn
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   test_f1_score  5 non-null      float64
dtypes: float64(1)
memory usage: 80.0+ bytes


In [47]:
# Load the cross-validation summary table; its first column is used as the
# row index (one row per metric, one column per model).
cv_scores_sheet_path = 'results/cross_validation_results.csv'
cv_scores_df = pd.read_csv(
    filepath_or_buffer=cv_scores_sheet_path,
    index_col=0,
)

# Pull out the F1 row as a Series indexed by model name. Values are strings of
# the form "mean (+/- std)" and still need parsing.
# NOTE(review): the row is labelled 'test_f1' but is treated as the "train"
# score downstream — presumably it is the cross-validation fold score; confirm.
train_f1_scores_raw = cv_scores_df.loc['test_f1']
train_f1_scores_raw

Dummy                               0.000 (+/- 0.000)
RandomForestClassifier              0.476 (+/- 0.014)
RandomForestClassifier Optimized    0.477 (+/- 0.019)
kNN                                 0.430 (+/- 0.013)
kNN Optimized                       0.430 (+/- 0.013)
SCV                                 0.447 (+/- 0.021)
SCV Optimized                       0.000 (+/- 0.000)
Logistic Regression                 0.358 (+/- 0.011)
Logistic Regression Otimized        0.345 (+/- 0.010)
Name: test_f1, dtype: object

In [65]:
# Maps the model labels used in the cross-validation sheet (spelled exactly as
# they appear there, typos included) to the keys used in the test-score sheet.
model_names_map = {
    'Dummy': 'dummy_classifier',
    'SCV Optimized': 'svc',
    'kNN Optimized': 'knn',
    'Logistic Regression Otimized': 'logistic_regression',
    'RandomForestClassifier Optimized': 'random_forest'
}
model_names_list = [*model_names_map]

# Select the mapped models, keep only the mean from "mean (+/- std)", relabel
# the index to the shared model keys, and convert the strings to floats.
train_f1_scores_series = (
    train_f1_scores_raw
    .loc[model_names_list]
    .str.split(' ', expand=True)
    .iloc[:, 0]                      # first token is the mean score
    .rename('train_f1_score')        # scalar -> sets the Series name
    .rename(index=model_names_map)   # mapping -> relabels the index
    .astype(np.float64)
)
train_scores_df = train_f1_scores_series.to_frame()

In [66]:
# Display the parsed F1 scores (one row per model key, column `train_f1_score`).
train_scores_df

Unnamed: 0,train_f1_score
dummy_classifier,0.0
svc,0.0
knn,0.43
logistic_regression,0.345
random_forest,0.477


In [67]:
# Confirm the parsed scores are float64 with no missing values.
train_scores_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, dummy_classifier to random_forest
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   train_f1_score  5 non-null      float64
dtypes: float64(1)
memory usage: 80.0+ bytes


In [68]:
# Display the test F1 scores loaded from the CSV (indexed by model key).
test_scores_df

Unnamed: 0,test_f1_score
svc,0.0
logistic_regression,0.348529
dummy_classifier,0.0
random_forest,0.476865
knn,0.436105


In [69]:
# Structural summary of the test scores (same call as made right after loading).
test_scores_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, svc to knn
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   test_f1_score  5 non-null      float64
dtypes: float64(1)
memory usage: 80.0+ bytes


In [76]:
# Place the two score columns side by side. Rows are aligned on the model-key
# index, so the differing row order of the two frames does not matter.
train_test_scores_df = pd.concat(
    [train_scores_df, test_scores_df],
    axis='columns',
)
train_test_scores_df

Unnamed: 0,train_f1_score,test_f1_score
dummy_classifier,0.0,0.0
svc,0.0,0.0
knn,0.43,0.436105
logistic_regression,0.345,0.348529
random_forest,0.477,0.476865
