In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from gridsearch_utils import *

In [9]:
import IPython

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
df_labeled = pd.read_csv("labeled_examples.csv")
df_unlabeled = pd.read_csv("unlabeled_sample01_cellpose.csv")

# we remove the centroid displacement features as they're bugged (for now)
bugged_features = [
    'PhaseSTDLocalCentroidDisplacement',
    'AmplitudeSTDLocalCentroidDisplacement',
]
df_labeled = df_labeled.drop(columns=bugged_features)
df_unlabeled = df_unlabeled.drop(columns=bugged_features)

# Labels are stored as the text of a bytes repr (e.g. "b'oof'"). Remove only that
# wrapper: an optional leading b, the opening quote and the closing quote.
# str.strip("b'") is not used here because it treats its argument as a *set* of
# characters and would also eat a genuine leading/trailing "b" from a label.
df_labeled['Labels'] = df_labeled['Labels'].str.replace(
    r"^b?'(.*)'$", r"\1", regex=True
)

# Feature matrix is every remaining column; the target is the cleaned label.
X = df_labeled.drop("Labels", axis=1)
y = df_labeled['Labels']

In [11]:
# Stratified hold-out split: 25% of the labeled rows go to the test set, the class
# proportions of y are kept in both parts, and the seed is fixed.
split_options = dict(test_size=0.25, random_state=22, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [12]:
# Build the column transformer that every grid search below receives as `ct`.
# NOTE(review): this is given the full X / y rather than X_train / y_train. If
# create_column_transformer fits anything on its inputs, the held-out rows leak
# into preprocessing — confirm against gridsearch_utils.
ct = create_column_transformer(X, y)

# OOF Binary Classifier

In [13]:
# One-vs-rest targets for the OOF task: 1 where the label equals 'oof', else 0.
y_train_binary_oof, y_test_binary_oof = (
    np.where(split_labels == 'oof', 1, 0) for split_labels in (y_train, y_test)
)

## Logistic Regression

In [None]:
# Arguments shared by both searches and by both evaluations in this cell.
oof_fit_args = (ct, X_train, y_train_binary_oof)
oof_eval_args = (X_train, X_test, y_train_binary_oof, y_test_binary_oof)

# Logistic-regression search with 'AVG_PR' as the metric argument (presumably the
# model-selection metric — confirm in gridsearch_utils); keep and save the winner.
oof_logreg_prauc = binary_logreg_gridsearch(*oof_fit_args, 'AVG_PR')
oof_logreg_prauc_model = oof_logreg_prauc.best_estimator_
save_model(oof_logreg_prauc_model, "models", "oof_logreg_prauc.pkl")

# Same search again, this time with 'FHALF' as the metric argument.
oof_logreg_fhalf = binary_logreg_gridsearch(*oof_fit_args, 'FHALF')
oof_logreg_fhalf_model = oof_logreg_fhalf.best_estimator_
save_model(oof_logreg_fhalf_model, "models", "oof_logreg_fhalf.pkl")

# Evaluate both selected estimators against the train and held-out OOF targets.
evaluate_model(oof_logreg_prauc_model, *oof_eval_args)
evaluate_model(oof_logreg_fhalf_model, *oof_eval_args)

Fitting 25 folds for each of 1080 candidates, totalling 27000 fits
[CV 1/25] END classifier__C=0.001, classifier__class_weight=None, classifier__penalty=l1, column_transform=None, feature_selection__k=5, oversample=SMOTE(k_neighbors=3, random_state=42), scaler=None, undersample=RandomUnderSampler(random_state=42); ACC: (train=0.091, test=0.118) AVG_PR: (train=0.091, test=0.118) BACC: (train=0.500, test=0.500) F1: (train=0.167, test=0.211) FHALF: (train=0.111, test=0.143) RE: (train=1.000, test=1.000) total time=   0.1s
[CV 17/25] END classifier__C=0.001, classifier__class_weight=None, classifier__penalty=l1, column_transform=None, feature_selection__k=5, oversample=SMOTE(k_neighbors=3, random_state=42), scaler=None, undersample=RandomUnderSampler(random_state=42); ACC: (train=0.091, test=0.118) AVG_PR: (train=0.091, test=0.118) BACC: (train=0.500, test=0.500) F1: (train=0.167, test=0.211) FHALF: (train=0.111, test=0.143) RE: (train=1.000, test=1.000) total time=   0.0s
[CV 19/25] END c

## XGBoost

In [None]:
# Arguments shared by both searches and by both evaluations in this cell.
oof_fit_args = (ct, X_train, y_train_binary_oof)
oof_eval_args = (X_train, X_test, y_train_binary_oof, y_test_binary_oof)

# XGBoost search with 'AVG_PR' as the metric argument (presumably the
# model-selection metric — confirm in gridsearch_utils); keep and save the winner.
oof_xgb_prauc = binary_xgboost_gridsearch(*oof_fit_args, 'AVG_PR')
oof_xgb_prauc_model = oof_xgb_prauc.best_estimator_
save_model(oof_xgb_prauc_model, "models", "oof_xgb_prauc.pkl")

# Same search again, this time with 'FHALF' as the metric argument.
oof_xgb_fhalf = binary_xgboost_gridsearch(*oof_fit_args, 'FHALF')
oof_xgb_fhalf_model = oof_xgb_fhalf.best_estimator_
save_model(oof_xgb_fhalf_model, "models", "oof_xgb_fhalf.pkl")

# Evaluate both selected estimators against the train and held-out OOF targets.
evaluate_model(oof_xgb_prauc_model, *oof_eval_args)
evaluate_model(oof_xgb_fhalf_model, *oof_eval_args)

# AGG Binary Classifier

In [None]:
# One-vs-rest targets for the AGG task: 1 where the label equals 'agg', else 0.
y_train_binary_agg, y_test_binary_agg = (
    np.where(split_labels == 'agg', 1, 0) for split_labels in (y_train, y_test)
)

## Logistic Regression

In [None]:
# Arguments shared by both searches and by both evaluations in this cell.
agg_fit_args = (ct, X_train, y_train_binary_agg)
agg_eval_args = (X_train, X_test, y_train_binary_agg, y_test_binary_agg)

# Logistic-regression search with 'AVG_PR' as the metric argument (presumably the
# model-selection metric — confirm in gridsearch_utils); keep and save the winner.
agg_logreg_prauc = binary_logreg_gridsearch(*agg_fit_args, 'AVG_PR')
agg_logreg_prauc_model = agg_logreg_prauc.best_estimator_
save_model(agg_logreg_prauc_model, "models", "agg_logreg_prauc.pkl")

# Same search again, this time with 'FHALF' as the metric argument.
agg_logreg_fhalf = binary_logreg_gridsearch(*agg_fit_args, 'FHALF')
agg_logreg_fhalf_model = agg_logreg_fhalf.best_estimator_
save_model(agg_logreg_fhalf_model, "models", "agg_logreg_fhalf.pkl")

# Evaluate both selected estimators against the train and held-out AGG targets.
evaluate_model(agg_logreg_prauc_model, *agg_eval_args)
evaluate_model(agg_logreg_fhalf_model, *agg_eval_args)

## XGBoost

In [None]:
# Arguments shared by both searches and by both evaluations in this cell. Naming
# the evaluation targets once keeps them tied to the AGG task: the previous
# version passed y_train_binary / y_test_binary, which are not defined anywhere
# in this notebook (NameError on a fresh kernel, or stale labels from an old one).
agg_fit_args = (ct, X_train, y_train_binary_agg)
agg_eval_args = (X_train, X_test, y_train_binary_agg, y_test_binary_agg)

# XGBoost search with 'AVG_PR' as the metric argument (presumably the
# model-selection metric — confirm in gridsearch_utils); keep and save the winner.
agg_xgb_prauc = binary_xgboost_gridsearch(*agg_fit_args, 'AVG_PR')
agg_xgb_prauc_model = agg_xgb_prauc.best_estimator_
save_model(agg_xgb_prauc_model, "models", "agg_xgb_prauc.pkl")

# Same search again, this time with 'FHALF' as the metric argument.
agg_xgb_fhalf = binary_xgboost_gridsearch(*agg_fit_args, 'FHALF')
agg_xgb_fhalf_model = agg_xgb_fhalf.best_estimator_
save_model(agg_xgb_fhalf_model, "models", "agg_xgb_fhalf.pkl")

# Evaluate both selected estimators against the train and held-out AGG targets.
evaluate_model(agg_xgb_prauc_model, *agg_eval_args)
evaluate_model(agg_xgb_fhalf_model, *agg_eval_args)

# Cell Multiclass Classifier

In [None]:
# Restrict both splits to the rows whose label is 'plt', 'wbc' or 'rbc'.
cell_classes = ['plt', 'wbc', 'rbc']
train_is_cell = y_train.isin(cell_classes)
test_is_cell = y_test.isin(cell_classes)

X_train_cell, y_train_cell = X_train[train_is_cell], y_train[train_is_cell]
X_test_cell, y_test_cell = X_test[test_is_cell], y_test[test_is_cell]

## Logistic Regression

In [None]:
# Multiclass logistic-regression search on the plt/wbc/rbc training subset, with
# 'F1_MACRO' as the metric argument (presumably the model-selection metric —
# confirm in gridsearch_utils).
cell_logreg_f1macro = multiclass_logreg_gridsearch(ct, X_train_cell, y_train_cell, 'F1_MACRO')
# Keep the winning estimator from the search and hand it to save_model.
cell_logreg_f1macro_model = cell_logreg_f1macro.best_estimator_
save_model(cell_logreg_f1macro_model, "models", "cell_logreg_f1macro.pkl")
# NOTE(review): unlike the binary sections, this cell never scores the model on
# X_test_cell / y_test_cell — confirm whether that is intentional.

## XGBoost

In [None]:
# Multiclass XGBoost search on the plt/wbc/rbc training subset, with 'F1_MACRO'
# as the metric argument (presumably the model-selection metric — confirm in
# gridsearch_utils).
# NOTE(review): y_train_cell holds string labels; verify that the helper encodes
# them if the installed XGBoost version requires integer class ids.
cell_xgb_f1macro = multiclass_xgboost_gridsearch(ct, X_train_cell, y_train_cell, 'F1_MACRO')
# Keep the winning estimator from the search and hand it to save_model.
cell_xgb_f1macro_model = cell_xgb_f1macro.best_estimator_
save_model(cell_xgb_f1macro_model, "models", "cell_xgb_f1macro.pkl")