In [None]:
import sys
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from cuml.linear_model import LogisticRegression as cuMLLogisticRegression

print('NOVA_HOME is at', os.getenv('NOVA_HOME'))
sys.path.insert(1, os.getenv('NOVA_HOME'))
%load_ext autoreload
%autoreload 2

from utils import *

In [None]:
# Settings handed to the baseline runners below to locate and load embeddings.
# NOTE(review): the embeddings path is an absolute, cluster-specific location;
# the notebook will only run where this directory exists.
dataset_config = dict(
    path_to_embeddings=(
        "/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/"
        "finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen"
    ),
    multiplexed=False,
    # Template containing a `{batch}` placeholder — presumably filled in with
    # each batch ID by the loader; confirm against `utils`.
    config_fmt="newNeuronsD8FigureConfig_UMAP1_B{batch}",
    # Relative path — presumably resolved against NOVA_HOME; TODO confirm.
    config_dir="manuscript/manuscript_figures_data_config",
)

### Leave-one-out (LOO) cross-validation across batches

In [None]:
# Leave-one-out baseline: with both *_specific_batches arguments left as None
# the runner falls back to its default LOOCV over the listed batches.
run_baseline_model(
    # --- data ---
    dataset_config=dataset_config,   # paths / loading settings for the embeddings
    batches=[1, 2, 3, 7, 8, 9],      # batch IDs taking part in the experiment
    label_map=None,                  # optional label merge/remap, e.g. {"WT":0,"KO":1}

    # --- preprocessing (everything switched off for the plain baseline) ---
    balance=False,                   # balance class distributions during training?
    norm=False,                      # normalize features before training?
    choose_features=False,           # keep only top-ranked features?
    top_k=100,                       # feature count, used only when choose_features=True
    apply_pca=False,                 # reduce dimensionality with PCA?
    pca_components=50,               # component count, used only when apply_pca=True

    # --- model ---
    classifier_class=cuMLLogisticRegression,  # any sklearn/cuML-compatible estimator class
    classifier_kwargs={},                     # extra constructor arguments for the estimator

    # --- folds and output ---
    test_specific_batches=None,      # int or list of test-fold batches; None = default LOOCV
    train_specific_batches=None,     # int or list of training batches; None = complement of test
    return_proba=False,              # True: also return a DataFrame of predicted probabilities
)


### Train on batch 1 and test on the remaining batches

In [None]:
# Baseline with training restricted to batch 1.
# test_specific_batches, apply_pca, pca_components and return_proba are not
# passed here, so the runner's own defaults apply to them.
run_baseline_model(
    dataset_config=dataset_config,
    batches=[1, 2, 3, 7, 8, 9],
    train_specific_batches=[1],      # fit on batch 1 only
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
    label_map=None,
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
)

### Random 80/20 training and testing

In [None]:
# Baseline on a train/test split of the pooled batches (instead of per-batch folds).
run_train_test_split_baseline(
    dataset_config,                  # paths / loading settings for the embeddings
    batches=[1, 2, 3, 7, 8, 9],      # batch IDs loaded and combined into one dataset

    # --- preprocessing (everything switched off for the plain baseline) ---
    balance=False,                   # balance class distributions during training?
    norm=False,                      # normalize features before training?
    choose_features=False,           # keep only top-ranked features?
    top_k=100,                       # feature count, used only when choose_features=True
    apply_pca=False,                 # reduce dimensionality with PCA?
    pca_components=50,               # component count, used only when apply_pca=True

    # --- model and output ---
    classifier_class=cuMLLogisticRegression,  # any sklearn/cuML-compatible estimator class
    classifier_kwargs={},                     # extra constructor arguments for the estimator
    return_proba=False,                       # True: also return predicted probabilities
)