In [None]:
import sys
import os
import time
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from cuml.linear_model import LogisticRegression as cuMLLogisticRegression


# Resolve the NOVA repository root from the environment so project modules can be imported.
nova_home = os.getenv('NOVA_HOME')
print('NOVA_HOME is at', nova_home)
if nova_home:
    # Inserted at index 1 so that the existing first sys.path entry keeps priority.
    sys.path.insert(1, nova_home)
else:
    # A None entry on sys.path is ignored by the import system and would only show up
    # later as an unrelated-looking import error, so report the real cause here instead.
    print('WARNING: NOVA_HOME is not set; project imports may fail')
%load_ext autoreload
%autoreload 2

from utils import *

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import itertools
from cuml.linear_model import LogisticRegression as cuLogisticRegression

In [None]:
# Settings for the fine-tuned ViT embeddings (passed as `dataset_config` to the run_* helpers).
dataset_config = dict(
    path_to_embeddings="/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen",
    multiplexed=False,
    # "{batch}" is a placeholder left in the template string
    config_fmt="newNeuronsD8FigureConfig_UMAP1_B{batch}",
    config_dir="manuscript/manuscript_figures_data_config",
)

In [None]:
## Baseline
# Fine-tuned embeddings with cuML logistic regression at its default settings;
# the balance / norm / choose_features switches are all off.
run_args = dict(
    dataset_config=dataset_config,
    batches=[1, 2, 3, 7, 8, 9],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
)
run_baseline_model(**run_args)

In [None]:
## Baseline
# Same settings as the plain baseline, but with train_specific_batches=[1]
# and the results written to 'classification_results-indi.csv'.
run_args = dict(
    dataset_config=dataset_config,
    batches=[1, 2, 3, 7, 8, 9],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
    train_specific_batches=[1],
    results_csv='classification_results-indi.csv',
)
run_baseline_model(**run_args)

In [None]:
# (estimator class, constructor keyword arguments) pairs that the comparison loops iterate over.
additional_classifiers = [
    (GaussianNB, dict()),
    (RidgeClassifier, dict()),
    (LinearSVC, dict(C=1.0, max_iter=1000, random_state=42)),
    (RandomForestClassifier, dict(n_estimators=300, random_state=42)),
    (
        ExtraTreesClassifier,
        dict(
            max_depth=None,
            min_samples_leaf=1,
            n_estimators=300,
            n_jobs=-1,
            random_state=42,
        ),
    ),
]

In [None]:
# Run every extra classifier on the fine-tuned embeddings with train_specific_batches=[1].
for clf_class, clf_kwargs in additional_classifiers:
    print(f"\n=== Running {clf_class.__name__} ===")
    # Built inside the loop so each call receives fresh list objects.
    call_args = {
        "dataset_config": dataset_config,
        "batches": [1, 2, 3, 7, 8, 9],
        "classifier_class": clf_class,
        "classifier_kwargs": clf_kwargs,
        "train_specific_batches": [1],
        "results_csv": "classification_results-indi.csv",
    }
    run_baseline_model(**call_args)

In [None]:
# Train/test-split baseline on the fine-tuned embeddings, batch 1 only.
split_args = dict(
    batches=[1],
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
)
run_train_test_split_baseline(dataset_config, **split_args)

In [None]:
# Train/test-split baseline on the fine-tuned embeddings, all six batches.
split_args = dict(
    batches=[1, 2, 3, 7, 8, 9],
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
)
run_train_test_split_baseline(dataset_config, **split_args)

## Cytoself

In [None]:
# Settings for the Cytoself model embeddings; only the embeddings path differs
# from the fine-tuned configuration.
Cytoself_dataset_config = dict(
    path_to_embeddings="/home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/",
    multiplexed=False,
    config_fmt="newNeuronsD8FigureConfig_UMAP1_B{batch}",
    config_dir="manuscript/manuscript_figures_data_config",
)

In [None]:
## Baseline
# Cytoself embeddings with cuML logistic regression at its default settings;
# the balance / norm / choose_features switches are all off.
run_args = dict(
    dataset_config=Cytoself_dataset_config,
    batches=[1, 2, 3, 7, 8, 9],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
)
run_baseline_model(**run_args)

In [None]:
## Baseline
# Cytoself embeddings with train_specific_batches=[1]; results are written to
# 'classification_results-indi.csv'.
run_args = dict(
    dataset_config=Cytoself_dataset_config,
    batches=[1, 2, 3, 7, 8, 9],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
    train_specific_batches=[1],
    results_csv='classification_results-indi.csv',
)
run_baseline_model(**run_args)

In [None]:
## Baseline
# Cytoself embeddings with train_specific_batches=[1] and apply_pca=True.
# No results_csv is passed here, so the helper's own default applies.
run_args = dict(
    dataset_config=Cytoself_dataset_config,
    batches=[1, 2, 3, 7, 8, 9],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
    train_specific_batches=[1],
    apply_pca=True,
)
run_baseline_model(**run_args)

In [None]:
# Train/test-split baseline on the Cytoself embeddings, batch 1 only.
split_args = dict(
    batches=[1],
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
)
run_train_test_split_baseline(Cytoself_dataset_config, **split_args)

In [None]:
# Train/test-split baseline on the Cytoself embeddings, all six batches.
split_args = dict(
    batches=[1, 2, 3, 7, 8, 9],
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
)
run_train_test_split_baseline(Cytoself_dataset_config, **split_args)

In [None]:
# Run every extra classifier on the Cytoself embeddings with train_specific_batches=[1].
for clf_class, clf_kwargs in additional_classifiers:
    print(f"\n=== Running {clf_class.__name__} ===")
    # Built inside the loop so each call receives fresh list objects.
    call_args = {
        "dataset_config": Cytoself_dataset_config,
        "batches": [1, 2, 3, 7, 8, 9],
        "classifier_class": clf_class,
        "classifier_kwargs": clf_kwargs,
        "train_specific_batches": [1],
        "results_csv": "classification_results-indi.csv",
    }
    run_baseline_model(**call_args)

## Pretrained Model

In [None]:
# Settings for the pretrained ViT embeddings; only the embeddings path differs
# from the fine-tuned configuration.
pretrained_dataset_config = dict(
    path_to_embeddings="/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model",
    multiplexed=False,
    config_fmt="newNeuronsD8FigureConfig_UMAP1_B{batch}",
    config_dir="manuscript/manuscript_figures_data_config",
)

In [None]:
## Baseline
# Pretrained embeddings with train_specific_batches=[1]; results are written to
# "classification_results-indi.csv".
run_args = dict(
    dataset_config=pretrained_dataset_config,
    batches=[1, 2, 3, 7, 8, 9],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
    train_specific_batches=[1],
    results_csv="classification_results-indi.csv",
)
run_baseline_model(**run_args)

In [None]:
# Run every extra classifier on the pretrained embeddings with train_specific_batches=[1].
for clf_class, clf_kwargs in additional_classifiers:
    print(f"\n=== Running {clf_class.__name__} ===")
    # Built inside the loop so each call receives fresh list objects.
    call_args = {
        "dataset_config": pretrained_dataset_config,
        "batches": [1, 2, 3, 7, 8, 9],
        "classifier_class": clf_class,
        "classifier_kwargs": clf_kwargs,
        "train_specific_batches": [1],
        "results_csv": "classification_results-indi.csv",
    }
    run_baseline_model(**call_args)

In [None]:
# Small grid search over cuML logistic-regression settings.
Cs        = [1.0, 3.0, 10.0, 30.0]     # weaker regularization for 200 features
balances  = [False]                    # add True to also sweep run_baseline_model's 'balance'
norms     = [False, True]              # swept values for run_baseline_model's 'norm'

# (score, settings) of the best run seen so far; stays None until a run yields a score.
best = None

# Only the fine-tuned configuration is searched; append Cytoself_dataset_config or
# pretrained_dataset_config to this list to sweep them as well.
for dataset in [dataset_config]:
    print(dataset)
    for C, bal, norm in itertools.product(Cs, balances, norms):
        print(C, bal, norm)
        clf_class = cuLogisticRegression
        clf_kwargs = dict(C=C)
        try:
            res = run_baseline_model(
                dataset_config=dataset,
                batches=[1, 2, 3, 7, 8, 9],
                classifier_class=clf_class,
                classifier_kwargs=clf_kwargs,
                train_specific_batches=[1],
                results_csv="classifier_test_linear_params.csv",
                norm=norm,
                balance=bal
            )

            if res is None:
                # The run itself succeeded; there is just nothing to compare.
                print('run returned no metrics; skipping score comparison')
                continue

            # Pick your metric (prefer macro F1 if available)
            score = res.get("f1_macro", res.get("accuracy"))
            if score is not None and (best is None or score > best[0]):
                best = (score, {"C": C, "balance": bal, "norm": norm})
        except Exception as exc:
            # Best-effort sweep: keep going, but report why this setting failed.
            # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
            print(f'failed: {type(exc).__name__}: {exc}')

    print("Best config:", best)

## Examine Features

In [None]:
# Load batch 1; these names are reused by the feature-inspection cells below.
X_train, y_train = load_batches([1])

# Integer-encode the labels (LabelEncoder is presumably scikit-learn's, whose fit()
# returns the encoder itself -- TODO confirm where the name is imported from).
le = LabelEncoder()
y_encoded = le.fit(y_train).transform(y_train)

# One F-score and one p-value per column of X_train.
f_scores, p_values = f_classif(X_train, y_encoded)

In [None]:
top_n = 100

# np.argsort places NaN last, so a feature with an undefined F-score (e.g. a constant
# column) would otherwise be ranked as the "best". Push NaNs to the bottom instead.
rankable_scores = np.where(np.isnan(f_scores), -np.inf, f_scores)

# Ascending order: the strongest feature is the LAST entry of top_idx.
top_idx = np.argsort(rankable_scores)[-top_n:]

# Fewer than top_n features may exist; size the axis from what was actually selected
# so the bar positions and heights always have matching lengths.
n_shown = len(top_idx)

plt.figure(figsize=(10, 5))
plt.bar(range(n_shown), f_scores[top_idx])
plt.xticks(range(n_shown), top_idx, rotation=45)
plt.ylabel("F-score")
plt.xlabel("Embedding dimension")
plt.title("Top correlated embedding features with labels")
plt.tight_layout()
plt.show()

In [None]:
def get_top_features_for_batch(batch, top_k=200):
    """Return the indices of the ``top_k`` highest F-scoring features of one batch.

    Args:
        batch: Batch identifier; passed to ``load_batches`` as a one-element list.
        top_k: Number of feature indices to keep. Values <= 0 give an empty set.

    Returns:
        A set of feature (column) indices. It holds fewer than ``top_k`` entries
        when the batch has fewer features than that.
    """
    if top_k <= 0:
        # A slice of [-0:] would select every feature rather than none.
        return set()

    X, y = load_batches([batch])
    y_encoded = LabelEncoder().fit_transform(y)
    f_scores, _ = f_classif(X, y_encoded)

    # np.argsort places NaN last, so features with an undefined F-score would be
    # picked as the "best" ones; rank them at the bottom instead.
    rankable_scores = np.where(np.isnan(f_scores), -np.inf, f_scores)
    return set(np.argsort(rankable_scores)[-top_k:])

batches = [1, 2, 3, 7, 8, 9,]
top_k = 100

# Get top features per batch
batch_feature_map = {batch: get_top_features_for_batch(batch, top_k) for batch in batches}

# Create overlap matrix. Start from an all-zero integer frame so every cell is a real
# integer from the outset instead of a NaN placeholder that is filled in later.
overlap_matrix = pd.DataFrame(0, index=batches, columns=batches, dtype=int)

# itertools is imported by this notebook; the bare name `combinations` is not.
for b1, b2 in itertools.combinations(batches, 2):
    overlap = len(batch_feature_map[b1] & batch_feature_map[b2])
    overlap_matrix.loc[b1, b2] = overlap
    overlap_matrix.loc[b2, b1] = overlap
for b in batches:
    # Self-overlap is the size of the batch's own set, which can be below top_k
    # when the batch has fewer features than requested.
    overlap_matrix.loc[b, b] = len(batch_feature_map[b])

# Plot
plt.figure(figsize=(8, 6))
sns.heatmap(overlap_matrix.astype(int), annot=True, fmt='d', cmap='Blues')
plt.title(f'Overlap of Top {top_k} Features Across Batches')
plt.xlabel("Batch")
plt.ylabel("Batch")
plt.tight_layout()
plt.show()

In [None]:
# Rank all features by F-score, strongest first. NaN scores are pushed to the bottom
# because np.argsort would otherwise place them at the top of a descending ranking.
# (top_idx is sorted ascending, so its first two entries are NOT the two best features.)
ranked_features = np.argsort(np.where(np.isnan(f_scores), -np.inf, f_scores))[::-1]
feat1, feat2 = ranked_features[:2]
X_vis = X_train[:, [feat1, feat2]]

# Convert encoded labels back to original strings
labels_str = le.inverse_transform(y_encoded)

# Create a scatter plot, one colour per label
plt.figure(figsize=(8, 6))
for label in np.unique(labels_str):
    idx = labels_str == label
    plt.scatter(X_vis[idx, 0], X_vis[idx, 1], label=label, alpha=0.7)

plt.xlabel(f'Feature {feat1}')
plt.ylabel(f'Feature {feat2}')
plt.title('Top 2 Embeddings by Label')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()


In [None]:
from sklearn.decomposition import PCA
import umap

In [None]:
# 2-D projection of the selected feature columns, coloured by label.
use_pca = True  # True -> PCA, False -> UMAP

# Columns of X_train named by (at most) the first 100 entries of top_idx
X_top = X_train[:, top_idx[:100]]

# Original label values, used for grouping and for the legend
labels_str = le.inverse_transform(y_encoded)

# Build the chosen reducer and project to two components
reducer = (
    PCA(n_components=2, random_state=42)
    if use_pca
    else umap.UMAP(n_components=2, random_state=42)
)
X_embedded = reducer.fit_transform(X_top)

# One scatter series per distinct label
plt.figure(figsize=(8, 6))
for label in np.unique(labels_str):
    idx = labels_str == label
    points = X_embedded[idx]
    plt.scatter(points[:, 0], points[:, 1], label=label, alpha=0.6)

method_name = "PCA" if use_pca else "UMAP"
plt.title("2D Projection of Top 100 Features using " + method_name)
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# 2-D projection of the selected feature columns, coloured by label.
use_pca = False  # True -> PCA, False -> UMAP

# Columns of X_train named by (at most) the first 100 entries of top_idx
X_top = X_train[:, top_idx[:100]]

# Original label values, used for grouping and for the legend
labels_str = le.inverse_transform(y_encoded)

# Build the chosen reducer and project to two components
reducer = (
    PCA(n_components=2, random_state=42)
    if use_pca
    else umap.UMAP(n_components=2, random_state=42)
)
X_embedded = reducer.fit_transform(X_top)

# One scatter series per distinct label
plt.figure(figsize=(8, 6))
for label in np.unique(labels_str):
    idx = labels_str == label
    points = X_embedded[idx]
    plt.scatter(points[:, 0], points[:, 1], label=label, alpha=0.6)

method_name = "PCA" if use_pca else "UMAP"
plt.title("2D Projection of Top 100 Features using " + method_name)
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# 1. Select the top 1000 features from X_train (or all of them if there are fewer).
# The ranking is taken from f_scores directly: top_idx only holds the top_n entries
# chosen for the bar plot, so slicing it with [:1000] cannot return more than that.
n_top_features = 1000
# NaN scores go to the bottom; np.argsort would otherwise rank them first when reversed.
ranked_features = np.argsort(np.where(np.isnan(f_scores), -np.inf, f_scores))[::-1]
X_top = X_train[:, ranked_features[:n_top_features]]

# 2. Recover the original labels for colouring
labels_str = le.inverse_transform(y_encoded)

# --- Option A: PCA ---
use_pca = True
if use_pca:
    reducer = PCA(n_components=2, random_state=42)
else:
    # --- Option B: UMAP ---
    reducer = umap.UMAP(n_components=2, random_state=42)

# 3. Reduce dimensions
X_embedded = reducer.fit_transform(X_top)

# 4. Plot
plt.figure(figsize=(8, 6))
for label in np.unique(labels_str):
    idx = labels_str == label
    plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=label, alpha=0.6)

# The title reports the number of columns actually projected.
plt.title(f"2D Projection of Top {X_top.shape[1]} Features using " + ("PCA" if use_pca else "UMAP"))
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# 1. Select the top 1000 features from X_train (or all of them if there are fewer).
# The ranking is taken from f_scores directly: top_idx only holds the top_n entries
# chosen for the bar plot, so slicing it with [:1000] cannot return more than that.
n_top_features = 1000
# NaN scores go to the bottom; np.argsort would otherwise rank them first when reversed.
ranked_features = np.argsort(np.where(np.isnan(f_scores), -np.inf, f_scores))[::-1]
X_top = X_train[:, ranked_features[:n_top_features]]

# 2. Recover the original labels for colouring
labels_str = le.inverse_transform(y_encoded)

# --- Option A: PCA ---
use_pca = False
if use_pca:
    reducer = PCA(n_components=2, random_state=42)
else:
    # --- Option B: UMAP ---
    reducer = umap.UMAP(n_components=2, random_state=42)

# 3. Reduce dimensions
X_embedded = reducer.fit_transform(X_top)

# 4. Plot
plt.figure(figsize=(8, 6))
for label in np.unique(labels_str):
    idx = labels_str == label
    plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=label, alpha=0.6)

# The title reports the number of columns actually projected.
plt.title(f"2D Projection of Top {X_top.shape[1]} Features using " + ("PCA" if use_pca else "UMAP"))
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# 1. Use every feature of X_train (no F-score selection in this cell)
X_top = X_train

# 2. Recover the original labels for colouring
labels_str = le.inverse_transform(y_encoded)

# --- Option A: PCA ---
use_pca = False
if use_pca:
    reducer = PCA(n_components=2, random_state=42)
else:
    # --- Option B: UMAP ---
    reducer = umap.UMAP(n_components=2, random_state=42)

# 3. Reduce dimensions
X_embedded = reducer.fit_transform(X_top)

# 4. Plot
plt.figure(figsize=(8, 6))
for label in np.unique(labels_str):
    idx = labels_str == label
    plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=label, alpha=0.6)

# The title states that all features were projected rather than a "top 100" subset.
plt.title(f"2D Projection of All {X_top.shape[1]} Features using " + ("PCA" if use_pca else "UMAP"))
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# 1. Select the top 10 features from X_train.
# top_idx is sorted ascending, so top_idx[:10] would be the ten WEAKEST entries of
# that list; rank from f_scores directly, strongest first, instead.
n_top_features = 10
# NaN scores go to the bottom; np.argsort would otherwise rank them first when reversed.
ranked_features = np.argsort(np.where(np.isnan(f_scores), -np.inf, f_scores))[::-1]
X_top = X_train[:, ranked_features[:n_top_features]]

# 2. Recover the original labels for colouring
labels_str = le.inverse_transform(y_encoded)

# --- Option A: PCA ---
use_pca = False
if use_pca:
    reducer = PCA(n_components=2, random_state=42)
else:
    # --- Option B: UMAP ---
    reducer = umap.UMAP(n_components=2, random_state=42)

# 3. Reduce dimensions
X_embedded = reducer.fit_transform(X_top)

# 4. Plot
plt.figure(figsize=(8, 6))
for label in np.unique(labels_str):
    idx = labels_str == label
    plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=label, alpha=0.6)

# The title reports the number of columns actually projected.
plt.title(f"2D Projection of Top {X_top.shape[1]} Features using " + ("PCA" if use_pca else "UMAP"))
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()