In [20]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
import pickle
import time
from xgboost import XGBClassifier
import pandas as pd
from imblearn.over_sampling import SMOTE
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from sklearn.preprocessing import LabelEncoder

In [44]:
TEXT_PATH = r"/scratch/vjh9526/bdml_2025/project/code/classifiers/data/combined_data_with_labels.csv"
RESULTS_PATH = r'/scratch/vjh9526/bdml_2025/project/code/classifiers/natural_modes/results_new_text.pkl'
BATCH_SIZE = 256

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [15]:
def load_text_labels(csv_path: str):
    print(f"[INFO] Loading text labels from {csv_path}")
    df = pd.read_csv(csv_path, delimiter=";")
    # assume the sentence is in column 2
    sentences = df.iloc[:, 2].astype(str).str.strip().str.lower()
    categories = df["category"].astype(str).str.strip()
    return dict(zip(sentences, categories))

In [27]:
# Load the BERT and RoBERTa models and tokenizers
bert_model_name = 'bert-base-uncased'
roberta_model_name = 'roberta-base'
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertModel.from_pretrained(bert_model_name)
roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_model_name)
roberta_model = RobertaModel.from_pretrained(roberta_model_name)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)
roberta_model.to(device)

# Function to get BERT embeddings
def get_bert_embeddings(sentences):
    inputs = bert_tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

# Function to get RoBERTa embeddings
def get_roberta_embeddings(sentences):
    inputs = roberta_tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = roberta_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
data = load_text_labels(TEXT_PATH)

sentences, labels = [],[] 
for item in data.items():
    sentences.append(item[0])
    labels.append(item[1])

[INFO] Loading text labels from /scratch/vjh9526/bdml_2025/project/code/classifiers/data/combined_data_with_labels.csv


In [29]:
# Get BERT and RoBERTa embeddings for the sentences, that is index 2 of the dataframe
bert_embeddings = get_bert_embeddings(sentences)
roberta_embeddings = get_roberta_embeddings(sentences)
# bert_embeddings.shape, roberta_embeddings.shape

# use standard scaler to scale the embeddings
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
bert_embeddings_scaled = scaler.fit_transform(bert_embeddings)
roberta_embeddings_scaled = scaler.fit_transform(roberta_embeddings)
bert_embeddings_scaled.shape, roberta_embeddings_scaled.shape

((365, 768), (365, 768))

In [30]:
# Split the data into training and testing sets
X_train_bert, X_test_bert, y_train, y_test = train_test_split(bert_embeddings, labels, test_size=0.2, random_state=42)
X_train_roberta, X_test_roberta, _, _ = train_test_split(roberta_embeddings, labels, test_size=0.2, random_state=42)

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

X_train_bert.shape, X_test_bert.shape, X_train_roberta.shape, X_test_roberta.shape

((292, 768), (73, 768), (292, 768), (73, 768))

In [22]:
del bert_embeddings, roberta_embeddings

import gc
gc.collect()

1797

In [23]:
import gc
gc.collect()

0

In [43]:
# ------------------- Classifier Training -------------------
from classifiers_eeg_embeds import SoftmaxClassifier, MLPClassifier, train_pytorch_model
from sklearn.tree           import DecisionTreeClassifier
from sklearn.linear_model  import LogisticRegression
from sklearn.naive_bayes   import GaussianNB
from sklearn.neighbors     import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics       import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
import time

embed_models = ["bert", "roberta"]
oversample = ["yes", "no"]

results = {}

for name in embed_models:
    for over in oversample:
        print(f"\n--- Training classifiers on {name} features ---")
        if name == "bert":
            X_tr = X_train_bert
            y_tr = y_train
            X_val = X_test_bert
            y_val = y_test
        else:
            X_tr = X_train_roberta
            y_tr = y_train
            X_val = X_test_roberta
            y_val = y_test

        if over == "yes":
            print(f"\n--- Using SMOTE on {name} features ---")
            smote = SMOTE(random_state=42)
            X_tr, y_tr = smote.fit_resample(X_tr, y_tr)
        
        num_classes = len(np.unique(y_tr))
        input_dim   = X_tr.shape[1]
        results[name] = {}
    
        print(f"Shape of training data is {X_tr.shape}")
        # --- scikit‑learn classifiers ---
        for clf_name_org, clf in {
            'dt'      : DecisionTreeClassifier(random_state=42),
            'logistic': LogisticRegression(
                             multi_class='multinomial',
                             max_iter=1000,
                             random_state=42
                         ),
            'nb'      : GaussianNB(),
            'knn'     : KNeighborsClassifier(n_neighbors=5)
        }.items():
            if over == "yes":
                clf_name = f"{clf_name_org}_SMOTE"
            else:
                clf_name = clf_name_org
            print(f"Training {clf_name} classifier...")
            t0 = time.time()
            clf.fit(X_tr, y_tr)
            preds = clf.predict(X_val)
            acc       = accuracy_score(y_val, preds)
            prec      = precision_score(y_val, preds, average='weighted', zero_division=0)
            rec       = recall_score(y_val, preds, average='weighted', zero_division=0)
            f1        = f1_score(y_val, preds, average='weighted', zero_division=0)
            print(f"[{name}-{clf_name}] acc={acc:.4f} precision={prec:.4f} "
                  f"recall={rec:.4f} f1={f1:.4f} time={time.time()-t0:.2f}s")
    
            results[name][clf_name] = {
                'accuracy' : acc,
                'precision': prec,
                'recall'   : rec,
                'f1'       : f1,
                # 'report'   : classification_report(
                #                  y_val, preds,
                #                  target_names=unique_labels,
                #                  zero_division=0
                #              ),
                'cm'       : confusion_matrix(y_val, preds)
            }
            del clf
    
        # --- PyTorch classifiers ---
        for clf_name, Model in {
            'softmax': SoftmaxClassifier,
            'mlp'    : lambda in_dim, n_cls: MLPClassifier(in_dim, [1024, 512], n_cls)
        }.items():
            print(f"Training {clf_name} classifier...")
            # instantiate
            if clf_name == 'softmax':
                model = Model(input_dim, num_classes)
            else:
                model = Model(input_dim, num_classes)
            pt = train_pytorch_model(
                model, X_tr, X_val, y_tr, y_val,
                batch_size=BATCH_SIZE,
                device=device,
                early_stopping=100
            )
            preds = pt['y_pred']
            acc   = pt['accuracy']
            prec  = precision_score(y_val, preds, average='weighted', zero_division=0)
            rec   = recall_score(y_val, preds, average='weighted', zero_division=0)
            f1_   = f1_score(y_val, preds, average='weighted', zero_division=0)
            print(f"[{name}-{clf_name}] acc={acc:.4f} precision={prec:.4f} "
                  f"recall={rec:.4f} f1={f1_:.4f}")
    
            results[name][clf_name] = {
                'accuracy'      : acc,
                'precision'     : prec,
                'recall'        : rec,
                'f1'            : f1_,
                'train_losses'  : pt['train_losses'],
                'val_accuracies': pt['val_accuracies'],
                # 'report'        : classification_report(
                #                       y_val, preds,
                #                       target_names=unique_labels,
                #                       zero_division=0
                #                   ),
                'cm'            : confusion_matrix(y_val, preds)
            }
            del model, pt
    
        # free memory before next extractor
        del X_tr, y_tr, X_val, y_val



--- Training classifiers on bert features ---

--- Using SMOTE on bert features ---
Shape of training data is (856, 768)
Training dt_SMOTE classifier...
[bert-dt_SMOTE] acc=0.3151 precision=0.3714 recall=0.3151 f1=0.3262 time=0.47s
Training logistic_SMOTE classifier...
[bert-logistic_SMOTE] acc=0.5616 precision=0.6040 recall=0.5616 f1=0.5729 time=0.17s
Training nb_SMOTE classifier...
[bert-nb_SMOTE] acc=0.5205 precision=0.4908 recall=0.5205 f1=0.4945 time=0.01s
Training knn_SMOTE classifier...
[bert-knn_SMOTE] acc=0.3014 precision=0.5793 recall=0.3014 f1=0.2727 time=0.01s
Training softmax classifier...
(856, 768)




Epoch 10, Loss: 1.2019, Accuracy: 0.4384
Epoch 20, Loss: 0.8077, Accuracy: 0.4932
Epoch 30, Loss: 0.6156, Accuracy: 0.5205
Epoch 40, Loss: 0.5094, Accuracy: 0.4932
Epoch 50, Loss: 0.4529, Accuracy: 0.4932
Epoch 60, Loss: 0.3849, Accuracy: 0.5068
Epoch 70, Loss: 0.3467, Accuracy: 0.5205
Epoch 80, Loss: 0.3152, Accuracy: 0.5205
Epoch 90, Loss: 0.2744, Accuracy: 0.5342
Epoch 100, Loss: 0.2532, Accuracy: 0.5205
[INFO] Best accuracy: 0.5479 at epoch 33
[bert-softmax] acc=0.5479 precision=0.5614 recall=0.5205 f1=0.5260
Training mlp classifier...
(856, 768)
Epoch 10, Loss: 0.1981, Accuracy: 0.5890
Epoch 20, Loss: 0.0216, Accuracy: 0.5616
Epoch 30, Loss: 0.0048, Accuracy: 0.5616
Epoch 40, Loss: 0.0028, Accuracy: 0.5479
Epoch 50, Loss: 0.0013, Accuracy: 0.5616
Epoch 60, Loss: 0.0008, Accuracy: 0.5479
Epoch 70, Loss: 0.0008, Accuracy: 0.5479
Epoch 80, Loss: 0.0006, Accuracy: 0.5616
Epoch 90, Loss: 0.0004, Accuracy: 0.5479
Epoch 100, Loss: 0.0004, Accuracy: 0.5479
[INFO] Best accuracy: 0.6027 at 



Epoch 20, Loss: 1.3204, Accuracy: 0.4795
Epoch 30, Loss: 1.1237, Accuracy: 0.4932
Epoch 40, Loss: 1.0168, Accuracy: 0.5205
Epoch 50, Loss: 0.9685, Accuracy: 0.5479
Epoch 60, Loss: 0.8738, Accuracy: 0.5616
Epoch 70, Loss: 0.7923, Accuracy: 0.5616
Epoch 80, Loss: 0.7383, Accuracy: 0.5616
Epoch 90, Loss: 0.6791, Accuracy: 0.5753
Epoch 100, Loss: 0.6921, Accuracy: 0.5890
[INFO] Best accuracy: 0.6027 at epoch 93
[bert-softmax] acc=0.6027 precision=0.5807 recall=0.5890 f1=0.5721
Training mlp classifier...
(292, 768)
Epoch 10, Loss: 1.0876, Accuracy: 0.4795
Epoch 20, Loss: 0.4190, Accuracy: 0.5890
Epoch 30, Loss: 0.1892, Accuracy: 0.5890
Epoch 40, Loss: 0.0417, Accuracy: 0.5616
Epoch 50, Loss: 0.0206, Accuracy: 0.5342
Epoch 60, Loss: 0.0075, Accuracy: 0.5479
Epoch 70, Loss: 0.0051, Accuracy: 0.5205
Epoch 80, Loss: 0.0037, Accuracy: 0.5479
Epoch 90, Loss: 0.0023, Accuracy: 0.5479
Epoch 100, Loss: 0.0033, Accuracy: 0.5342
[INFO] Best accuracy: 0.6575 at epoch 24
[bert-mlp] acc=0.6575 precision=



[roberta-logistic_SMOTE] acc=0.4932 precision=0.5210 recall=0.4932 f1=0.4839 time=0.49s
Training nb_SMOTE classifier...
[roberta-nb_SMOTE] acc=0.4247 precision=0.5065 recall=0.4247 f1=0.4379 time=0.01s
Training knn_SMOTE classifier...
[roberta-knn_SMOTE] acc=0.2055 precision=0.5313 recall=0.2055 f1=0.1861 time=0.01s
Training softmax classifier...
(856, 768)
Epoch 10, Loss: 1.7687, Accuracy: 0.3425
Epoch 20, Loss: 1.5194, Accuracy: 0.3288
Epoch 30, Loss: 1.3178, Accuracy: 0.3425
Epoch 40, Loss: 1.1625, Accuracy: 0.3699
Epoch 50, Loss: 1.0377, Accuracy: 0.3973
Epoch 60, Loss: 0.9411, Accuracy: 0.4795
Epoch 70, Loss: 0.8523, Accuracy: 0.4795
Epoch 80, Loss: 0.7956, Accuracy: 0.4932
Epoch 90, Loss: 0.7401, Accuracy: 0.4932
Epoch 100, Loss: 0.6893, Accuracy: 0.5068
[INFO] Best accuracy: 0.5068 at epoch 92
[roberta-softmax] acc=0.5068 precision=0.5793 recall=0.5068 f1=0.5043
Training mlp classifier...
(856, 768)
Epoch 10, Loss: 0.5564, Accuracy: 0.4795
Epoch 20, Loss: 0.2051, Accuracy: 0.493



[roberta-logistic] acc=0.4384 precision=0.3880 recall=0.4384 f1=0.3854 time=0.34s
Training nb classifier...
[roberta-nb] acc=0.3151 precision=0.3717 recall=0.3151 f1=0.3134 time=0.01s
Training knn classifier...
[roberta-knn] acc=0.3425 precision=0.3583 recall=0.3425 f1=0.3433 time=0.01s
Training softmax classifier...
(292, 768)
Epoch 10, Loss: 1.8034, Accuracy: 0.4110
Epoch 20, Loss: 1.6452, Accuracy: 0.4110
Epoch 30, Loss: 1.6334, Accuracy: 0.4110
Epoch 40, Loss: 1.5434, Accuracy: 0.4110
Epoch 50, Loss: 1.4357, Accuracy: 0.4247
Epoch 60, Loss: 1.4267, Accuracy: 0.4247
Epoch 70, Loss: 1.4070, Accuracy: 0.4110
Epoch 80, Loss: 1.3704, Accuracy: 0.4247
Epoch 90, Loss: 1.3301, Accuracy: 0.4110
Epoch 100, Loss: 1.3032, Accuracy: 0.4247
[INFO] Best accuracy: 0.4384 at epoch 42
[roberta-softmax] acc=0.4384 precision=0.2783 recall=0.4247 f1=0.3118
Training mlp classifier...
(292, 768)
Epoch 10, Loss: 1.6549, Accuracy: 0.4384
Epoch 20, Loss: 1.3036, Accuracy: 0.4932
Epoch 30, Loss: 0.8632, Accu

In [34]:
results

{'bert': {'dt': {'accuracy': 0.4246575342465753,
   'precision': 0.4200913242009132,
   'recall': 0.4246575342465753,
   'f1': 0.4143746664294609,
   'cm': array([[ 2,  0,  0,  0,  0,  2,  2,  0],
          [ 0,  2,  0,  1,  1,  2,  1,  1],
          [ 0,  0,  0,  0,  2,  2,  0,  0],
          [ 1,  0,  0,  0,  0,  2,  0,  0],
          [ 0,  1,  0,  0,  0,  1,  0,  2],
          [ 2,  1,  0,  2,  1, 15,  5,  4],
          [ 0,  0,  0,  0,  0,  1, 12,  1],
          [ 1,  2,  0,  0,  0,  0,  1,  0]])},
  'logistic': {'accuracy': 0.5068493150684932,
   'precision': 0.5177978414279784,
   'recall': 0.5068493150684932,
   'f1': 0.4997215383428375,
   'cm': array([[ 3,  0,  0,  0,  0,  0,  2,  1],
          [ 0,  3,  0,  0,  0,  3,  2,  0],
          [ 0,  0,  2,  0,  0,  2,  0,  0],
          [ 0,  0,  0,  0,  0,  3,  0,  0],
          [ 0,  0,  0,  0,  1,  1,  1,  1],
          [ 0,  2,  0,  0,  1, 18,  9,  0],
          [ 1,  1,  0,  0,  0,  4,  8,  0],
          [ 0,  0,  0,  0,  1,  1

In [45]:
# Save results
with open(RESULTS_PATH, 'wb') as f:
    pickle.dump(results, f)
print("\nAll done.")


All done.
