### Loading phase
#### Load the encoder section and tokenizer

In [41]:
from transformers import T5EncoderModel, AutoTokenizer
import numpy as np
import torch

# model_name = "/home/tadesse/research/110125/run2_best"
# model_name = "google/flan-t5-base"
model_name = "/home/tadesse/research/selfsuper_finetuned_test_more_epochs_best"
rank = 0

enc = T5EncoderModel.from_pretrained(model_name)
tok = AutoTokenizer.from_pretrained(model_name)

device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
enc = enc.to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Load Dataset and create in-domain vs out-domain splits
<!-- * Dataset class extends torch.utils.dataset and automatically restructures input text to match the expected shape within the model -->


In [42]:
from datasets import load_dataset

ds = load_dataset("qiaojin/PubMedQA", "pqa_unlabeled")


cardio_mesh_terms = {
    "Cardiovascular Diseases", "Heart Diseases", "Myocardial Infarction",
    "Coronary Disease", "Atrial Fibrillation", "Heart Failure",
    "Hypertension", "Arrhythmias, Cardiac", "Stroke",
    "Cardiomyopathies", "Valvular Heart Diseases",
    "Statins", "Antihypertensive Agents", "Coronary Artery Bypass", "AlismataceaeApoptosisCell"
}

alzheimers_mesh_terms = {
    "Alzheimer Disease",
    "Alzheimer's Disease",
    "Alzheimer Disease/genetics",
    "Alzheimer Disease/diagnosis",
    "Alzheimer Disease/pathology",
    "Amyloid beta-Peptides",
    "Amyloid beta-Protein Precursor",
    "Amyloid beta-Protein Precursor/genetics",
    "Tau Proteins",
    "tauopathies",
    "Neurofibrillary Tangles",
    "Neurodegenerative Diseases",
    "Dementia",
    "Dementia, Vascular",
    "Mild Cognitive Impairment",
    "Cognition Disorders",
    "Neuroinflammation",
    "Apolipoproteins E",
    "Presenilin-1",
    "Presenilin-2",
    "MAPT",  # tau gene
    "APP",   # amyloid precursor protein
    "PSEN1", "PSEN2"  # presenilin genes
}

# If any mesh in mesh_list is in cardio_mesh_terms, return True
def is_cardiovascular_entry(mesh_list):
    return any(term in cardio_mesh_terms for term in mesh_list)
    # return not any(term in cardio_mesh_terms for term in mesh_list)

# If any mesh in mesh_list is in alzheimers_mesh_terms, return True
def is_alzheimers_entry (mesh_list):
    return any(term in alzheimers_mesh_terms for term in mesh_list)

# Sentences about cardiovascular diseases
ds_not_domain_filtered_neg = ["".join(ex['context']['contexts'][:]) for ex in ds['train'] if is_cardiovascular_entry(ex['context']['meshes'])]

# Sentences about Alzheimer's disease
ds_not_domain_filtered_pos = ["".join(ex['context']['contexts'][:]) for ex in ds['train'] if is_alzheimers_entry(ex['context']['meshes'])]

import random 

probing_data = []
# positive data
i = 0
for item in ds_not_domain_filtered_pos:
    if i < 781:
        probing_data.append({
            'text': item,
            'label': 1
        })
    i += 1

# negative data
i = 0
for item in ds_not_domain_filtered_neg:
    if i < 1000:
        probing_data.append({
            'text': item,
            'label': 0
        })
    i += 1


random.seed(43)
random.shuffle(probing_data)

from sklearn.model_selection import train_test_split

texts = [item['text'] for item in probing_data]
labels = [item['label'] for item in probing_data]

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=labels
)

# Reconstruct back into dicts
train_data = [{"text": x, "label": y} for x, y in zip(X_train, y_train)]
test_data  = [{"text": x, "label": y} for x, y in zip(X_test, y_test)]


### Get hidden states and train probe

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

def mean_pool(hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).float()
    return (hidden_state * mask).sum(1) / mask.sum(1).clamp_min(1e-9)

@torch.no_grad()
def get_hidden_rep_and_labels(examples, batch_size=8, layers=None):
    """
    examples: list of dicts like {'text': ..., 'label': ...}
    layers: list of integer layer indices into output.hidden_states (e.g., range(25))
    returns: (X_dict, y_array)
        X_dict[L] -> (N, D) for each layer L
        y_array   -> (N,)
    """
    assert len(examples) > 0
    if layers is None:
        # You can also set this explicitly, e.g., layers = list(range(25))
        layers = list(range(25))

    X = {L: [] for L in layers}
    Ys = []

    enc.eval()

    for i in range(0, len(examples), batch_size):
        batch_examples = examples[i:i+batch_size]
        texts = [ex['text'] for ex in batch_examples]
        y_batch = np.array([ex['label'] for ex in batch_examples])

        inputs = tok(texts, padding=True, truncation=True, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items() if k in {"input_ids", "attention_mask"}}

        output = enc(**inputs, output_hidden_states=True, return_dict=True)
        hiddens = output.hidden_states  # tuple/list: each is (B, T, D)

        for L in layers:
            pooled = mean_pool(hiddens[L], inputs["attention_mask"]).cpu().numpy()  # (B, D)
            X[L].append(pooled)

        Ys.append(y_batch)

    X_out = {L: np.vstack(X[L]) for L in layers}   # (N, D) per layer
    y_out = np.concatenate(Ys, axis=0)             # (N,)
    return X_out, y_out

def layer_probe_cv(X, y, Cs=(0.01, 0.1, 1.0, 10.0)):
    best = None
    for C in Cs:
        clf = LogisticRegression(penalty="l2", C=C, max_iter=1000)
        # clf = LogisticRegression(penalty="l2", C=C, max_iter=2000, n_jobs=-1)
        scores = cross_val_score(clf, X, y, cv=StratifiedKFold(5, shuffle=True, random_state=0))
        avg = scores.mean()
        if (best is None) or (avg > best[0]):
            best = (avg, C)
    return best  # (cv_score, C)


In [44]:
print('=========Embedding training data (for probe)============')
layers = list(range(25))  # e.g., embeddings + 24 encoder blocks
X_train, y_train = get_hidden_rep_and_labels(train_data, batch_size=8, layers=layers)

print('=========Embedding testing data (for probe)============')
X_test,  y_test  = get_hidden_rep_and_labels(test_data,  batch_size=8, layers=layers)



## Training and testing probe
### Visualizing embeddings using t-SNE

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Train on best layer and evaluate on test
# clf = LogisticRegression(penalty="l2", C=best_C, max_iter=2000, n_jobs=-1)
# clf = LogisticRegression(penalty="l2", max_iter = 2000)
clf = LogisticRegression(max_iter = 2000)
clf.fit(X_train[22], y_train)
y_pred = clf.predict(X_test[22])

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9131652661064426
F1 Score: 0.9015873015873016
Report:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92       200
           1       0.90      0.90      0.90       157

    accuracy                           0.91       357
   macro avg       0.91      0.91      0.91       357
weighted avg       0.91      0.91      0.91       357



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


: 

In [35]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Train on best layer and evaluate on test
# clf = LogisticRegression(penalty="l2", C=best_C, max_iter=2000, n_jobs=-1)
# clf = LogisticRegression(penalty="l2", max_iter = 2000)
clf = LogisticRegression(max_iter = 2000)
clf.fit(X_train[24], y_train)
y_pred = clf.predict(X_test[24])

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.896358543417367
F1 Score: 0.8762541806020067
Report:
               precision    recall  f1-score   support

           0       0.88      0.94      0.91       200
           1       0.92      0.83      0.88       157

    accuracy                           0.90       357
   macro avg       0.90      0.89      0.89       357
weighted avg       0.90      0.90      0.90       357



In [None]:
layer_results = {}
for L in layers:
    cv_score, C = layer_probe_cv(X_train[L], y_train)
    layer_results[L] = (cv_score, C)

best_L = max(layer_results, key=lambda L: layer_results[L][0])
best_score, best_C = layer_results[best_L]

# Train on best layer and evaluate on test
# clf = LogisticRegression(penalty="l2", C=best_C, max_iter=2000, n_jobs=-1)
# clf.fit(X_train[best_L], y_train)
# y_pred = clf.predict(X_test[best_L])


In [13]:
layer_results = {}
for L in layers:
    cv_score, C = layer_probe_cv(X_train[L], y_train)
    layer_results[L] = (cv_score, C)

best_L = max(layer_results, key=lambda L: layer_results[L][0])
best_score, best_C = layer_results[best_L]

# Train on best layer and evaluate on test
clf = LogisticRegression(penalty="l2", C=best_C, max_iter=2000, n_jobs=-1)
clf.fit(X_train[best_L], y_train)
y_pred = clf.predict(X_test[best_L])


4686.21s - Error patching args (debugger not attached to subprocess).
Traceback (most recent call last):
  File "/home/tadesse/miniconda3/envs/nlp_ml/lib/python3.9/site-packages/debugpy/_vendored/pydevd/_pydev_bundle/pydev_monkey.py", line 541, in patch_args
    new_args.append(_get_python_c_args(host, port, code, unquoted_args, SetupHolder.setup))
  File "/home/tadesse/miniconda3/envs/nlp_ml/lib/python3.9/site-packages/debugpy/_vendored/pydevd/_pydev_bundle/pydev_monkey.py", line 193, in _get_python_c_args
    if "__future__" in code:
TypeError: a bytes-like object is required, not 'str'


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

KeyboardInterrupt: 

In [8]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

best_y_pred = layer_probe_cv(X_train, y_train)

print("Accuracy:", accuracy_score(y_test, best_y_pred))
print("F1 Score:", f1_score(y_test, best_y_pred))
print("Report:\n", classification_report(y_test, best_y_pred))

ValueError: Found input variables with inconsistent numbers of samples: [25, 1424]

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt