In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Choose the compute backend in priority order: CUDA, then Apple MPS, then CPU.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

# Bare expression so the notebook displays the selected device.
device

device(type='mps')

In [3]:
# ─── Cell 1: pick‐and‐load your dataset ────────────────────────────────────────

# 1) change this to 'ag', 'amazon' or 'imdb'
DATASET = 'ag'  
# DATASET = 'amazon'
# DATASET = 'imdb'

TEST_SIZE = 0.2  # fraction to hold out as “test” when you don't have a built‐in test split

# Positions *within the held-out amazon test split* that are removed because of
# errors in the test set. Presumably these are the rows that fail in the
# test-embedding cell — TODO confirm against the raw data.
AMAZON_BAD_TEST_IDX = (29617, 31891, 48510, 75966)

# Per-dataset loading recipe:
#   path / train_file / test_file : where the CSVs live
#   text_cols                     : column(s) concatenated into the model input
#   label_col                     : column holding the class
#   label_shift | label_transform : how raw labels are turned into integers
#   has_test_file                 : False → carve the test split off the train file
configs = {
    'ag': {
        'path': 'dataset/ag-news-classification-dataset',
        'train_file': 'train.csv',
        'test_file':  'test.csv',
        'text_cols':  ['Title','Description'],
        'label_col':  'Class Index',
        'label_shift': -1,
        'has_test_file': True
    },
    'amazon': {
        'path': 'dataset/amazon-fine-food-reviews',
        'train_file': 'Reviews.csv',
        'test_file':  None,
        'text_cols':  ['Summary','Text'],
        'label_col':  'Score',
        'label_transform': lambda x: int(x)-1,
        'has_test_file': False
    },
    'imdb': {
        'path': 'dataset/imdb-dataset-of-50k-movie-reviews',
        'train_file': 'IMDB Dataset.csv',
        'test_file':  None,
        'text_cols':  ['review'],
        'label_col':  'sentiment',
        'label_transform': lambda x: 1 if x=='positive' else 0,
        'has_test_file': False
    }
}


def _build_texts(df, cfg):
    """Return one input string per row of `df`.

    With more than one entry in cfg['text_cols'] the first two columns are
    joined with a single space; otherwise the single column is used as-is.
    """
    cols = cfg['text_cols']
    if len(cols) > 1:
        return (df[cols[0]] + " " + df[cols[1]]).tolist()
    return df[cols[0]].tolist()


def _build_labels(df, cfg):
    """Return one label per row of `df`.

    Uses the additive cfg['label_shift'] when present, otherwise maps each raw
    value through cfg['label_transform'].
    """
    raw = df[cfg['label_col']]
    if 'label_shift' in cfg:
        return (raw + cfg['label_shift']).tolist()
    return raw.map(cfg['label_transform']).tolist()


cfg = configs[DATASET]

# load train
train_df = pd.read_csv(f"{cfg['path']}/{cfg['train_file']}")
train_texts = _build_texts(train_df, cfg)
train_labels = _build_labels(train_df, cfg)

# handle test_texts / test_labels
if cfg['has_test_file']:
    test_df = pd.read_csv(f"{cfg['path']}/{cfg['test_file']}")
    test_texts = _build_texts(test_df, cfg)
    test_labels = _build_labels(test_df, cfg)
else:
    # No dedicated test file: hold out the trailing TEST_SIZE share (no shuffling).
    split_idx   = int(len(train_texts)*(1-TEST_SIZE))
    test_texts  = train_texts[split_idx:]
    test_labels = train_labels[split_idx:]
    if DATASET == 'amazon':
        # Drop the known-bad rows from BOTH lists so texts and labels stay
        # aligned row-for-row (previously only the labels were removed).
        bad = set(AMAZON_BAD_TEST_IDX)
        test_texts  = [t for i, t in enumerate(test_texts) if i not in bad]
        test_labels = [y for i, y in enumerate(test_labels) if i not in bad]
    train_texts = train_texts[:split_idx]
    train_labels= train_labels[:split_idx]

# Downstream cells pair texts/embeddings with labels by position.
assert len(train_texts) == len(train_labels), "train texts/labels are misaligned"
assert len(test_texts) == len(test_labels), "test texts/labels are misaligned"

print(f"{DATASET}:  #train={len(train_texts)}  #test={len(test_texts)}")


ag:  #train=120000  #test=7600


In [4]:
def get_embeddings(text, model, tokenizer, device):
    """Return the first-token ([CLS]) vector of the first sequence as a NumPy array.

    Args:
        text: input passed straight to `tokenizer`.
        model: callable accepting the tokenizer's tensors as keyword arguments
            and returning an object with a `last_hidden_state` attribute.
        tokenizer: callable producing a mapping of name -> tensor.
        device: torch device the input tensors are moved to before inference.

    Returns:
        `last_hidden_state[0, 0, :]` copied to CPU and converted with `.numpy()`.
    """
    # Tokenization yields CPU tensors; inputs are truncated at 512 tokens.
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Move every input tensor to the target device.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    # Pure inference: no gradient tracking needed.
    with torch.no_grad():
        result = model(**on_device)

    # Sequence 0, token 0, all hidden dimensions -> back to CPU for NumPy.
    first_token = result.last_hidden_state[0, 0, :]
    return first_token.cpu().numpy()

# usage
# Use the same CUDA → MPS → CPU priority as the device-selection cell; the
# previous MPS-or-CPU check silently discarded an available CUDA device.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
# Load the pretrained tokenizer and encoder, then move the encoder to `device`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased").to(device)


In [5]:
# Shares of the training set used for each experiment (20%, 40%, 60%, 80%).
fractions = [pct / 100 for pct in (20, 40, 60, 80)]

In [None]:
# ─── Cell 2: Generate & save TRAINING embeddings ─────────────────────────────

# assumes: DATASET, fractions, train_texts, model, tokenizer are already defined

# Every subset is a prefix of train_texts (train_texts[:n]), so the longest
# prefix is embedded once and each fraction is saved as a slice of it, instead
# of re-embedding the overlapping samples for every fraction.
sample_counts = [(frac, int(len(train_texts) * frac)) for frac in fractions]
n_max = max((n for _, n in sample_counts), default=0)

print(f"→ {DATASET}: generating training embeddings once for the largest subset ({n_max} samples)")
embs = np.array([
    get_embeddings(txt, model, tokenizer, device)
    for txt in tqdm(train_texts[:n_max], leave=False)
])

# One file per fraction, named by its integer percentage.
for frac, n in sample_counts:
    print(f"→ {DATASET}: saving {int(frac*100)}% training embeddings ({n} samples)")
    np.save(f"{DATASET}_bert_embeddings_{int(frac*100)}.npy", embs[:n])


In [None]:
# ─── Cell 3: Generate & save TEST embeddings ─────────────────────────────────

# assumes: DATASET, test_texts, model, tokenizer are already defined

print(f"→ {DATASET}: generating test embeddings ({len(test_texts)} samples)")
test_embs = []
failed_idx = []  # positions in test_texts that could not be embedded
for idx, txt in enumerate(tqdm(test_texts, leave=False)):
    try:
        test_embs.append(get_embeddings(txt, model, tokenizer, device))
    except Exception as exc:
        # Best-effort: skip the sample, but record which one failed and why,
        # because every skipped row shifts the embeddings relative to the labels.
        failed_idx.append(idx)
        print(f"Error processing text at test index {idx}: {exc!r}")

if failed_idx:
    print(
        f"Skipped {len(failed_idx)} test sample(s) at indices {failed_idx}; "
        "the labels at these positions must be dropped as well to keep "
        "embeddings and labels aligned."
    )

test_embs = np.array(test_embs)
np.save(f"{DATASET}_bert_embeddings_test.npy", test_embs)


In [None]:
# ─── Cell 4: train LogisticRegression and eval on the fixed test set ─────────
# The test split is loaded a single time and reused for every training fraction.
test_embs  = np.load(f"{DATASET}_bert_embeddings_test.npy")
test_lbls  = np.array(test_labels)

for frac in fractions:
    pct = int(frac*100)
    train_embs = np.load(f"{DATASET}_bert_embeddings_{pct}.npy")

    # Pair embeddings with labels row-for-row, trimming whichever is longer.
    n_rows = min(len(train_embs), len(train_labels))
    train_embs = train_embs[:n_rows]
    train_lbls = np.array(train_labels[:n_rows])

    # Fit a fresh classifier on this fraction and score it on the shared test set.
    clf = LogisticRegression(max_iter=1_000)
    clf.fit(train_embs, train_lbls)
    y_pred = clf.predict(test_embs)

    print(f"\n=== {DATASET} | {pct}% train ===")
    print(classification_report(test_lbls, y_pred))


In [6]:
# ─── Cell 5: summarize BERT fractions with ULMFiT metrics ───────────────────


def _embedding_path(filename):
    """Locate a saved embedding file.

    Prefers ./bert/{DATASET}/embed/<filename>; if that does not exist but the
    file is present in the working directory (where the embedding cells save
    it), that copy is used. When neither exists the preferred path is returned
    so the subsequent load fails with the expected location in its message.
    """
    organised = f"./bert/{DATASET}/embed/{filename}"
    if os.path.exists(organised):
        return organised
    if os.path.exists(filename):
        return filename
    return organised


# load test set once
test_embs = np.load(_embedding_path(f"{DATASET}_bert_embeddings_test.npy"))
y_true   = np.array(test_labels)

rows = []
baseline_frac = fractions[0]   # the first fraction is the reference point
baseline_error = None

for frac in fractions:
    train_embs = np.load(_embedding_path(f"{DATASET}_bert_embeddings_{int(frac*100)}.npy"))
    y_train    = np.array(train_labels[: len(train_embs)])
    # If there are more embeddings than labels, trim the embeddings to match.
    if train_embs.shape[0] != len(y_train):
        train_embs = train_embs[:len(y_train)]

    clf = LogisticRegression(max_iter=1_000)
    clf.fit(train_embs, y_train)
    y_pred = clf.predict(test_embs)

    acc = accuracy_score(y_true, y_pred)
    err = 1.0 - acc
    if frac == baseline_frac:
        baseline_error = err

    # Relative error reduction vs. the baseline fraction, in percent.
    if baseline_error is None:
        rel = 0.0
    elif baseline_error == 0:
        # A perfect baseline makes the ratio undefined; avoid dividing by zero.
        rel = float("nan")
    else:
        rel = (baseline_error - err) / baseline_error * 100

    rows.append({
        "fraction_%":        int(frac*100),
        "accuracy":          acc,
        "error_rate":        err,
        "rel_err_reduction": rel
    })

df = pd.DataFrame(rows).set_index("fraction_%")
print(df)

# ensure results dir exists
results_dir = f"./bert/{DATASET}/results"
os.makedirs(results_dir, exist_ok=True)

# save CSV into that folder
output_path = os.path.join(results_dir, "bert_ulmfit_metrics.csv")
df.to_csv(output_path)
print(f"→ Saved ULMFiT-style metrics to {output_path}")


            accuracy  error_rate  rel_err_reduction
fraction_%                                         
20          0.878553    0.121447           0.000000
40          0.890263    0.109737           9.642470
60          0.897632    0.102368          15.709642
80          0.901316    0.098684          18.743229
→ Saved ULMFiT-style metrics to ./bert/ag/results/bert_ulmfit_metrics.csv
