In [1]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ─── Cell 1: pick‐and‐load your dataset ────────────────────────────────────────

# 1) change this to 'ag', 'amazon' or 'imdb'
# DATASET = 'ag'  
DATASET = 'amazon'
# DATASET = 'imdb'

TEST_SIZE = 0.2  # fraction to hold out as “test” when you don't have a built‐in test split

configs = {
    'ag': {
        'path': 'dataset/ag-news-classification-dataset',
        'train_file': 'train.csv',
        'test_file':  'test.csv',
        'text_cols':  ['Title','Description'],
        'label_col':  'Class Index',
        'label_shift': -1,
        'has_test_file': True
    },
    'amazon': {
        'path': 'dataset/amazon-fine-food-reviews',
        'train_file': 'Reviews.csv',
        'test_file':  None,
        'text_cols':  ['Summary','Text'],
        'label_col':  'Score',
        'label_transform': lambda x: int(x)-1,
        'has_test_file': False
    },
    'imdb': {
        'path': 'dataset/imdb-dataset-of-50k-movie-reviews',
        'train_file': 'IMDB Dataset.csv',
        'test_file':  None,
        'text_cols':  ['review'],
        'label_col':  'sentiment',
        'label_transform': lambda x: 1 if x=='positive' else 0,
        'has_test_file': False
    }
}

cfg = configs[DATASET]

# load train
train_df = pd.read_csv(f"{cfg['path']}/{cfg['train_file']}")
# load test if provided
if cfg['has_test_file']:
    test_df  = pd.read_csv(f"{cfg['path']}/{cfg['test_file']}")

# build train_texts
if len(cfg['text_cols'])>1:
    train_texts = (train_df[cfg['text_cols'][0]] + " " + train_df[cfg['text_cols'][1]]).tolist()
else:
    train_texts = train_df[cfg['text_cols'][0]].tolist()

# build train_labels
if 'label_shift' in cfg:
    train_labels = (train_df[cfg['label_col']] + cfg['label_shift']).tolist()
else:
    train_labels = train_df[cfg['label_col']].map(cfg['label_transform']).tolist()

# handle test_texts / test_labels
if cfg['has_test_file']:
    if len(cfg['text_cols'])>1:
        test_texts = (test_df[cfg['text_cols'][0]] + " " + test_df[cfg['text_cols'][1]]).tolist()
    else:
        test_texts = test_df[cfg['text_cols'][0]].tolist()
    if 'label_shift' in cfg:
        test_labels = (test_df[cfg['label_col']] + cfg['label_shift']).tolist()
    else:
        test_labels = test_df[cfg['label_col']].map(cfg['label_transform']).tolist()
else:
    split_idx   = int(len(train_texts)*(1-TEST_SIZE))
    test_texts  = train_texts[split_idx:]
    test_labels = train_labels[split_idx:]
    train_texts = train_texts[:split_idx]
    train_labels= train_labels[:split_idx]

print(f"{DATASET}:  #train={len(train_texts)}  #test={len(test_texts)}")


amazon:  #train=454763  #test=113691


In [3]:
# Step 1: Preprocess the text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

# Function to extract embeddings
def get_embeddings(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[0, 0, :]
    return cls_embedding

In [None]:
# ─── Cell 2: Generate & save TRAINING embeddings ─────────────────────────────
fractions = [0.2, 0.4, 0.6, 0.8]

# assumes: DATASET, fractions, train_texts, model, tokenizer are already defined

for frac in fractions:
    n = int(len(train_texts) * frac)
    embs = []
    print(f"→ {DATASET}: generating {int(frac*100)}% training embeddings ({n} samples)")
    for txt in tqdm(train_texts[:n], leave=False):
        e = get_embeddings(txt, model, tokenizer)
        embs.append(e.detach().numpy())
    embs = np.array(embs)
    np.save(f"{DATASET}_bert_embeddings_{int(frac*100)}.npy", embs)


In [None]:
# ─── Cell 3: Generate & save TEST embeddings ─────────────────────────────────

# assumes: DATASET, test_texts, model, tokenizer are already defined

print(f"→ {DATASET}: generating test embeddings ({len(test_texts)} samples)")
test_embs = []
for txt in tqdm(test_texts, leave=False):
    try:
        e = get_embeddings(txt, model, tokenizer)
        test_embs.append(e.detach().numpy())
    except Exception as j:
        print(f"Error processing text")

test_embs = np.array(test_embs)
np.save(f"{DATASET}_bert_embeddings_test.npy", test_embs)


→ amazon: generating test embeddings (113691 samples)


 26%|██▌       | 29608/113691 [19:05<42:43, 32.80it/s]  

Error processing text


 28%|██▊       | 31881/113691 [20:33<40:35, 33.59it/s]  

Error processing text


 43%|████▎     | 48500/113691 [31:18<31:17, 34.73it/s]  

Error processing text


 47%|████▋     | 53517/113691 [46:42<1048:04:54, 62.70s/it]

In [None]:
# ─── Cell 3: train LogisticRegression and eval on the fixed test set ─────────

test_embs  = np.load(f"{DATASET}_bert_embeddings_test.npy")
test_lbls  = np.array(test_labels)

for frac in fractions:
    train_embs = np.load(f"{DATASET}_bert_embeddings_{int(frac*100)}.npy")
    train_lbls = np.array(train_labels[:len(train_embs)])
    clf = LogisticRegression(max_iter=1_000)
    clf.fit(train_embs, train_lbls)
    y_pred = clf.predict(test_embs)
    print(f"\n=== {DATASET} | {int(frac*100)}% train ===")
    print(classification_report(test_lbls, y_pred))
