In [72]:
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

# Silence library warnings so the training logs below stay readable.
warnings.filterwarnings('ignore')

# Which corpus to run on: one of 'amazon', 'ag' or 'imdb' (the keys of `configs`).
DATASET = 'amazon'

# Fraction of rows held out for testing when a dataset has no test file of its own.
TEST_SIZE = 0.2

# Per-dataset loading recipe.
#   path / train_file / test_file : where the CSV files live
#   text_cols                     : column(s) holding the text
#   label_col                     : column holding the target
#   label_shift                   : optional integer added to every label
#   label_transform               : optional callable mapping a raw label to an int
#   has_test_file                 : whether `test_file` should be loaded
configs = dict(
    ag=dict(
        path='dataset/ag-news-classification-dataset',
        train_file='train.csv',
        test_file='test.csv',
        text_cols=['Title', 'Description'],
        label_col='Class Index',
        label_shift=-1,
        has_test_file=True,
    ),
    amazon=dict(
        path='dataset/amazon-fine-food-reviews',
        train_file='Reviews.csv',
        test_file=None,
        text_cols=['Text'],
        label_col='Score',
        has_test_file=False,
    ),
    imdb=dict(
        path='dataset/imdb-dataset-of-50k-movie-reviews',
        train_file='IMDB Dataset.csv',
        test_file=None,
        text_cols=['review'],
        label_col='sentiment',
        # 'positive' -> 1, anything else -> 0
        label_transform=lambda sentiment: int(sentiment == 'positive'),
        has_test_file=False,
    ),
)

# Recipe for the dataset selected above.
cfg = configs[DATASET]

In [73]:
# Cell 2: Load and split dataset
# ────────────────────────────────

def _build_texts(frame, text_cols):
    """Return one string per row: the listed columns cast to str and joined by a space.

    Works for any number of columns (at least one); the result is a pandas Series.
    """
    combined = frame[text_cols[0]].astype(str)
    for col in text_cols[1:]:
        combined = combined + " " + frame[col].astype(str)
    return combined


def _build_labels(frame, dataset_cfg):
    """Return the label column as a list, applying the config's optional adjustment.

    `label_shift` (added to every label) takes precedence over `label_transform`
    (mapped over every label); with neither key the raw column is returned.
    """
    raw = frame[dataset_cfg['label_col']]
    if 'label_shift' in dataset_cfg:
        raw = raw + dataset_cfg['label_shift']
    elif 'label_transform' in dataset_cfg:
        raw = raw.map(dataset_cfg['label_transform'])
    return raw.tolist()


# load train
train_df = pd.read_csv(f"{cfg['path']}/{cfg['train_file']}")
texts = _build_texts(train_df, cfg['text_cols'])
labels = _build_labels(train_df, cfg)

# split into train/test
if cfg['has_test_file']:
    # dataset ships its own test split: use the whole train file for training
    test_df = pd.read_csv(f"{cfg['path']}/{cfg['test_file']}")
    train_texts = texts.tolist()
    train_labels = labels
    # converted to a list so both branches yield the same types
    test_texts = _build_texts(test_df, cfg['text_cols']).tolist()
    test_labels = _build_labels(test_df, cfg)
else:
    # sequential split: first (1 - TEST_SIZE) of the rows for train, the rest for test
    split_idx = int(len(texts) * (1 - TEST_SIZE))
    all_texts = texts.tolist()  # materialise once, slice twice
    train_texts, test_texts = all_texts[:split_idx], all_texts[split_idx:]
    train_labels, test_labels = labels[:split_idx], labels[split_idx:]

# texts and labels must stay aligned row-for-row
assert len(train_texts) == len(train_labels)
assert len(test_texts) == len(test_labels)

print(f"{DATASET}: #train={len(train_texts)}  #test={len(test_texts)}")


amazon: #train=454763  #test=113691


In [74]:
# Cell 3: Load GloVe Embeddings
# ────────────────────────────────

def load_glove_embeddings(filepath):
    """Parse a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first token is the word
    and the remaining tokens are converted to a float32 NumPy array.

    Args:
        filepath: Path to a UTF-8 text file with one ``word v1 v2 ...`` entry per line.

    Returns:
        dict mapping each word to its float32 vector. If a word appears more
        than once, the last occurrence wins.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(filepath, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            # Blank / whitespace-only lines (e.g. a trailing newline) carry no
            # entry; skip them instead of failing on values[0].
            if not values:
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

# Pre-trained GloVe vectors to load (the file name indicates the 6B corpus, 100-d vectors).
glove_path = 'glove.6B/glove.6B.100d.txt'

# word -> float32 vector lookup built from that file
embeddings_index = load_glove_embeddings(filepath=glove_path)


In [75]:
# Cell 4: Prepare Data with Tokenizer
# ────────────────────────────────

# Fit the vocabulary on the training texts only, so the test set stays unseen.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index

# Turn every text into a sequence of integer word ids
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad / truncate every sequence to the same fixed length
maxlen = 100
X_train = pad_sequences(train_sequences, maxlen=maxlen)
X_test = pad_sequences(test_sequences, maxlen=maxlen)

y_train = np.array(train_labels)
y_test = np.array(test_labels)

# sparse_categorical_crossentropy needs class ids starting at 0. Subtracting a
# hard-coded 1 is only right for 1-based labels (amazon 'Score'); 'ag' is already
# shifted by its config and 'imdb' is 0/1, so they would end up with a -1 class.
# Derive the offset from the training labels so every dataset lands on 0..K-1.
label_offset = int(y_train.min())
y_train = y_train - label_offset
y_test = y_test - label_offset

# Row i of the matrix holds the GloVe vector of the word with tokenizer id i;
# row 0 and words missing from GloVe stay all-zero.
embedding_dim = 100
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [76]:
# Cell 5: Summarize GloVe results with ULMFiT‐style metrics
# ───────────────────────────────────────────────────────

# Training hyper-parameters, named here so they are easy to find and tune.
SEED = 42
EPOCHS = 5
BATCH_SIZE = 8

fractions = [0.2, 0.4, 0.6, 0.8]
rows = []
baseline_error = None  # error rate of the first (smallest) fraction

# Size the output layer from the labels instead of hard-coding 5, so the cell
# also works for datasets with a different number of classes.
num_classes = int(max(y_train.max(), y_test.max())) + 1

for frac in fractions:
    pct = round(frac * 100)
    print(f"→ Training on {pct}% of the data...")
    n = int(len(train_labels) * frac)
    X_frac = X_train[:n]
    y_frac = y_train[:n]

    # Re-seed before every run so each fraction starts from the same initial
    # weights and differences between rows reflect the amount of data.
    set_random_seed(SEED)

    # Build a fresh model for each run: frozen GloVe embeddings -> LSTM -> softmax
    model = Sequential([
        Embedding(input_dim=len(word_index) + 1,
                  output_dim=embedding_dim,
                  weights=[embedding_matrix],
                  input_length=maxlen,
                  trainable=False),
        LSTM(64),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    model.fit(X_frac, y_frac, epochs=EPOCHS, batch_size=BATCH_SIZE)

    # Predicted class = index of the highest softmax probability
    y_pred = model.predict(X_test)
    y_pred_class = np.argmax(y_pred, axis=1)

    acc = accuracy_score(y_test, y_pred_class)
    err = 1.0 - acc

    # The first fraction is the baseline every later run is compared against.
    if baseline_error is None:
        baseline_error = err
    # Relative error reduction vs. the baseline, in percent (0 if the baseline
    # error is exactly 0, which would otherwise divide by zero).
    rel = (baseline_error - err) / baseline_error * 100 if baseline_error else 0.0

    rows.append({
        "fraction_%":        pct,
        "accuracy":          acc,
        "error_rate":        err,
        "rel_err_reduction": rel
    })

df = pd.DataFrame(rows).set_index("fraction_%")
print(df)

# save to CSV
results_dir = f"./glove/{DATASET}/results"
os.makedirs(results_dir, exist_ok=True)
output_path = os.path.join(results_dir, "glove_ulmfit_metrics.csv")
df.to_csv(output_path)
print(f"→ Saved ULMFiT‐style metrics to {output_path}")


→ Training on 20% of the data...
Epoch 1/5
[1m11369/11369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 11ms/step - accuracy: 0.6503 - loss: 0.9893
Epoch 2/5
[1m11369/11369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 11ms/step - accuracy: 0.6967 - loss: 0.8042
Epoch 3/5
[1m11369/11369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 11ms/step - accuracy: 0.7176 - loss: 0.7410
Epoch 4/5
[1m11369/11369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 12ms/step - accuracy: 0.7333 - loss: 0.6977
Epoch 5/5
[1m11369/11369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 12ms/step - accuracy: 0.7472 - loss: 0.6626
[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 6ms/step
→ Training on 40% of the data...
Epoch 1/5
[1m22739/22739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m307s[0m 13ms/step - accuracy: 0.6680 - loss: 0.9205
Epoch 2/5
[1m22739/22739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m306s[0m 13ms/step - accura