In [1]:
# Cell 1: Setup
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

# Dataset selector: set to one of 'amazon', 'ag', or 'imdb'.
DATASET = 'imdb'

# Fraction of the data reserved for the test set.
TEST_SIZE = 0.2


def _imdb_sentiment_to_int(sentiment):
    """Return 1 when `sentiment` equals 'positive', and 0 for any other value."""
    if sentiment == 'positive':
        return 1
    return 0


# Per-dataset loading recipe. Keys used by the loading cell:
#   path / train_file / test_file : where the CSV files live
#   text_cols                     : column(s) holding the text
#   label_col                     : column holding the label
#   label_shift (optional)        : constant added to every label
#   label_transform (optional)    : callable mapped over every label
#   has_test_file                 : whether a separate test CSV is provided
configs = dict(
    ag=dict(
        path='dataset/ag-news-classification-dataset',
        train_file='train.csv',
        test_file='test.csv',
        text_cols=['Title', 'Description'],
        label_col='Class Index',
        label_shift=-1,
        has_test_file=True,
    ),
    amazon=dict(
        path='dataset/amazon-fine-food-reviews',
        train_file='Reviews.csv',
        test_file=None,
        text_cols=['Text'],
        label_col='Score',
        has_test_file=False,
    ),
    imdb=dict(
        path='dataset/imdb-dataset-of-50k-movie-reviews',
        train_file='IMDB Dataset.csv',
        test_file=None,
        text_cols=['review'],
        label_col='sentiment',
        label_transform=_imdb_sentiment_to_int,
        has_test_file=False,
    ),
)

# Active configuration for the selected dataset.
cfg = configs[DATASET]


In [2]:
# Cell 2: Load and split dataset
# ────────────────────────────────

def _build_texts(frame, text_cols):
    """Combine the text columns of `frame` into a single Series of strings.

    Each column in `text_cols` is cast to str and the columns are joined
    left to right with one space between them. Works for any number of
    columns (one column is returned as-is after the str cast).
    """
    combined = frame[text_cols[0]].astype(str)
    for col in text_cols[1:]:
        combined = combined + " " + frame[col].astype(str)
    return combined


def _build_labels(frame, dataset_cfg):
    """Return the label column of `frame` as a plain list.

    If the config has 'label_shift', that constant is added to every label;
    otherwise, if it has 'label_transform', that callable is mapped over the
    labels; otherwise the raw labels are returned. 'label_shift' takes
    precedence when both keys are present.
    """
    raw = frame[dataset_cfg['label_col']]
    if 'label_shift' in dataset_cfg:
        return (raw + dataset_cfg['label_shift']).tolist()
    if 'label_transform' in dataset_cfg:
        return raw.map(dataset_cfg['label_transform']).tolist()
    return raw.tolist()


# load train
train_df = pd.read_csv(f"{cfg['path']}/{cfg['train_file']}")
texts = _build_texts(train_df, cfg['text_cols'])
labels = _build_labels(train_df, cfg)

# split into train/test
if cfg['has_test_file']:
    # built-in test split: the dataset ships its own test CSV
    test_df = pd.read_csv(f"{cfg['path']}/{cfg['test_file']}")
    train_texts = texts.tolist()
    train_labels = labels
    # Converted to a list so test_texts has the same type in both branches.
    test_texts = _build_texts(test_df, cfg['text_cols']).tolist()
    test_labels = _build_labels(test_df, cfg)
else:
    # sequential split: first (1-TEST_SIZE) for train, last TEST_SIZE for test.
    # No shuffling is done here, so the split follows the row order of the CSV.
    all_texts = texts.tolist()  # materialise once instead of once per slice
    split_idx = int(len(all_texts) * (1 - TEST_SIZE))
    train_texts, test_texts = all_texts[:split_idx], all_texts[split_idx:]
    train_labels, test_labels = labels[:split_idx], labels[split_idx:]

# Texts and labels must stay aligned row-for-row.
assert len(train_texts) == len(train_labels), "train texts/labels length mismatch"
assert len(test_texts) == len(test_labels), "test texts/labels length mismatch"

print(f"{DATASET}: #train={len(train_texts)}  #test={len(test_texts)}")


imdb: #train=40000  #test=10000


In [3]:
# Cell 3: Vectorize with TF‐IDF
# ─────────────────────────────

# The vectorizer is fitted on the training texts only; the test texts are
# then encoded with that same fitted vectorizer so both matrices share one
# feature space. English stop words are removed and max_features is 5000.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Report the (rows, features) shape of both matrices.
print(
    f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}"
)


X_train shape: (40000, 5000), X_test shape: (10000, 5000)


In [4]:
# Cell 4: Summarize BoW results with ULMFiT‐style metrics
# ───────────────────────────────────────────────────────

def _relative_error_reduction(baseline_error, error):
    """Percentage by which `error` is lower than `baseline_error`.

    Returns 0.0 when `baseline_error` is None or 0, which avoids a division
    by zero and makes the baseline row itself report 0.
    """
    if not baseline_error:
        return 0.0
    return (baseline_error - error) / baseline_error * 100


# Fractions of the training set to train on; the first one is the baseline
# against which the relative error reduction of the others is measured.
fractions = [0.2, 0.4, 0.6, 0.8]
rows = []
baseline_frac = fractions[0]
baseline_error = None

# Convert the labels once; each iteration only slices a prefix of this array.
y_train_all = np.asarray(train_labels)

for frac in fractions:
    # Use the first `n` training rows (a prefix, not a random sample).
    n = int(len(train_labels) * frac)
    X_frac = X_train[:n]
    y_frac = y_train_all[:n]

    # NOTE(review): recent scikit-learn releases appear to deprecate the
    # `multi_class` argument — confirm against the installed version before
    # upgrading. It is kept here so the reported numbers do not change.
    clf = LogisticRegression(
        max_iter=1_000,
        random_state=42,
        multi_class='multinomial',
        solver='lbfgs'
    )
    clf.fit(X_frac, y_frac)

    y_pred = clf.predict(X_test)
    acc = accuracy_score(test_labels, y_pred)
    err = 1.0 - acc
    if frac == baseline_frac:
        baseline_error = err
    rel = _relative_error_reduction(baseline_error, err)

    rows.append({
        # round(), not int(): float products such as 0.29 * 100 evaluate to
        # 28.999999999999996, which int() would truncate to 28.
        "fraction_%":        round(frac * 100),
        "accuracy":          acc,
        "error_rate":        err,
        "rel_err_reduction": rel
    })

df = pd.DataFrame(rows).set_index("fraction_%")
print(df)

# save to CSV
results_dir = f"./bow/{DATASET}/results"
os.makedirs(results_dir, exist_ok=True)
output_path = os.path.join(results_dir, "bow_ulmfit_metrics.csv")
df.to_csv(output_path)
print(f"→ Saved ULMFiT‐style metrics to {output_path}")


            accuracy  error_rate  rel_err_reduction
fraction_%                                         
20            0.8663      0.1337           0.000000
40            0.8777      0.1223           8.526552
60            0.8803      0.1197          10.471204
80            0.8836      0.1164          12.939417
→ Saved ULMFiT‐style metrics to ./bow/imdb/results/bow_ulmfit_metrics.csv
