In [32]:
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
import warnings
warnings.filterwarnings('ignore')

# Change this to 'amazon', 'ag', or 'imdb'
# DATASET = 'amazon'
# DATASET = 'ag'
DATASET = 'amazon'

TEST_SIZE = 0.2  # fraction for test set

configs = {
    'ag': {
        'path': 'dataset/ag-news-classification-dataset',
        'train_file': 'train.csv',
        'test_file':  'test.csv',
        'text_cols':  ['Title','Description'],
        'label_col':  'Class Index',
        'label_shift': -1,
        'has_test_file': True
    },
    'amazon': {
        'path': 'dataset/amazon-fine-food-reviews',
        'train_file': 'Reviews.csv',
        'test_file':  None,
        'text_cols':  ['Text'],
        'label_col':  'Score',
        'has_test_file': False
    },
    'imdb': {
        'path': 'dataset/imdb-dataset-of-50k-movie-reviews',
        'train_file': 'IMDB Dataset.csv',
        'test_file':  None,
        'text_cols':  ['review'],
        'label_col':  'sentiment',
        'label_transform': lambda x: 1 if x=='positive' else 0,
        'has_test_file': False
    }
}

cfg = configs[DATASET]

In [33]:
# Cell 2: Load and split dataset
# ────────────────────────────────

# load train
train_df = pd.read_csv(f"{cfg['path']}/{cfg['train_file']}")

# build train_texts
if len(cfg['text_cols']) > 1:
    texts = train_df[cfg['text_cols'][0]].astype(str) + " " + train_df[cfg['text_cols'][1]].astype(str)
else:
    texts = train_df[cfg['text_cols'][0]].astype(str)

# build train_labels
if 'label_shift' in cfg:
    labels = (train_df[cfg['label_col']] + cfg['label_shift']).tolist()
elif 'label_transform' in cfg:
    labels = train_df[cfg['label_col']].map(cfg['label_transform']).tolist()
else:
    labels = train_df[cfg['label_col']].tolist()

# split into train/test
if cfg['has_test_file']:
    # built‐in test split
    test_df = pd.read_csv(f"{cfg['path']}/{cfg['test_file']}")
    if len(cfg['text_cols']) > 1:
        test_texts = test_df[cfg['text_cols'][0]].astype(str) + " " + test_df[cfg['text_cols'][1]].astype(str)
    else:
        test_texts = test_df[cfg['text_cols'][0]].astype(str)
    if 'label_shift' in cfg:
        test_labels = (test_df[cfg['label_col']] + cfg['label_shift']).tolist()
    elif 'label_transform' in cfg:
        test_labels = test_df[cfg['label_col']].map(cfg['label_transform']).tolist()
    else:
        test_labels = test_df[cfg['label_col']].tolist()

    train_texts = texts.tolist()
    train_labels = labels
else:
    # sequential split: first (1–TEST_SIZE) for train, last TEST_SIZE for test
    split_idx = int(len(texts) * (1 - TEST_SIZE))
    train_texts = texts.tolist()[:split_idx]
    train_labels = labels[:split_idx]
    test_texts  = texts.tolist()[split_idx:]
    test_labels = labels[split_idx:]

print(f"{DATASET}: #train={len(train_texts)}  #test={len(test_texts)}")


amazon: #train=454763  #test=113691


In [34]:
# Cell 3: Load data from nltk
# ─────────────────────────────

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eduardo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eduardo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/eduardo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [35]:
# Cell 4: Defining pre-processing tools
# ─────────────────────────────

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = text.lower().split()
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words and len(t) > 2]
    return tokens

In [36]:
# Cell 5: Preprocess train data and Create train corpus
# ─────────────────────────────

processed_train = [preprocess(doc) for doc in train_texts]
dictionary = corpora.Dictionary(processed_train)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=20000)

corpus_train = [dictionary.doc2bow(doc) for doc in processed_train]

In [37]:
# Cell 6: Define a function to extract the topic distribuition vector for
# each document.
# ─────────────────────────────

def get_topic_vector(doc, model):
    bow = dictionary.doc2bow(preprocess(doc))
    topic_dist = model.get_document_topics(bow, minimum_probability=0)
    vec = np.zeros(10)
    for idx, prob in topic_dist:
        vec[idx] = prob
    return vec

In [None]:
# Cell 7: Summarize LDA results with ULMFiT‐style metrics
# ───────────────────────────────────────────────────────

fractions = [0.2, 0.4, 0.6, 0.8]
rows = []
baseline_frac = fractions[0]
baseline_error = None

for frac in fractions:
    n = int(len(train_labels) * frac)
    X_frac = X_train[:n]
    corpus_frac = corpus_train[:n]
    y_frac = np.array(train_labels[:n])

    
    lda_model = LdaMulticore(corpus=corpus_frac,
                         id2word=dictionary,
                         num_topics=10,
                         passes=10,
                         workers=2,
                         random_state=42,
                         chunksize=100)

    X_train = np.array([get_topic_vector(doc, lda_model) for doc in train_texts])
    X_test = np.array([get_topic_vector(doc, lda_model) for doc in test_texts])

    clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
    clf.fit(X_train, train_labels)

    y_pred = clf.predict(X_test)
    acc = accuracy_score(test_labels, y_pred)
    print(f"Test accuracy: {acc:.4f}")

    acc = accuracy_score(test_labels, y_pred)
    err = 1.0 - acc
    if frac == baseline_frac:
        baseline_error = err
    rel = (baseline_error - err) / baseline_error * 100 if baseline_error else 0.0

    rows.append({
        "fraction_%":        int(frac*100),
        "accuracy":          acc,
        "error_rate":        err,
        "rel_err_reduction": rel
    })

df = pd.DataFrame(rows).set_index("fraction_%")
print(df)

# save to CSV
results_dir = f"./lda/{DATASET}/results"
os.makedirs(results_dir, exist_ok=True)
output_path = os.path.join(results_dir, "lda_ulmfit_metrics.csv")
df.to_csv(output_path)
print(f"→ Saved ULMFiT‐style metrics to {output_path}")


Test accuracy: 0.6456
Test accuracy: 0.6461
Test accuracy: 0.6458
Test accuracy: 0.6460
            accuracy  error_rate  rel_err_reduction
fraction_%                                         
20          0.645645    0.354355           0.000000
40          0.646093    0.353907           0.126592
60          0.645838    0.354162           0.054608
80          0.646023    0.353977           0.106734
→ Saved ULMFiT‐style metrics to ./lda/amazon/results/lda_ulmfit_metrics.csv
