In [None]:
# Input files read by load_imdb_csv / load_alexa_tsv below.
IMDB_PATH  = "/content/sample_data/IMDB Dataset.csv"   # CSV (e.g., columns: review, sentiment)
ALEXA_PATH = "/content/sample_data/amazon_alexa.tsv"   # TSV (e.g., columns: verified_reviews, feedback)

# Requested number of cross-validation folds; make_cv may use fewer when a class is very small.
N_FOLDS = 5
# Seed passed to the CV splitter and to both classifiers.
RANDOM_STATE = 42


In [None]:
import os, re, html
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import StratifiedKFold, ShuffleSplit, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

import warnings
# NOTE(review): no category or module filter is given, so every warning raised
# after this point is hidden. Consider narrowing it to the specific categories
# that are known to be noise, otherwise real problems are hidden as well.
warnings.filterwarnings("ignore")


In [None]:
_url_re = re.compile(r'https?://\S+|www\.\S+')
_html_tag_re = re.compile(r'<.*?>')
_nonword_re = re.compile(r"[^\w\s']")
_whitespace_re = re.compile(r'\s+')

def simple_preprocess(text: str) -> str:
    """Normalise one raw review into lowercase, space-separated tokens.

    Steps, in order:
      1. strip literal HTML tags,
      2. decode HTML entities,
      3. drop URLs,
      4. lowercase,
      5. replace everything except word characters, whitespace and
         apostrophes with a space,
      6. collapse runs of whitespace and trim.

    Tags are removed *before* entities are decoded. Decoding first turns
    entity-encoded text such as ``a &lt; b and c &gt; d`` into
    ``a < b and c > d``, which the tag pattern would then delete as if it
    were markup. Stripping tags before URLs likewise keeps the visible text
    of ``<a href="http://...">text</a>`` instead of letting the URL pattern
    swallow it together with the href.

    Args:
        text: Raw review text. Any non-string value (e.g. NaN) is accepted.

    Returns:
        The cleaned string, or '' when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    t = _html_tag_re.sub(' ', text)
    t = html.unescape(t)
    t = _url_re.sub(' ', t)
    t = t.lower()
    t = _nonword_re.sub(' ', t)
    return _whitespace_re.sub(' ', t).strip()

def to_binary_labels(series: pd.Series) -> pd.Series:
    """Map common textual / boolean sentiment labels to 0 and 1.

    Recognised strings are 'positive'/'yes' (-> 1) and 'negative'/'no' (-> 0),
    matched case-insensitively and ignoring surrounding whitespace. Booleans
    become 1/0. Every other value is passed through unchanged.

    Args:
        series: Raw label column.

    Returns:
        A new Series with the same index. It is cast to int when every value
        allows it; otherwise the partially mapped Series is returned as-is so
        the caller can filter out the rows that are not 0/1.
    """
    mapping = {'positive': 1, 'negative': 0, 'yes': 1, 'no': 0}

    def _encode(value):
        if isinstance(value, (bool, np.bool_)):
            return int(value)
        if isinstance(value, str):
            # Unknown strings are kept so the caller can see and drop them.
            return mapping.get(value.strip().lower(), value)
        return value

    s = series.map(_encode)
    try:
        return s.astype(int)
    except (ValueError, TypeError, OverflowError):
        # Best effort: unmappable entries (unknown strings, NaN) remain in place.
        return s


In [None]:
def load_imdb_csv(path):
    """Load the IMDB review CSV into a frame with columns 'text' and 'label'.

    Accepts either ['review', 'sentiment'] or ['text', 'label'] columns.
    Rows with missing values or with a label that does not map to 0/1 are
    dropped, labels are stored as integers and the text is cleaned with
    simple_preprocess.

    Args:
        path: Location of the CSV file.

    Returns:
        DataFrame with a preprocessed 'text' column and an integer 0/1 'label'.

    Raises:
        AssertionError: if ``path`` does not exist.
        ValueError: if neither expected column pair is present.
    """
    assert os.path.exists(path), f"IMDB file not found: {path}"
    df = pd.read_csv(path)  # usually has 'review','sentiment'
    if {'review','sentiment'}.issubset(df.columns):
        df = df.rename(columns={'review':'text','sentiment':'label'})
    elif not {'text','label'}.issubset(df.columns):
        raise ValueError("IMDB CSV must contain either ['review','sentiment'] or ['text','label']")
    df = df[['text','label']].dropna().reset_index(drop=True)
    df['label'] = to_binary_labels(df['label'])
    df = df[df['label'].isin([0,1])].reset_index(drop=True)
    # to_binary_labels leaves the column as object dtype when any raw label
    # could not be mapped. After the filter only 0/1 remain, so cast to int:
    # scikit-learn rejects an object column of Python ints as an unknown label type.
    df['label'] = df['label'].astype(int)
    df['text'] = df['text'].map(simple_preprocess)
    return df

def load_alexa_tsv(path):
    """Load the Amazon Alexa review TSV into a frame with columns 'text' and 'label'.

    Accepts ['verified_reviews', 'feedback'] or ['text', 'label']; otherwise
    falls back to the first column whose name contains 'review'/'text' and the
    first *other* column whose name contains 'label'/'feedback'/'sentiment'.
    Rows with missing values or with a label that does not map to 0/1 are
    dropped, labels are stored as integers and the text is cleaned with
    simple_preprocess.

    Args:
        path: Location of the tab-separated file.

    Returns:
        DataFrame with a preprocessed 'text' column and an integer 0/1 'label'.

    Raises:
        AssertionError: if ``path`` does not exist.
        ValueError: if the text/label columns cannot be inferred.
    """
    assert os.path.exists(path), f"Alexa file not found: {path}"
    df = pd.read_csv(path, sep='\t')  # usually 'verified_reviews','feedback'
    if {'verified_reviews','feedback'}.issubset(df.columns):
        df = df.rename(columns={'verified_reviews':'text','feedback':'label'})
    elif not {'text','label'}.issubset(df.columns):
        # heuristic fallback
        cand_text  = next((c for c in df.columns if 'review' in c.lower() or 'text' in c.lower()), None)
        # Exclude the text column itself: a name such as 'review_sentiment'
        # matches both searches and would leave the frame without a 'text' column.
        cand_label = next((c for c in df.columns
                           if c != cand_text
                           and any(k in c.lower() for k in ['label','feedback','sentiment'])), None)
        if cand_text is None or cand_label is None:
            raise ValueError(f"Could not infer text/label columns. Found: {list(df.columns)}")
        df = df.rename(columns={cand_text:'text', cand_label:'label'})
    df = df[['text','label']].dropna().reset_index(drop=True)
    df['label'] = to_binary_labels(df['label'])
    df = df[df['label'].isin([0,1])].reset_index(drop=True)
    # to_binary_labels leaves the column as object dtype when any raw label
    # could not be mapped. After the filter only 0/1 remain, so cast to int:
    # scikit-learn rejects an object column of Python ints as an unknown label type.
    df['label'] = df['label'].astype(int)
    df['text'] = df['text'].map(simple_preprocess)
    return df

imdb  = load_imdb_csv(IMDB_PATH)
alexa = load_alexa_tsv(ALEXA_PATH)

# Report shape and class balance per corpus. The loaders keep only 0/1 labels,
# so the label sum is the number of positive rows.
for _tag, _frame in (("IMDB:", imdb), ("Alexa:", alexa)):
    _n_pos = int(_frame.label.sum())
    print(_tag, _frame.shape, "pos=", _n_pos, "neg=", len(_frame) - _n_pos)


IMDB: (50000, 2) pos= 25000 neg= 25000
Alexa: (3149, 2) pos= 2893 neg= 256


In [None]:
# Averaging mode forwarded to precision / recall / F1. 'binary' is
# scikit-learn's default and scores only the positive class (label 1), which
# is what this notebook has always reported.
# NOTE(review): the paper figures this notebook compares against list recall
# equal to accuracy, which is characteristic of 'weighted' averaging. Confirm
# the paper's convention and set this to 'weighted' if so, otherwise the
# deltas compare differently defined metrics.
SCORING_AVERAGE = 'binary'

# Scorers passed to cross_validate; the keys become the 'test_<key>' result names.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average=SCORING_AVERAGE, zero_division=0),
    'recall': make_scorer(recall_score, average=SCORING_AVERAGE, zero_division=0),
    'f1': make_scorer(f1_score, average=SCORING_AVERAGE, zero_division=0),
}

def make_cv(y, requested=5, random_state=42):
    """Pick a cross-validation splitter that the label distribution can support.

    Args:
        y: Iterable of class labels.
        requested: Desired number of folds.
        random_state: Seed for the splitter's shuffling.

    Returns:
        A shuffled StratifiedKFold with ``requested`` folds when the rarest
        class has at least that many samples, or with as many folds as the
        rarest class has samples when that is at least 2. Otherwise a
        3-iteration ShuffleSplit holding out 33% of the data.
    """
    rarest = min(Counter(y).values())
    # Stratification is impossible only when the rarest class is both below
    # the requested fold count and has fewer than two samples.
    if rarest < requested and rarest < 2:
        return ShuffleSplit(n_splits=3, test_size=0.33, random_state=random_state)
    # min() yields `requested` when the rarest class is large enough and the
    # rarest class size otherwise.
    return StratifiedKFold(
        n_splits=min(requested, rarest),
        shuffle=True,
        random_state=random_state,
    )


In [None]:
def baseline_tfidf_rf():
    """Build the paper-like baseline: unigram TF-IDF followed by a random forest.

    Returns:
        An unfitted Pipeline with steps 'tfidf' (English stop words removed,
        unigrams only) and 'rf' (300 trees, seeded with RANDOM_STATE, using
        all available cores).
    """
    vectorizer = TfidfVectorizer(ngram_range=(1,1), stop_words='english')
    forest = RandomForestClassifier(
        n_estimators=300,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )
    return Pipeline(steps=[('tfidf', vectorizer), ('rf', forest)])

def ir_tfidf_plus(k_best=20000):
    """Build the improved IR pipeline: tuned TF-IDF, chi-square selection, LinearSVC.

    Steps:
      * 'tfidf': sublinear TF, (1,2)-grams, min_df/max_df pruning, smooth IDF,
        L2 normalisation, English stop words removed.
      * 'chi2':  keeps the ``k_best`` features with the highest chi-square score.
      * 'svm':   LinearSVC seeded with RANDOM_STATE.

    Args:
        k_best: Number of features kept by the chi-square step, or 'all' to
            keep every feature. Defaults to 20000, the value previously
            hard-coded here.
            NOTE(review): on a small corpus the pruned vocabulary may hold
            fewer than ``k_best`` features; confirm how the installed
            scikit-learn handles k > n_features, or pass 'all'.

    Returns:
        An unfitted Pipeline.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer(
            stop_words='english',
            ngram_range=(1,2),
            min_df=3,
            max_df=0.9,
            sublinear_tf=True,
            smooth_idf=True,
            norm='l2'
        )),
        ('chi2', SelectKBest(chi2, k=k_best)),
        ('svm', LinearSVC(random_state=RANDOM_STATE))
    ])


In [None]:
def evaluate_two(df, dataset_name):
    """Cross-validate the baseline and the improved pipeline on one dataset.

    Both pipelines are scored on the same splitter (from make_cv) with the
    module-level ``scoring`` dict. One line of fold-averaged metrics is
    printed per model.

    Args:
        df: Frame with 'text' and 'label' columns.
        dataset_name: Label used in the printed lines and the 'dataset' column.

    Returns:
        DataFrame with columns dataset, model, accuracy, precision, recall, f1
        (fold means), sorted by F1 in descending order.

    Raises:
        Any exception raised while fitting or scoring a fold is re-raised.
    """
    X, Y = df['text'].values, df['label'].values
    cv = make_cv(Y, requested=N_FOLDS, random_state=RANDOM_STATE)

    candidates = {
        'PaperBaseline_TFIDF+RF': baseline_tfidf_rf(),
        'Your_IR_TFIDF++_chi2+SVM': ir_tfidf_plus(),
    }

    res = []
    for name, pipe in candidates.items():
        # error_score='raise': the default records NaN for a failed fold and
        # only emits a warning, and warnings are silenced in this notebook, so
        # a broken fold would otherwise turn the reported means into NaN silently.
        cv_out = cross_validate(pipe, X, Y, cv=cv, scoring=scoring, n_jobs=-1,
                                return_train_score=False, error_score='raise')
        row = {m: np.mean(cv_out['test_'+m]) for m in scoring}
        row['dataset'], row['model'] = dataset_name, name
        print(f"{dataset_name:6s} | {name:24s}  "
              f"acc={row['accuracy']:.4f}  prec={row['precision']:.4f}  rec={row['recall']:.4f}  f1={row['f1']:.4f}")
        res.append(row)
    out = pd.DataFrame(res)[['dataset','model','accuracy','precision','recall','f1']]
    return out.sort_values(['dataset','f1'], ascending=[True,False]).reset_index(drop=True)

summary_imdb  = evaluate_two(imdb,  "IMDB")
summary_alexa = evaluate_two(alexa, "Alexa")
# A bare tuple of DataFrames is displayed as nested plain-text reprs; stack the
# two summaries so the cell renders one table (the 'dataset' column tells them apart).
pd.concat([summary_imdb, summary_alexa], ignore_index=True)


IMDB   | PaperBaseline_TFIDF+RF    acc=0.8650  prec=0.8652  rec=0.8648  f1=0.8650
IMDB   | Your_IR_TFIDF++_chi2+SVM  acc=0.9047  prec=0.8977  rec=0.9136  f1=0.9056
Alexa  | PaperBaseline_TFIDF+RF    acc=0.9349  prec=0.9350  rec=0.9986  f1=0.9657
Alexa  | Your_IR_TFIDF++_chi2+SVM  acc=0.9406  prec=0.9477  rec=0.9900  f1=0.9684


(  dataset                     model  accuracy  precision   recall        f1
 0    IMDB  Your_IR_TFIDF++_chi2+SVM   0.90474   0.897683  0.91364  0.905580
 1    IMDB    PaperBaseline_TFIDF+RF   0.86500   0.865220  0.86476  0.864974,
   dataset                     model  accuracy  precision    recall        f1
 0   Alexa  Your_IR_TFIDF++_chi2+SVM  0.940614   0.947720  0.989974  0.968384
 1   Alexa    PaperBaseline_TFIDF+RF  0.934899   0.934971  0.998617  0.965741)

In [None]:
# Paper tables (we’ll compare to these)
# Reference scores in percent, keyed by dataset name and then by model name.
# Inner keys as read by compare_against_paper: A = accuracy, P = precision,
# R = recall, F1 = F1 score.
# NOTE(review): R equals A in both rows, which is what support-weighted
# averaging produces. The scorers in this notebook do not pass an `average`
# argument, so they use scikit-learn's positive-class default. Confirm the
# paper's convention before interpreting the precision/recall/F1 deltas.
paper = {
    "IMDB": {
        "PaperBaseline_TFIDF+RF": {"A": 85.80, "P": 85.80, "R": 85.80, "F1": 85.80}
    },
    "Alexa": {
        "PaperBaseline_TFIDF+RF": {"A": 93.81, "P": 94.20, "R": 93.81, "F1": 91.99}
    }
}


In [None]:
def compare_against_paper(summary_df, dataset_name):
    """Line up this notebook's scores with the paper's baseline for one dataset.

    Args:
        summary_df: Output of evaluate_two; must contain one row each for
            'PaperBaseline_TFIDF+RF' and 'Your_IR_TFIDF++_chi2+SVM'.
        dataset_name: Key into the module-level ``paper`` dict.

    Returns:
        A pair ``(table, verdicts)``. ``table`` has one row per metric
        (Accuracy, Precision, Recall, F1) holding the paper value, both
        reproduced values in percent, and the difference between the IR model
        and the paper. ``verdicts`` is one formatted sentence per row.

    Raises:
        IndexError: if either model is missing from ``summary_df``.
        KeyError: if ``dataset_name`` is not present in ``paper``.
    """
    score_columns = ('accuracy', 'precision', 'recall', 'f1')

    def _scores_in_percent(model_name):
        # Fractions in summary_df -> percentages, for the first matching row.
        first_match = summary_df[summary_df['model'] == model_name].iloc[0].to_dict()
        return {col: 100 * first_match[col] for col in score_columns}

    reproduced = _scores_in_percent('PaperBaseline_TFIDF+RF')
    improved = _scores_in_percent('Your_IR_TFIDF++_chi2+SVM')
    published = paper[dataset_name]['PaperBaseline_TFIDF+RF']

    # (display name, summary_df column, key in the paper dict)
    metric_specs = (
        ('Accuracy', 'accuracy', 'A'),
        ('Precision', 'precision', 'P'),
        ('Recall', 'recall', 'R'),
        ('F1', 'f1', 'F1'),
    )

    records = []
    for display_name, column, paper_key in metric_specs:
        records.append({
            'dataset': dataset_name,
            'metric': display_name,
            'paper_baseline': published[paper_key],
            'your_baseline': reproduced[column],
            'your_IR': improved[column],
            'Δ(your_IR − paper)': improved[column] - published[paper_key],
        })
    table = pd.DataFrame(records)

    # One human-readable sentence per metric row.
    sentences = [
        f"{dataset_name} | {rec['metric']}: Your IR = {rec['your_IR']:.2f} vs Paper = {rec['paper_baseline']:.2f}  → Δ = {rec['Δ(your_IR − paper)']:+.2f} pp"
        for rec in records
    ]
    return table, sentences

cmp_imdb, verdicts_imdb = compare_against_paper(summary_imdb,  "IMDB")
cmp_alexa, verdicts_alexa = compare_against_paper(summary_alexa, "Alexa")

# Styler.format applies the format string to every cell it is given. The
# 'dataset' and 'metric' columns hold strings, on which "{:.2f}" raises
# "Unknown format code 'f' for object of type 'str'", so format only the
# numeric columns.
print("=== Comparison vs Paper (Your IR − Paper) — IMDB ===")
display(cmp_imdb.style.format("{:.2f}", subset=list(cmp_imdb.select_dtypes(include="number").columns)))
for v in verdicts_imdb: print(v)

print("\n=== Comparison vs Paper (Your IR − Paper) — Alexa ===")
display(cmp_alexa.style.format("{:.2f}", subset=list(cmp_alexa.select_dtypes(include="number").columns)))
for v in verdicts_alexa: print(v)


=== Comparison vs Paper (Your IR − Paper) — IMDB ===


ValueError: Unknown format code 'f' for object of type 'str'

<pandas.io.formats.style.Styler at 0x7a5db99888f0>

IMDB | Accuracy: Your IR = 90.47 vs Paper = 85.80  → Δ = +4.67 pp
IMDB | Precision: Your IR = 89.77 vs Paper = 85.80  → Δ = +3.97 pp
IMDB | Recall: Your IR = 91.36 vs Paper = 85.80  → Δ = +5.56 pp
IMDB | F1: Your IR = 90.56 vs Paper = 85.80  → Δ = +4.76 pp

=== Comparison vs Paper (Your IR − Paper) — Alexa ===


ValueError: Unknown format code 'f' for object of type 'str'

<pandas.io.formats.style.Styler at 0x7a5db99dcc50>

Alexa | Accuracy: Your IR = 94.06 vs Paper = 93.81  → Δ = +0.25 pp
Alexa | Precision: Your IR = 94.77 vs Paper = 94.20  → Δ = +0.57 pp
Alexa | Recall: Your IR = 99.00 vs Paper = 93.81  → Δ = +5.19 pp
Alexa | F1: Your IR = 96.84 vs Paper = 91.99  → Δ = +4.85 pp
