In [None]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ---------------------------------------------------------------
# CONFIG
# ---------------------------------------------------------------
# Per-dataset settings: change these two together when switching datasets.
LABEL_COL = "Surprise"  # label column; must exist in the CSV below
CSV_PATH = "/content/sample_data/Surprise_anon.csv"

# Settings shared by every dataset.
SEED = 42  # used as random_state for the seeded classifiers
TEXT_COL, SPLIT_COL = "Sentence", "Split"  # Split codes: 0=train, 1=val, 2=test

def compute_metrics(y_true, y_pred):
    """Score predictions against the reference labels.

    Args:
        y_true: reference labels.
        y_pred: predicted labels, aligned with ``y_true``.

    Returns:
        dict with the keys "acc", "precision", "recall" and "f1", in that order.
        Precision, recall and F1 are requested with ``zero_division=0``.
    """
    scores = {"acc": accuracy_score(y_true, y_pred)}
    for key, scorer in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        scores[key] = scorer(y_true, y_pred, zero_division=0)
    return scores

# Load the CSV and show which columns it provides.
df = pd.read_csv(CSV_PATH)
print("Columns:", df.columns.tolist())

# Keep only the text / label / split columns and drop rows missing any of them.
df = df[[TEXT_COL, LABEL_COL, SPLIT_COL]].dropna().reset_index(drop=True)

# Partition by split code: 0=train, 1=val, 2=test.
train_df, val_df, test_df = (df[df[SPLIT_COL].eq(code)] for code in (0, 1, 2))

# Texts are cast to str and labels to int before being handed to the models.
X_train, X_val, X_test = (
    part[TEXT_COL].astype(str).tolist() for part in (train_df, val_df, test_df)
)
y_train, y_val, y_test = (
    part[LABEL_COL].astype(int).tolist() for part in (train_df, val_df, test_df)
)

print("\nDATASET:", LABEL_COL)
print("Total:", len(df))
print("Train/Val/Test:", *(len(texts) for texts in (X_train, X_val, X_test)))
print("Train label counts:", pd.Series(y_train).value_counts().to_dict())

def make_tfidf():
    """Build a new, unfitted TfidfVectorizer with the baseline feature settings.

    Settings: lowercased text, unigrams and bigrams, min_df=2, at most 50000
    features, sublinear term-frequency scaling.
    """
    return TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 2),
        min_df=2,
        max_features=50000,
        sublinear_tf=True
    )


# Every pipeline gets its own vectorizer instance. A Pipeline fits the step
# objects it was given (with the default memory=None they are not cloned), so a
# single shared instance would be re-fitted by each pipeline in turn and all
# four pipelines would end up pointing at the same fitted state.
models = {
    "TFIDF+LogReg": Pipeline([
        ("tfidf", make_tfidf()),
        ("clf", LogisticRegression(
            max_iter=2000,
            class_weight="balanced",
            random_state=SEED
        ))
    ]),
    "TFIDF+LinearSVM": Pipeline([
        ("tfidf", make_tfidf()),
        ("clf", LinearSVC(
            class_weight="balanced",
            random_state=SEED
        ))
    ]),
    "TFIDF+NaiveBayes": Pipeline([
        ("tfidf", make_tfidf()),
        ("clf", MultinomialNB())
    ]),
    "TFIDF+RandomForest": Pipeline([
        ("tfidf", make_tfidf()),
        ("clf", RandomForestClassifier(
            n_estimators=400,
            random_state=SEED,
            n_jobs=-1
        ))
    ]),
}

# Keep the module-level name `tfidf` bound for any code that still refers to it;
# it now points at the LogReg pipeline's own vectorizer.
tfidf = models["TFIDF+LogReg"].named_steps["tfidf"]

# Fit every baseline on the training split, score it on val and test, and
# collect one summary row per model.
results = []
for name, model in models.items():
    model.fit(X_train, y_train)

    val_pred, test_pred = (model.predict(texts) for texts in (X_val, X_test))
    val_m, test_m = compute_metrics(y_val, val_pred), compute_metrics(y_test, test_pred)

    # Row layout: model, then acc / f1 / precision / recall for val, then for test.
    row = {"model": name}
    for prefix, metrics in (("val", val_m), ("test", test_m)):
        for key in ("acc", "f1", "precision", "recall"):
            row[f"{prefix}_{key}"] = metrics[key]
    results.append(row)

    print(
        f"{name:18s} | TEST acc={test_m['acc']:.4f} "
        f"f1={test_m['f1']:.4f} prec={test_m['precision']:.4f} rec={test_m['recall']:.4f}"
    )

# Summary table, best test F1 first.
res_df = pd.DataFrame(results)
res_df = res_df.sort_values("test_f1", ascending=False).reset_index(drop=True)
print("\n================ BASELINE SUMMARY ================")
display(res_df)

# Persist the table under a file name derived from the label column.
out_csv = f"classic_baselines_{LABEL_COL.lower()}.csv"
res_df.to_csv(out_csv, index=False)
print("Saved:", out_csv)


Columns: ['Sentence', 'Surprise', 'Split']

DATASET: Surprise
Total: 826
Train/Val/Test: 614 102 110
Train label counts: {0: 307, 1: 307}
TFIDF+LogReg       | TEST acc=0.6364 f1=0.6000 prec=0.6667 rec=0.5455
TFIDF+LinearSVM    | TEST acc=0.6364 f1=0.6296 prec=0.6415 rec=0.6182
TFIDF+NaiveBayes   | TEST acc=0.5909 f1=0.6154 prec=0.5806 rec=0.6545
TFIDF+RandomForest | TEST acc=0.6455 f1=0.6214 prec=0.6667 rec=0.5818



Unnamed: 0,model,val_acc,val_f1,val_precision,val_recall,test_acc,test_f1,test_precision,test_recall
0,TFIDF+LinearSVM,0.598039,0.57732,0.608696,0.54902,0.636364,0.62963,0.641509,0.618182
1,TFIDF+RandomForest,0.617647,0.571429,0.65,0.509804,0.645455,0.621359,0.666667,0.581818
2,TFIDF+NaiveBayes,0.539216,0.560748,0.535714,0.588235,0.590909,0.615385,0.580645,0.654545
3,TFIDF+LogReg,0.617647,0.589474,0.636364,0.54902,0.636364,0.6,0.666667,0.545455


Saved: classic_baselines_surprise.csv


In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ============================================================
# ✅ Dynamic TF-IDF Baselines Runner (ALL datasets in one run)
# - Assumes each CSV has columns: Sentence, <LABEL>, Split (0/1/2)
# - Saves:
#   1) classic_baselines_ALL.csv (summary across all datasets)
#   2) classic_baselines_<label>.csv (per-dataset summary)
#   3) classic_baselines_BEST_PER_DATASET.csv (best model per dataset by test F1)
# ============================================================

# -------------------------
# CONFIG (edit only this block)
# -------------------------
SEED = 42
DATA_DIR = "/content/sample_data"  # folder containing all your *_anon.csv files
TEXT_COL, SPLIT_COL = "Sentence", "Split"  # Split codes: 0=train, 1=val, 2=test

# Datasets to run, listed explicitly. "label" is the label column name inside
# the file and "csv" is the file name relative to DATA_DIR.
DATASETS = [
    dict(label="Surprise", csv="Surprise_anon.csv"),
    dict(label="Sadness", csv="Sadness_anon.csv"),
    dict(label="Joy", csv="Joy_anon.csv"),
    dict(label="Fear", csv="Fear_anon.csv"),
    dict(label="Anger", csv="Anger_anon.csv"),
    dict(label="Disgust", csv="Disgust_anon.csv"),
    dict(label="Trust", csv="Trust_anon.csv"),
    dict(label="Anticipation", csv="Anticipation_anon.csv"),
]
# -------------------------


def compute_metrics(y_true, y_pred):
    """Score predictions against the reference labels.

    Args:
        y_true: reference labels.
        y_pred: predicted labels, aligned with ``y_true``.

    Returns:
        dict with the keys "acc", "precision", "recall" and "f1", in that order.
        Precision, recall and F1 are requested with ``zero_division=0``.
    """
    zero_safe_scorers = {
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        "acc": accuracy_score(y_true, y_pred),
        **{
            key: scorer(y_true, y_pred, zero_division=0)
            for key, scorer in zero_safe_scorers.items()
        },
    }


def build_models(seed=42):
    """Create the four TF-IDF baseline pipelines, keyed by display name.

    Args:
        seed: random_state given to the LogisticRegression, LinearSVC and
            RandomForestClassifier steps (MultinomialNB is built without one).

    Returns:
        dict mapping "TFIDF+LogReg", "TFIDF+LinearSVM", "TFIDF+NaiveBayes" and
        "TFIDF+RandomForest" (in that order) to a two-step Pipeline made of a
        "tfidf" step followed by a "clf" step. Each pipeline is given its own
        newly constructed TfidfVectorizer.
    """
    tfidf_settings = dict(
        lowercase=True,
        ngram_range=(1, 2),
        min_df=2,
        max_features=50000,
        sublinear_tf=True,
    )
    classifiers = (
        (
            "TFIDF+LogReg",
            LogisticRegression(max_iter=2000, class_weight="balanced", random_state=seed),
        ),
        (
            "TFIDF+LinearSVM",
            LinearSVC(class_weight="balanced", random_state=seed),
        ),
        (
            "TFIDF+NaiveBayes",
            MultinomialNB(),
        ),
        (
            "TFIDF+RandomForest",
            RandomForestClassifier(n_estimators=400, random_state=seed, n_jobs=-1),
        ),
    )
    # A new vectorizer is constructed inside the loop so no two pipelines share one.
    return {
        name: Pipeline([("tfidf", TfidfVectorizer(**tfidf_settings)), ("clf", clf)])
        for name, clf in classifiers
    }


def run_one_dataset(csv_path, label_col, text_col="Sentence", split_col="Split", seed=42):
    """Train and score every baseline pipeline on one dataset CSV.

    Args:
        csv_path: path of the CSV file to load.
        label_col: name of the label column; also written to the "dataset"
            column of the result.
        text_col: name of the text column.
        split_col: name of the split column (0=train, 1=val, 2=test).
        seed: forwarded to build_models().

    Returns:
        DataFrame with one row per model (columns: dataset, model, then
        acc / f1 / precision / recall prefixed with "val_" and "test_"),
        sorted by test_f1 in descending order. The metrics of a split that
        has no rows are NaN.

    Raises:
        ValueError: if a required column is missing from the CSV, or if the
            training split has no rows.
    """
    df = pd.read_csv(csv_path)
    missing = sorted({text_col, label_col, split_col} - set(df.columns))
    if missing:
        raise ValueError(f"Missing columns in {os.path.basename(csv_path)}: {missing}")

    # Rows with a missing text, label or split value are dropped.
    df = df[[text_col, label_col, split_col]].dropna().reset_index(drop=True)

    train_df = df[df[split_col] == 0]
    val_df   = df[df[split_col] == 1]
    test_df  = df[df[split_col] == 2]

    # Texts are cast to str and labels to int before being handed to the models.
    X_train = train_df[text_col].astype(str).tolist()
    y_train = train_df[label_col].astype(int).tolist()
    X_val   = val_df[text_col].astype(str).tolist()
    y_val   = val_df[label_col].astype(int).tolist()
    X_test  = test_df[text_col].astype(str).tolist()
    y_test  = test_df[label_col].astype(int).tolist()

    print("\n================================================")
    print("DATASET:", label_col)
    print("FILE   :", os.path.basename(csv_path))
    print("Total  :", len(df))
    print("Train/Val/Test:", len(X_train), len(X_val), len(X_test))
    print("Train label counts:", pd.Series(y_train).value_counts().to_dict())

    # Empty val/test splits are tolerated (NaN metrics), but nothing can be
    # fitted without training rows: fail here with a readable message rather
    # than letting the estimators fail on empty input.
    if not X_train:
        raise ValueError(
            f"No training rows ({split_col} == 0) in {os.path.basename(csv_path)}"
        )

    models = build_models(seed)
    rows = []

    for name, model in models.items():
        model.fit(X_train, y_train)

        val_m = _score_split(model, X_val, y_val)
        test_m = _score_split(model, X_test, y_test)

        rows.append({
            "dataset": label_col,
            "model": name,
            "val_acc": val_m["acc"],
            "val_f1": val_m["f1"],
            "val_precision": val_m["precision"],
            "val_recall": val_m["recall"],
            "test_acc": test_m["acc"],
            "test_f1": test_m["f1"],
            "test_precision": test_m["precision"],
            "test_recall": test_m["recall"],
        })

        print(f"{name:18s} | TEST acc={test_m['acc']:.4f} "
              f"f1={test_m['f1']:.4f} prec={test_m['precision']:.4f} rec={test_m['recall']:.4f}")

    res_df = pd.DataFrame(rows).sort_values("test_f1", ascending=False).reset_index(drop=True)
    return res_df


def _score_split(model, X, y):
    """Return compute_metrics() for one split, or all-NaN metrics if the split is empty."""
    if len(X) == 0:
        return {"acc": np.nan, "f1": np.nan, "precision": np.nan, "recall": np.nan}
    return compute_metrics(y, model.predict(X))


# -------------------------
# RUN ALL DATASETS
# -------------------------
# Run the baselines for every configured dataset whose CSV exists; each
# per-dataset table is written to disk as soon as it is available.
all_results = []
for ds in DATASETS:
    label, csv_path = ds["label"], os.path.join(DATA_DIR, ds["csv"])

    if os.path.exists(csv_path):
        res_df = run_one_dataset(
            csv_path,
            label_col=label,
            text_col=TEXT_COL,
            split_col=SPLIT_COL,
            seed=SEED,
        )
        all_results.append(res_df)

        out_csv = f"classic_baselines_{label.lower()}.csv"
        res_df.to_csv(out_csv, index=False)
        print("Saved:", out_csv)
    else:
        # A missing file is reported and skipped instead of aborting the run.
        print(f"\n[SKIP] File not found: {csv_path}")

# Nothing to aggregate if every file was skipped.
if not all_results:
    raise RuntimeError("No datasets were run. Check DATA_DIR and DATASETS filenames.")

# Stack the per-dataset tables into one frame (original row order is kept for the CSV).
all_df = pd.concat(all_results, ignore_index=True)

# Rank once: datasets alphabetically, and within a dataset the best test F1 first.
ranked_df = all_df.sort_values(["dataset", "test_f1"], ascending=[True, False])

# The first ranked row of each dataset is its best model by test F1.
best_per_dataset = ranked_df.groupby("dataset", as_index=False).head(1).reset_index(drop=True)

print("\n================ ALL DATASETS: FULL RESULTS ================")
display(ranked_df.reset_index(drop=True))

print("\n================ BEST PER DATASET (by TEST F1) ================")
display(best_per_dataset)

# Write both tables, then report the file names.
all_out, best_out = "classic_baselines_ALL.csv", "classic_baselines_BEST_PER_DATASET.csv"
for frame, path in ((all_df, all_out), (best_per_dataset, best_out)):
    frame.to_csv(path, index=False)
for path in (all_out, best_out):
    print("Saved:", path)



DATASET: Surprise
FILE   : Surprise_anon.csv
Total  : 826
Train/Val/Test: 614 102 110
Train label counts: {0: 307, 1: 307}
TFIDF+LogReg       | TEST acc=0.6364 f1=0.6000 prec=0.6667 rec=0.5455
TFIDF+LinearSVM    | TEST acc=0.6364 f1=0.6296 prec=0.6415 rec=0.6182
TFIDF+NaiveBayes   | TEST acc=0.5909 f1=0.6154 prec=0.5806 rec=0.6545
TFIDF+RandomForest | TEST acc=0.6455 f1=0.6214 prec=0.6667 rec=0.5818
Saved: classic_baselines_surprise.csv

DATASET: Sadness
FILE   : Sadness_anon.csv
Total  : 3606
Train/Val/Test: 2884 361 361
Train label counts: {0: 1457, 1: 1427}
TFIDF+LogReg       | TEST acc=0.6537 f1=0.6612 prec=0.6455 rec=0.6778
TFIDF+LinearSVM    | TEST acc=0.6427 f1=0.6560 prec=0.6308 rec=0.6833
TFIDF+NaiveBayes   | TEST acc=0.6704 f1=0.6775 prec=0.6614 rec=0.6944
TFIDF+RandomForest | TEST acc=0.6648 f1=0.6513 prec=0.6766 rec=0.6278
Saved: classic_baselines_sadness.csv

DATASET: Joy
FILE   : Joy_anon.csv
Total  : 6043
Train/Val/Test: 4834 604 605
Train label counts: {0: 2424, 1: 241

Unnamed: 0,dataset,model,val_acc,val_f1,val_precision,val_recall,test_acc,test_f1,test_precision,test_recall
0,Anger,TFIDF+NaiveBayes,0.511905,0.57732,0.5,0.682927,0.607143,0.611765,0.509804,0.764706
1,Anger,TFIDF+RandomForest,0.488095,0.516854,0.479167,0.560976,0.583333,0.556962,0.488889,0.647059
2,Anger,TFIDF+LogReg,0.488095,0.537634,0.480769,0.609756,0.571429,0.538462,0.477273,0.617647
3,Anger,TFIDF+LinearSVM,0.488095,0.516854,0.479167,0.560976,0.547619,0.5,0.452381,0.558824
4,Anticipation,TFIDF+LogReg,0.617647,0.606061,0.625,0.588235,0.714286,0.73913,0.68,0.809524
5,Anticipation,TFIDF+NaiveBayes,0.588235,0.611111,0.578947,0.647059,0.714286,0.73913,0.68,0.809524
6,Anticipation,TFIDF+LinearSVM,0.529412,0.529412,0.529412,0.529412,0.714286,0.727273,0.695652,0.761905
7,Anticipation,TFIDF+RandomForest,0.705882,0.666667,0.769231,0.588235,0.714286,0.7,0.736842,0.666667
8,Disgust,TFIDF+NaiveBayes,0.611111,0.678899,0.578125,0.822222,0.575758,0.621622,0.560976,0.69697
9,Disgust,TFIDF+RandomForest,0.588889,0.554217,0.605263,0.511111,0.590909,0.557377,0.607143,0.515152





Unnamed: 0,dataset,model,val_acc,val_f1,val_precision,val_recall,test_acc,test_f1,test_precision,test_recall
0,Anger,TFIDF+NaiveBayes,0.511905,0.57732,0.5,0.682927,0.607143,0.611765,0.509804,0.764706
1,Anticipation,TFIDF+LogReg,0.617647,0.606061,0.625,0.588235,0.714286,0.73913,0.68,0.809524
2,Disgust,TFIDF+NaiveBayes,0.611111,0.678899,0.578125,0.822222,0.575758,0.621622,0.560976,0.69697
3,Fear,TFIDF+NaiveBayes,0.693878,0.708995,0.654723,0.773077,0.669759,0.704319,0.670886,0.741259
4,Joy,TFIDF+NaiveBayes,0.769868,0.768719,0.796552,0.742765,0.768595,0.756944,0.792727,0.724252
5,Sadness,TFIDF+NaiveBayes,0.686981,0.722359,0.696682,0.75,0.67036,0.677507,0.661376,0.694444
6,Surprise,TFIDF+LinearSVM,0.598039,0.57732,0.608696,0.54902,0.636364,0.62963,0.641509,0.618182
7,Trust,TFIDF+NaiveBayes,0.555556,0.5625,0.556701,0.568421,0.587302,0.589474,0.57732,0.602151


Saved: classic_baselines_ALL.csv
Saved: classic_baselines_BEST_PER_DATASET.csv
