In [None]:
# ====== FULL COLAB-FRIENDLY PREPROCESSING CELL ======
import re, unicodedata, pandas as pd
from pathlib import Path

# Optional lemmatization (toggle below). If you want lemmatization, uncomment the nltk lines and run downloads.
_LEMMATIZE = False
if _LEMMATIZE:
    import nltk
    from nltk.stem import WordNetLemmatizer
    nltk.download('punkt', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('omw-1.4', quiet=True)
    lemmatizer = WordNetLemmatizer()

# Try to use contractions lib if present (best), otherwise fallback mapping below.
try:
    import contractions as _contractions_lib
    _HAS_CONTRACTIONS = True
except Exception:
    _HAS_CONTRACTIONS = False

# ---------------------- LARGE FALLBACK CONTRACTION MAP ----------------------
# (Includes standard, informal, and spaced/no-apostrophe variants.)
_FALLBACK = {
    # negatives
    "don't":"do not","do n't":"do not","don t":"do not",
    "doesn't":"does not","does n't":"does not","does t":"does not",
    "didn't":"did not","did n't":"did not","did t":"did not",
    "can't":"cannot","can t":"cannot","ca n't":"cannot",
    "won't":"will not","won t":"will not","wo n't":"will not",
    "isn't":"is not","is n't":"is not","isn t":"is not",
    "aren't":"are not","are n't":"are not","aren t":"are not",
    "wasn't":"was not","was n't":"was not","wasn t":"was not",
    "weren't":"were not","were n't":"were not","weren t":"were not",
    "shouldn't":"should not","should n't":"should not","shouldn t":"should not",
    "wouldn't":"would not","would n't":"would not","wouldn t":"would not",
    "couldn't":"could not","could n't":"could not","couldn t":"could not",
    "mightn't":"might not","might n't":"might not","mightn t":"might not",
    "mustn't":"must not","must n't":"must not","mustn t":"must not",
    "needn't":"need not","need n't":"need not","needn t":"need not",
    "shan't":"shall not","shan t":"shall not",
    "ain't":"is not","ain t":"is not",

    # have / 've
    "i've":"i have","i ve":"i have",
    "you've":"you have","you ve":"you have",
    "we've":"we have","we ve":"we have",
    "they've":"they have","they ve":"they have",
    "could've":"could have","could ve":"could have",
    "should've":"should have","should ve":"should have",
    "would've":"would have","would ve":"would have",
    "must've":"must have","must ve":"must have",
    "might've":"might have","might ve":"might have",

    # will / 'll
    "i'll":"i will","i ll":"i will",
    "you'll":"you will","you ll":"you will",
    "he'll":"he will","he ll":"he will",
    "she'll":"she will","she ll":"she will",
    "we'll":"we will","we ll":"we will",
    "they'll":"they will","they ll":"they will",
    "there'll":"there will","there ll":"there will",

    # would / had / 'd
    "i'd":"i would","i d":"i would","i'd've":"i would have","i d ve":"i would have",
    "you'd":"you would","you d":"you would",
    "he'd":"he would","he d":"he would",
    "she'd":"she would","she d":"she would",
    "we'd":"we would","we d":"we would",
    "they'd":"they would","they d":"they would",

    # be / are
    "you're":"you are","you re":"you are",
    "we're":"we are","we re":"we are",
    "they're":"they are","they re":"they are",
    "who're":"who are","who re":"who are",

    # is / 's
    "he's":"he is","he s":"he is",
    "she's":"she is","she s":"she is",
    "it's":"it is","it s":"it is",
    "what's":"what is","what s":"what is",
    "that's":"that is","that s":"that is",
    "there's":"there is","there s":"there is",
    "here's":"here is","here s":"here is",

    # am
    "i'm":"i am","i m":"i am",

    # let us
    "let's":"let us","let s":"let us",

    # colloquial / spoken
    "gonna":"going to","wanna":"want to","gotta":"got to",
    "kinda":"kind of","sorta":"sort of","lemme":"let me","gimme":"give me",
    "outta":"out of","lotta":"lot of","y'know":"you know","y all":"you all","y'all":"you all",

    # contractions common in subtitles
    "o'clock":"of the clock","c'mon":"come on","ma'am":"madam",

    # extra safe spaced/no-apostrophe patterns
    "don t":"do not","won t":"will not","can t":"cannot","isn t":"is not","aren t":"are not",
    "ve":" have", "'ve":" have", "'re":" are", "'ll":" will", "'d":" would"
}

# compile fallback regex once (longer keys first)
_FALLBACK_PATTERN = re.compile(
    r"\b(" + "|".join(sorted((re.escape(k) for k in _FALLBACK.keys()), key=len, reverse=True)) + r")\b",
    flags=re.IGNORECASE
)

# emoji/pictograph regex
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002700-\U000027BF"
    "\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE)

# punctuation removal (we will remove after expansions)
_PUNC_RE = re.compile(r"[^\w\s]", flags=re.UNICODE)

# mapping for common cp1252 / mojibake quote/artifact characters -> ASCII apostrophe/double-quote
_ENCODING_ARTIFACTS = {
    '\x92': "'", '\x91': "'", '\x93': '"', '\x94': '"',
    '': "'", '‘': "'", '’': "'", '\u2019': "'", '\u2018': "'",
    'Â’': "'", 'â€™': "'", '\u00C2\u2019': "'", '�': "'", 'Ã': ''
}

def _fix_encoding_artifacts(text: str) -> str:
    if not isinstance(text, str):
        return text
    text = unicodedata.normalize('NFKC', text)
    # replace known artifacts
    for k, v in _ENCODING_ARTIFACTS.items():
        if k in text:
            text = text.replace(k, v)
    # remove stray standalone Â if present
    text = text.replace('Â', '')
    # ensure curly quotes converted
    for ch in ["\u2019","\u2018","’","‘","`","´"]:
        text = text.replace(ch, "'")
    return text

def _expand_contractions(text: str) -> str:
    if not isinstance(text, str):
        return text
    # use library if available
    if _HAS_CONTRACTIONS:
        try:
            text = _contractions_lib.fix(text)
        except Exception:
            pass
    # fallback map replacements
    text = _FALLBACK_PATTERN.sub(lambda m: _FALLBACK[m.group(0).lower()], text)
    # handle split forms like "must 've" / "must ve" -> "must have"
    text = re.sub(r"\b([A-Za-z]+)[\s']ve\b", r"\1 have", text, flags=re.IGNORECASE)
    # handle don't / can't separated: "don t", "don 't" -> "do not", "can t" -> "cannot"
    text = re.sub(r"\bdon[\s']t\b", "do not", text, flags=re.IGNORECASE)
    text = re.sub(r"\bcan[\s']t\b", "cannot", text, flags=re.IGNORECASE)
    # generic "<word> 't" -> "<word> not" (special-casing 'can' already handled)
    text = re.sub(r"\b([A-Za-z]+)[\s']t\b", lambda m: (m.group(1) + " not") if m.group(1).lower() not in ("ca","can") else "cannot", text, flags=re.IGNORECASE)
    return text

def _fix_split_contractions_post(text: str) -> str:
    # extra heuristics after punctuation removal (space-split tokens)
    rules = {
        r"\byou re\b":"you are", r"\bwe re\b":"we are", r"\bthey re\b":"they are",
        r"\bi m\b":"i am", r"\bi ll\b":"i will", r"\bi d\b":"i would",
        r"\bmust ve\b":"must have", r"\bshould ve\b":"should have",
        r"\bwould ve\b":"would have", r"\bcould ve\b":"could have",
        r"\bdon t\b":"do not", r"\bwon t\b":"will not", r"\bcan t\b":"cannot"
    }
    for p, rpl in rules.items():
        text = re.sub(p, rpl, text, flags=re.IGNORECASE)
    return text

def clean_utterance(text: str, lemmatize: bool = False) -> str:
    # 1) fix encoding/artifacts and normalize quotes -> ASCII apostrophe
    t = _fix_encoding_artifacts(text)
    # 2) expand contractions while apostrophes are present
    t = _expand_contractions(t)
    # 3) remove emojis/pictographs
    t = _EMOJI_RE.sub("", t)
    # 4) lowercase
    t = t.lower()
    # 5) remove punctuation (now safe)
    t = _PUNC_RE.sub(" ", t)
    # 6) collapse whitespace
    t = re.sub(r"\s+", " ", t).strip()
    # 7) fix split contractions produced by missing apostrophes
    t = _fix_split_contractions_post(t)
    # 8) optional lemmatization
    if lemmatize and _LEMMATIZE:
        # simple tokenization and lemmatize each word
        tokens = re.findall(r"\w+", t)
        tokens = [lemmatizer.lemmatize(tok) for tok in tokens]
        t = " ".join(tokens)
    return t

# -------------------- File processing (Colab friendly) --------------------
def clean_meld_file_colab(input_path: str, output_path: str = None,
                          text_col: str = "Utterance", label_col: str = "Emotion",
                          lemmatize: bool = False):
    p = Path(input_path)
    if not p.exists():
        raise FileNotFoundError(f"{input_path} not found")
    # read with latin1 (preserves cp1252 bytes) so we can normalize deterministically
    try:
        df = pd.read_csv(p, encoding='latin1', dtype=str)
    except Exception:
        df = pd.read_csv(p, encoding='utf-8', dtype=str, errors='replace')
    # normalize header names
    df.columns = [c.strip() for c in df.columns]
    if text_col not in df.columns:
        raise KeyError(f"Column '{text_col}' not found. Columns: {df.columns.tolist()}")
    # apply cleaning
    df["clean_text"] = df[text_col].astype(str).apply(lambda x: clean_utterance(x, lemmatize=lemmatize))


    # ---------- GRAPH ANALYSIS ----------
import matplotlib.pyplot as plt

# text length column
df["text_len"] = df["clean_text"].apply(lambda x: len(x.split()))

# text length distribution
plt.figure()
df["text_len"].plot(kind="hist", bins=40)
plt.title("Text Length Distribution")
plt.xlabel("Words per utterance")
plt.ylabel("Frequency")
plt.show()

# if labels exist
if "label" in df.columns:
    plt.figure()
    df["label"].value_counts().sort_index().plot(kind="bar")
    plt.title("Label Distribution")
    plt.xlabel("Label id")
    plt.ylabel("Count")
    plt.show()


    # drop rows with empty cleaned text (optional)
    df = df[df["clean_text"].str.strip() != ""].copy()
    # optional label encoding
    if label_col in df.columns:
        label_order = ['neutral','joy','sadness','anger','fear','disgust','surprise']
        df["label"] = df[label_col].astype(str).str.lower().map({k:i for i,k in enumerate(label_order)})
    # save cleaned file as UTF-8
    if output_path is None:
        output_path = p.with_name(p.stem + "_cleaned_utf8.csv")
    df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"Saved cleaned file to: {output_path}  (rows: {len(df)})")
    return df

# -------------------- Quick demo / usage instructions --------------------
print("Preprocessing functions ready.\nUsage in Colab:")
print("1) Upload your file (files.upload())")
print("2) cleaned = clean_meld_file_colab('yourfile.csv', 'yourfile_cleaned.csv')")
print("3) files.download('yourfile_cleaned.csv') to retrieve it\n")

# Quick diagnostic example on common problematic strings:
_examples = [
    "Now youÂ’ll be heading a whole division, so youÂ’ll have a lot of duties.",
    "You mustÂ’ve had your hands full.",
    "No donÂ’t I beg of you!",
    "So lets talk a little bit about your duties."
]
print("Demo conversions (RAW => CLEAN):")
for s in _examples:
    print("RAW:", s, "=> CLEAN:", clean_utterance(s))


NameError: name 'df' is not defined

In [None]:
from google.colab import files
uploaded = files.upload()


Saving train_cleaned.csv to train_cleaned (1).csv


In [None]:
df_clean = clean_meld_file_colab("train_sent_emo.csv", "train_cleaned.csv",lemmatize=False)



Saved cleaned file to: train_cleaned.csv  (rows: 9988)


Baseline models training ->

In [None]:
# =============================================================
# BASELINE MODEL TRAINING PIPELINE (FULL + SAFE VERSION) + RandomForest
# =============================================================

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, classification_report,
                             confusion_matrix, balanced_accuracy_score)
from sklearn.preprocessing import LabelEncoder
import joblib
import warnings
warnings.filterwarnings("ignore")


# =============================================================
# USER INPUT — CSV FILE PATH
# =============================================================
CSV_PATH = input("Enter CSV file path: ").strip()  # e.g. train_cleaned.csv

TEXT_COL = "clean_text"   # your dataset column
LABEL_COL = "label"       # numeric emotion IDs

# Emotion order provided
label_order = ['neutral','joy','sadness','anger','fear','disgust','surprise']


# =============================================================
# LOAD & CLEAN DATA (SAFE VERSION)
# =============================================================
def load_and_prepare(csv_path):

    print("\n=================================")
    print("📌 Loading CSV...")
    print("=================================")

    df = pd.read_csv(csv_path)

    print("\nColumns found:")
    print(df.columns.tolist())

    print("\nChecking missing values in each column:")
    print(df.isna().sum())

    # Ensure text column exists
    if TEXT_COL not in df.columns:
        raise ValueError(f"ERROR: Text column '{TEXT_COL}' not found. Available: {df.columns.tolist()}")

    # Ensure label column exists
    if LABEL_COL not in df.columns:
        raise ValueError(f"ERROR: Label column '{LABEL_COL}' not found. Available: {df.columns.tolist()}")

    # Clean text
    texts = df[TEXT_COL].astype(str).fillna("").tolist()

    # Load labels
    labels_raw = df[LABEL_COL]

    # If label column is numeric already
    if labels_raw.dtype != object:
        y = labels_raw.fillna(0).astype(int).tolist()

        # Build label encoder manually to match label_order
        class SimpleLE:
            def __init__(self, classes):
                self.classes_ = np.array(classes)

        le = SimpleLE(label_order)
        return texts, y, le

    # If labels are strings → map strings to IDs
    labels_raw = labels_raw.astype(str).str.strip().str.lower()

    mapping = {lab: idx for idx, lab in enumerate(label_order)}

    y_mapped = labels_raw.map(mapping)

    # Find unmapped labels
    missing_mask = y_mapped.isna()
    missing_count = missing_mask.sum()

    if missing_count > 0:
        print("\n⚠ WARNING: Unknown / Missing labels found!")
        print("These labels do NOT exist in label_order:")
        print(labels_raw[missing_mask].value_counts())

        print("\nDropping rows with missing labels...")
        df = df[~missing_mask]
        texts = df[TEXT_COL].astype(str).fillna("").tolist()
        y_mapped = df[LABEL_COL].astype(str).str.strip().str.lower().map(mapping)

    y = y_mapped.astype(int).tolist()

    # Build a simple label encoder (in correct order)
    class SimpleLE:
        def __init__(self, classes):
            self.classes_ = np.array(classes)

    le = SimpleLE(label_order)

    print("\nFinal label distribution:")
    print(pd.Series(y).value_counts().sort_index())

    return texts, y, le


# =============================================================
# EVALUATION FUNCTION
# =============================================================
def print_eval(y_true, y_pred, le):
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    bal_acc = balanced_accuracy_score(y_true, y_pred)

    print(f"Accuracy: {acc:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Balanced Accuracy: {bal_acc:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, target_names=le.classes_))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("-" * 60)


# =============================================================
# MODEL PIPELINES (added RandomForest)
# =============================================================
def get_pipelines():

    tfidf = TfidfVectorizer(
        max_features=20000,
        ngram_range=(1, 2),
        sublinear_tf=True
    )

    pipelines = {

        "logistic_regression": Pipeline([
            ("tfidf", tfidf),
            ("clf", LogisticRegression(
                max_iter=2000,
                multi_class="multinomial",
                solver="saga",
                class_weight="balanced"
            ))
        ]),

        "linear_svc": Pipeline([
            ("tfidf", tfidf),
            ("clf", LinearSVC(class_weight="balanced"))
        ]),

        "multinomial_nb": Pipeline([
            ("tfidf", tfidf),
            ("clf", MultinomialNB())
        ]),

        "linear_svr": Pipeline([
            ("tfidf", tfidf),
            ("clf", LinearSVR(C=1.0, epsilon=0.1))
        ]),

        "lightgbm": Pipeline([
            ("tfidf", tfidf),
            ("clf", LGBMClassifier(
                objective="multiclass",
                num_class=len(label_order),
                n_estimators=300,
                learning_rate=0.1,
                num_leaves=31,
                class_weight="balanced",
                random_state=42
            ))
        ]),

        # Random Forest pipeline (added)
        "random_forest": Pipeline([
            ("tfidf", tfidf),
            ("clf", RandomForestClassifier(
                n_estimators=200,
                max_features="sqrt",
                class_weight="balanced",
                n_jobs=-1,
                random_state=42
            ))
        ]),
    }

    return pipelines


# =============================================================
# TRAIN + EVALUATE
# =============================================================
def train_test_evaluate(X, y, le):

    pipelines = get_pipelines()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, stratify=y, random_state=42
    )

    for name, pipe in pipelines.items():

        print(f"\n==============================")
        print(f"TRAINING MODEL → {name}")
        print("==============================")

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        # LinearSVR output must be rounded
        if name == "linear_svr":
            y_pred = np.clip(np.rint(y_pred), 0, len(label_order) - 1).astype(int)

        print_eval(y_test, y_pred, le)

        joblib.dump(pipe, f"{name}_pipeline.joblib")
        print(f"Model saved → {name}_pipeline.joblib")


# =============================================================
# MAIN
# =============================================================
if __name__ == "__main__":
    print("\nLoading dataset...")
    X, y, le = load_and_prepare(CSV_PATH)

    print("\nDataset loaded successfully.")
    print(f"Total samples: {len(X)}")

    train_test_evaluate(X, y, le)

    print("\nTraining finished! All models saved.")


Enter CSV file path: train_cleaned.csv

Loading dataset...

📌 Loading CSV...

Columns found:
['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime', 'clean_text', 'label']

Checking missing values in each column:
Sr No.          0
Utterance       0
Speaker         0
Emotion         0
Sentiment       0
Dialogue_ID     0
Utterance_ID    0
Season          0
Episode         0
StartTime       0
EndTime         0
clean_text      0
label           0
dtype: int64

Dataset loaded successfully.
Total samples: 9988

TRAINING MODEL → logistic_regression
Accuracy: 0.3478
Macro F1: 0.2852
Balanced Accuracy: 0.3503

Classification Report:
              precision    recall  f1-score   support

     neutral       0.65      0.36      0.46       942
         joy       0.45      0.30      0.36       349
     sadness       0.18      0.21      0.19       137
       anger       0.24      0.24      0.24       222
        fear      

In [None]:
# sbert_baseline_training.py
"""
Train baseline models using Sentence-BERT embeddings.
Saves model files: <model_name>_sbert.joblib
"""

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, balanced_accuracy_score
import joblib
import warnings
warnings.filterwarnings("ignore")

# Sentence-BERT
from sentence_transformers import SentenceTransformer
MODEL_NAME = "all-MiniLM-L6-v2"  # fast & good; change if you prefer larger models

# -------------------------
# USER INPUT / CONFIG
# -------------------------
CSV_PATH = input("Enter CSV file path: ").strip()  # e.g. train_cleaned.csv
TEXT_COL = "clean_text"
LABEL_COL = "label"

label_order = ['neutral','joy','sadness','anger','fear','disgust','surprise']

# Toggle scaling (LinearSVC may work fine without scaling for SBERT, but scaling is harmless)
SCALE_EMBEDDINGS = True
RANDOM_STATE = 42

# -------------------------
# LOAD + PREPARE
# -------------------------
def load_and_prepare(csv_path):
    print("Loading CSV:", csv_path)
    df = pd.read_csv(csv_path)
    print("Columns:", df.columns.tolist())

    if TEXT_COL not in df.columns:
        raise ValueError(f"Text column '{TEXT_COL}' not found.")
    if LABEL_COL not in df.columns:
        raise ValueError(f"Label column '{LABEL_COL}' not found.")

    texts = df[TEXT_COL].astype(str).fillna("").tolist()
    labels_raw = df[LABEL_COL]

    # If numeric labels already
    if labels_raw.dtype != object:
        y = labels_raw.fillna(0).astype(int).tolist()
        class SimpleLE:
            def __init__(self, classes): self.classes_ = np.array(classes)
        le = SimpleLE(label_order)
        return texts, y, le

    # Map textual labels to indices using label_order
    labels_raw = labels_raw.astype(str).str.strip().str.lower()
    mapping = {lab: idx for idx, lab in enumerate(label_order)}
    y_mapped = labels_raw.map(mapping)

    missing_mask = y_mapped.isna()
    if missing_mask.any():
        print("WARNING: Dropping rows with unknown labels:", labels_raw[missing_mask].unique())
        df = df[~missing_mask]
        texts = df[TEXT_COL].astype(str).fillna("").tolist()
        y_mapped = df[LABEL_COL].astype(str).str.strip().str.lower().map(mapping)

    y = y_mapped.astype(int).tolist()
    class SimpleLE:
        def __init__(self, classes): self.classes_ = np.array(classes)
    le = SimpleLE(label_order)

    print("Final label distribution:")
    print(pd.Series(y).value_counts().sort_index())

    return texts, y, le

# -------------------------
# SBERT EMBEDDINGS
# -------------------------
def compute_sbert_embeddings(texts, model_name=MODEL_NAME, batch_size=64):
    print(f"Loading SBERT model: {model_name}")
    model = SentenceTransformer(model_name)
    print(f"Encoding {len(texts)} texts (batch_size={batch_size}) ...")
    embeddings = model.encode(texts, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True)
    print("Embeddings shape:", embeddings.shape)
    return embeddings

# -------------------------
# EVALUATION
# -------------------------
def print_eval(y_true, y_pred, le):
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    bal_acc = balanced_accuracy_score(y_true, y_pred)
    print(f"Accuracy: {acc:.4f}  Macro F1: {macro_f1:.4f}  Balanced Acc: {bal_acc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, target_names=le.classes_))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("-" * 60)

# -------------------------
# MAIN TRAINING ROUTINE
# -------------------------
def train_on_embeddings(X, y, le):
    # create model objects
    models = {
        "linear_svc": LinearSVC(class_weight="balanced", random_state=RANDOM_STATE, max_iter=5000),
        "logistic_regression": LogisticRegression(max_iter=2000, multi_class="multinomial", solver="saga", class_weight="balanced", random_state=RANDOM_STATE),
        "random_forest": RandomForestClassifier(n_estimators=200, max_features="sqrt", class_weight="balanced", n_jobs=-1, random_state=RANDOM_STATE),
        "lightgbm": LGBMClassifier(objective="multiclass", num_class=len(label_order), n_estimators=300, learning_rate=0.1, num_leaves=31, class_weight="balanced", random_state=RANDOM_STATE),
        "linear_svr": LinearSVR(C=1.0, epsilon=0.1, random_state=RANDOM_STATE),
    }

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE)

    # Optional scaling
    scaler = None
    if SCALE_EMBEDDINGS:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        joblib.dump(scaler, "sbert_scaler.joblib")
        print("Saved scaler -> sbert_scaler.joblib")

    for name, clf in models.items():
        print(f"\nTraining {name} ...")
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        if name == "linear_svr":
            # convert continuous outputs to nearest class
            y_pred = np.clip(np.rint(y_pred), 0, len(label_order) - 1).astype(int)

        print_eval(y_test, y_pred, le)

        # save model
        fname = f"{name}_sbert.joblib"
        joblib.dump(clf, fname)
        print("Saved model ->", fname)

# -------------------------
# ENTRY POINT
# -------------------------
if __name__ == "__main__":
    texts, y, le = load_and_prepare(CSV_PATH)
    print(f"Loaded {len(texts)} texts")

    # compute embeddings (this may take time on CPU)
    embeddings = compute_sbert_embeddings(texts)

    # train models on embeddings
    train_on_embeddings(embeddings, y, le)

    print("All done.")


Enter CSV file path: train_cleaned.csv
Loading CSV: train_cleaned.csv
Columns: ['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime', 'clean_text', 'label']
Loaded 9988 texts
Loading SBERT model: all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding 9988 texts (batch_size=64) ...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Embeddings shape: (9988, 384)
Saved scaler -> sbert_scaler.joblib

Training linear_svc ...
Accuracy: 0.4164  Macro F1: 0.3078  Balanced Acc: 0.3434

Classification Report:
              precision    recall  f1-score   support

     neutral       0.66      0.51      0.57       942
         joy       0.41      0.38      0.40       349
     sadness       0.25      0.33      0.28       137
       anger       0.25      0.19      0.22       222
        fear       0.09      0.28      0.13        53
     disgust       0.09      0.28      0.14        54
    surprise       0.39      0.43      0.41       241

    accuracy                           0.42      1998
   macro avg       0.31      0.34      0.31      1998
weighted avg       0.48      0.42      0.44      1998


Confusion Matrix:
[[476 126  63  66  71  52  88]
 [ 90 134  20  21  23  30  31]
 [ 29   8  45  13  12  19  11]
 [ 51  25  22  43  30  25  26]
 [ 16   2   6   7  15   4   3]
 [ 10   3   5  10   7  15   4]
 [ 47  25  18  14  18  15 

In [None]:
# train_fusion_baselines.py
"""
Train baseline models on TF-IDF (reduced) + SBERT embeddings fusion.
MultinomialNB is trained on TF-IDF sparse (recommended for MNB).
Other models use concatenated dense features.
"""

import os
import numpy as np
import pandas as pd
import joblib
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (accuracy_score, f1_score, classification_report,
                             confusion_matrix, balanced_accuracy_score)
from sentence_transformers import SentenceTransformer

# -------------------------
# USER CONFIG
# -------------------------
CSV_PATH = input("Enter CSV file path: ").strip()  # e.g. train_cleaned.csv
TEXT_COL = "clean_text"
LABEL_COL = "label"

# label order (adjust if needed)
label_order = ['neutral','joy','sadness','anger','fear','disgust','surprise']

# SBERT model
SBERT_MODEL = "all-MiniLM-L6-v2"   # change to a larger model if you want
SBERT_BATCH = 64

# TF-IDF + SVD config
TFIDF_MAX_FEATURES = 20000
TFIDF_NGRAM = (1, 2)
SVD_COMPONENTS = 300  # reduce TF-IDF to 300-d dense

# training randomness
RANDOM_STATE = 42

# -------------------------
# HELPERS
# -------------------------
def load_and_prepare(csv_path):
    print("Loading CSV:", csv_path)
    df = pd.read_csv(csv_path)
    print("Columns found:", df.columns.tolist())

    if TEXT_COL not in df.columns:
        raise ValueError(f"Text column '{TEXT_COL}' not found.")
    if LABEL_COL not in df.columns:
        raise ValueError(f"Label column '{LABEL_COL}' not found.")

    texts = df[TEXT_COL].astype(str).fillna("").tolist()
    labels_raw = df[LABEL_COL]

    # numeric labels
    if labels_raw.dtype != object:
        y = labels_raw.fillna(0).astype(int).tolist()
        class SimpleLE:
            def __init__(self, classes): self.classes_ = np.array(classes)
        le = SimpleLE(label_order)
        return texts, y, le

    labels_raw = labels_raw.astype(str).str.strip().str.lower()
    mapping = {lab: idx for idx, lab in enumerate(label_order)}
    y_mapped = labels_raw.map(mapping)

    missing_mask = y_mapped.isna()
    if missing_mask.any():
        print("WARNING: Dropping rows with unknown labels:", labels_raw[missing_mask].unique())
        df = df[~missing_mask]
        texts = df[TEXT_COL].astype(str).fillna("").tolist()
        y_mapped = df[LABEL_COL].astype(str).str.strip().str.lower().map(mapping)

    y = y_mapped.astype(int).tolist()
    class SimpleLE:
        def __init__(self, classes): self.classes_ = np.array(classes)
    le = SimpleLE(label_order)

    print("Final label distribution (index = label id):")
    print(pd.Series(y).value_counts().sort_index())

    return texts, y, le

def print_eval(y_true, y_pred, le):
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    bal_acc = balanced_accuracy_score(y_true, y_pred)
    print(f"Accuracy: {acc:.4f}  Macro F1: {macro_f1:.4f}  Balanced Acc: {bal_acc:.4f}\n")
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=le.classes_))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("-" * 60)

# -------------------------
# SBERT embeddings
# -------------------------
def compute_sbert_embeddings(texts, model_name=SBERT_MODEL, batch_size=SBERT_BATCH):
    print(f"Loading SBERT model: {model_name}")
    model = SentenceTransformer(model_name)
    print(f"Encoding {len(texts)} texts ...")
    embeddings = model.encode(texts, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True)
    print("Embeddings shape:", embeddings.shape)
    return embeddings

# -------------------------
# MAIN
# -------------------------
if __name__ == "__main__":
    texts, y, le = load_and_prepare(CSV_PATH)
    n_samples = len(texts)
    print(f"Loaded {n_samples} samples.")

    # Compute SBERT embeddings
    embeddings = compute_sbert_embeddings(texts)

    # TF-IDF (sparse) — keep this for MNB and also reduce to dense for concatenation
    print("Fitting TF-IDF (sparse) on full texts...")
    tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, ngram_range=TFIDF_NGRAM, sublinear_tf=True)
    X_tfidf_sparse = tfidf.fit_transform(texts)
    joblib.dump(tfidf, "tfidf_vectorizer.joblib")
    print("Saved TF-IDF vectorizer -> tfidf_vectorizer.joblib; TF-IDF shape:", X_tfidf_sparse.shape)

    # TruncatedSVD to get dense TF-IDF representation for concatenation
    print(f"Reducing TF-IDF -> dense via TruncatedSVD (n_components={SVD_COMPONENTS}) ...")
    svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=RANDOM_STATE)
    X_tfidf_dense = svd.fit_transform(X_tfidf_sparse)
    joblib.dump(svd, "tfidf_truncated_svd.joblib")
    print("Saved SVD -> tfidf_truncated_svd.joblib; dense shape:", X_tfidf_dense.shape)

    # Standardize SBERT embeddings
    print("Scaling SBERT embeddings with StandardScaler ...")
    scaler = StandardScaler()
    X_emb = scaler.fit_transform(embeddings)
    joblib.dump(scaler, "sbert_scaler.joblib")
    print("Saved scaler -> sbert_scaler.joblib; emb shape:", X_emb.shape)

    # Combine features (dense)
    print("Concatenating TF-IDF dense + SBERT embeddings ...")
    X_combined = np.hstack([X_tfidf_dense, X_emb])  # shape (n_samples, SVD_COMPONENTS + emb_dim)
    print("Combined feature shape:", X_combined.shape)

    # -------------------------
    # Proper, aligned train/test split (stratified)
    # -------------------------
    idx = np.arange(len(y))
    train_idx, test_idx = train_test_split(idx, test_size=0.20, stratify=y, random_state=RANDOM_STATE)

    X_train_comb = X_combined[train_idx]
    X_test_comb  = X_combined[test_idx]

    y = np.array(y)
    y_train = y[train_idx]
    y_test  = y[test_idx]

    # TF-IDF sparse splits for MultinomialNB
    X_train_tfidf_sparse = X_tfidf_sparse[train_idx]
    X_test_tfidf_sparse  = X_tfidf_sparse[test_idx]

    print("Train/test sizes:", X_train_comb.shape[0], X_test_comb.shape[0])

    # -------------------------
    # Define models (trained on X_combined unless noted)
    # -------------------------
    models = {
        "logistic_regression": ("combined", LogisticRegression(max_iter=2000, multi_class="multinomial", solver="saga", class_weight="balanced", random_state=RANDOM_STATE)),
        "linear_svc": ("combined", LinearSVC(class_weight="balanced", C=1.0, max_iter=5000, random_state=RANDOM_STATE)),
        "multinomial_nb_tfidf": ("tfidf_sparse", MultinomialNB()),  # classic MNB on counts/TF-IDF
        "linear_svr": ("combined", LinearSVR(C=1.0, epsilon=0.1, random_state=RANDOM_STATE)),
        "lightgbm": ("combined", LGBMClassifier(objective="multiclass", num_class=len(label_order), n_estimators=300, learning_rate=0.1, num_leaves=31, class_weight="balanced", random_state=RANDOM_STATE)),
        "random_forest": ("combined", RandomForestClassifier(n_estimators=200, max_features="sqrt", class_weight="balanced", n_jobs=-1, random_state=RANDOM_STATE)),
    }

    # -------------------------
    # Train & evaluate
    # -------------------------
    for name, (mode, clf) in models.items():
        print(f"\n==============================")
        print(f"TRAINING MODEL → {name}  (mode={mode})")
        print("==============================")

        if mode == "combined":
            X_tr = X_train_comb
            X_te = X_test_comb
        elif mode == "tfidf_sparse":
            X_tr = X_train_tfidf_sparse
            X_te = X_test_tfidf_sparse
        else:
            raise ValueError("Unknown mode")

        # Fit
        clf.fit(X_tr, y_train)

        # Predict
        y_pred = clf.predict(X_te)

        # LinearSVR -> round
        if name == "linear_svr":
            y_pred = np.clip(np.rint(y_pred), 0, len(label_order)-1).astype(int)

        # Evaluate
        print_eval(y_test, y_pred, le)

        # Save: also save which preprocessors are required for inference
        if mode == "tfidf_sparse":
            # model expects TF-IDF vectorizer -> save alongside
            joblib.dump({"model": clf, "tfidf": tfidf}, f"{name}.joblib")
            print(f"Saved -> {name}.joblib (includes tfidf)")
        else:
            # model expects SVD + scaler + tfidf? For simplicity save combined pipeline pieces
            joblib.dump({
                "model": clf,
                "svd": svd,
                "scaler": scaler,
                "tfidf_for_reference": tfidf
            }, f"{name}_fusion.joblib")
            print(f"Saved -> {name}_fusion.joblib")

    print("\nAll models trained and saved.")


Enter CSV file path: train_cleaned.csv
Loading CSV: train_cleaned.csv
Columns found: ['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime', 'clean_text', 'label']
Loaded 9988 samples.
Loading SBERT model: all-MiniLM-L6-v2
Encoding 9988 texts ...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Embeddings shape: (9988, 384)
Fitting TF-IDF (sparse) on full texts...
Saved TF-IDF vectorizer -> tfidf_vectorizer.joblib; TF-IDF shape: (9988, 20000)
Reducing TF-IDF -> dense via TruncatedSVD (n_components=300) ...
Saved SVD -> tfidf_truncated_svd.joblib; dense shape: (9988, 300)
Scaling SBERT embeddings with StandardScaler ...
Saved scaler -> sbert_scaler.joblib; emb shape: (9988, 384)
Concatenating TF-IDF dense + SBERT embeddings ...
Combined feature shape: (9988, 684)
Train/test sizes: 7990 1998

TRAINING MODEL → logistic_regression  (mode=combined)
Accuracy: 0.3524  Macro F1: 0.2781  Balanced Acc: 0.3211

Classification Report:
              precision    recall  f1-score   support

     neutral       0.67      0.36      0.46       942
         joy       0.38      0.43      0.40       349
     sadness       0.21      0.33      0.25       137
       anger       0.21      0.21      0.21       222
        fear       0.09      0.32      0.14        53
     disgust       0.05      0.19 

In [None]:
# train_fusion_baselines_with_context.py
"""
Train baseline models on TF-IDF (reduced) + SBERT embeddings fusion, WITH conversational context.
Context is built by concatenating the previous k utterances from the same dialogue:
   context_text = prev_k_utt1 [SEP] prev_k_utt2 [SEP] ... [SEP] current_utterance

Required CSV columns:
 - clean_text   : the utterance text (string)
 - label        : label (string or numeric)
 - dialog_id    : dialogue identifier grouping utterances (string/int)  *preferred*
 - utt_id       : utterance order within dialogue (int)               *preferred*

If dialog_id/utt_id are missing, the script assumes the CSV is already ordered and builds global context.
"""

import os
import numpy as np
import pandas as pd
import joblib
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (accuracy_score, f1_score, classification_report,
                             confusion_matrix, balanced_accuracy_score)
from sentence_transformers import SentenceTransformer

# -------------------------
# USER CONFIG
# -------------------------
CSV_PATH = input("Enter CSV file path: ").strip()  # e.g. train_cleaned.csv
TEXT_COL = "clean_text"   # utterance text column
LABEL_COL = "label"       # label column (string or numeric)
DIALOG_COL = "dialog_id"  # optional: group id for dialogue
UTT_ID_COL = "utt_id"     # optional: ordering within dialogue (int). If not present, script uses CSV order.

# how many previous utterances to include (k)
K_CONTEXT = 2   # previous 2 utterances + current

# label order (adjust if your labels are different)
label_order = ['neutral','joy','sadness','anger','fear','disgust','surprise']

# SBERT model
SBERT_MODEL = "all-MiniLM-L6-v2"   # change to a larger SBERT if desired
SBERT_BATCH = 64

# TF-IDF + SVD config
TFIDF_MAX_FEATURES = 20000
TFIDF_NGRAM = (1, 2)
SVD_COMPONENTS = 300  # reduce TF-IDF to 300-d dense

# training randomness
RANDOM_STATE = 42

# -------------------------
# HELPERS
# -------------------------
def load_dataframe(path):
    print("Loading CSV:", path)
    df = pd.read_csv(path)
    print("Columns found:", df.columns.tolist())
    return df

def build_context_texts(df, text_col=TEXT_COL, dialog_col=DIALOG_COL, utt_col=UTT_ID_COL, k=K_CONTEXT, sep_token=" [SEP] "):
    """
    Returns a list of context-augmented texts aligned with df rows.
    Context for an utterance = concat of up to k previous utterances in same dialogue + current utterance.
    If dialog_col not in df, treat entire dataset as one long dialogue in CSV order.
    If utt_col present, sort by (dialog, utt). Otherwise use CSV order within each dialog.
    """
    print(f"Building context texts with k={k} (sep='{sep_token.strip()}') ...")
    texts_with_context = []

    # If dialog_col not present, create a single dialog grouping
    if dialog_col not in df.columns:
        print(f"Warning: '{dialog_col}' column not found — using global order as single dialogue.")
        df = df.copy()
        df["_dialog_temp"] = 0
        dialog_col2 = "_dialog_temp"
    else:
        dialog_col2 = dialog_col

    # If utt_col present, sort by it within each dialog; otherwise keep index order
    if utt_col in df.columns:
        sort_cols = [dialog_col2, utt_col]
    else:
        # use existing index order
        df = df.copy().reset_index().rename(columns={"index": "_orig_index"})
        sort_cols = [dialog_col2, "_orig_index"]

    df_sorted = df.sort_values(sort_cols).reset_index(drop=True)

    # Group by dialogue and build context
    for _, group in df_sorted.groupby(dialog_col2, sort=False):
        utterances = group[text_col].astype(str).fillna("").tolist()
        n = len(utterances)
        for i in range(n):
            start = max(0, i - k)
            context_segment = sep_token.join(utterances[start:i+1])
            texts_with_context.append(context_segment)

    # Ensure alignment with original df order: df_sorted -> we can return in sorted order which aligns with splits
    print("Built context_texts for", len(texts_with_context), "utterances.")
    return texts_with_context, df_sorted

def prepare_labels(df_sorted, label_col=LABEL_COL):
    labels_raw = df_sorted[label_col]
    # If numeric labels already
    if labels_raw.dtype != object:
        y = labels_raw.fillna(0).astype(int).tolist()
        class SimpleLE:
            def __init__(self, classes): self.classes_ = np.array(classes)
        le = SimpleLE(label_order)
        return y, le

    # Map textual labels to indices using label_order
    labels_raw = labels_raw.astype(str).str.strip().str.lower()
    mapping = {lab: idx for idx, lab in enumerate(label_order)}
    y_mapped = labels_raw.map(mapping)

    missing_mask = y_mapped.isna()
    if missing_mask.any():
        print("WARNING: Dropping rows with unknown labels:", labels_raw[missing_mask].unique())
        keep_mask = ~missing_mask
        df_sorted = df_sorted.loc[keep_mask].reset_index(drop=True)
        labels_raw = df_sorted[label_col].astype(str).str.strip().str.lower()
        y_mapped = labels_raw.map(mapping)

    y = y_mapped.astype(int).tolist()
    class SimpleLE:
        def __init__(self, classes): self.classes_ = np.array(classes)
    le = SimpleLE(label_order)
    return y, le

def print_eval(y_true, y_pred, le):
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    bal_acc = balanced_accuracy_score(y_true, y_pred)
    print(f"Accuracy: {acc:.4f}  Macro F1: {macro_f1:.4f}  Balanced Acc: {bal_acc:.4f}\n")
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=le.classes_))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("-" * 60)

# -------------------------
# SBERT embeddings
# -------------------------
def compute_sbert_embeddings(texts, model_name=SBERT_MODEL, batch_size=SBERT_BATCH):
    print(f"Loading SBERT model: {model_name}")
    model = SentenceTransformer(model_name)
    print(f"Encoding {len(texts)} texts ...")
    embeddings = model.encode(texts, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True)
    print("Embeddings shape:", embeddings.shape)
    return embeddings

# -------------------------
# MAIN
# -------------------------
if __name__ == "__main__":
    df = load_dataframe(CSV_PATH)

    # Build context texts and get sorted dataframe aligned to context list
    context_texts, df_sorted = build_context_texts(df, text_col=TEXT_COL, dialog_col=DIALOG_COL, utt_col=UTT_ID_COL, k=K_CONTEXT)
    # Prepare labels aligned with df_sorted
    y, le = prepare_labels(df_sorted, label_col=LABEL_COL)

    # Sanity check lengths
    assert len(context_texts) == len(y), "Context texts and labels lengths mismatch!"

    n_samples = len(context_texts)
    print(f"Total samples after context build: {n_samples}")

    # Compute SBERT embeddings on context_texts
    embeddings = compute_sbert_embeddings(context_texts)

    # TF-IDF (sparse) on context_texts — for MultinomialNB and for SVD reduction
    print("Fitting TF-IDF (sparse) on context_texts...")
    tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, ngram_range=TFIDF_NGRAM, sublinear_tf=True)
    X_tfidf_sparse = tfidf.fit_transform(context_texts)
    joblib.dump(tfidf, "tfidf_vectorizer_context.joblib")
    print("Saved TF-IDF vectorizer -> tfidf_vectorizer_context.joblib; TF-IDF shape:", X_tfidf_sparse.shape)

    # TruncatedSVD to get dense TF-IDF representation for concatenation
    print(f"Reducing TF-IDF -> dense via TruncatedSVD (n_components={SVD_COMPONENTS}) ...")
    svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=RANDOM_STATE)
    X_tfidf_dense = svd.fit_transform(X_tfidf_sparse)
    joblib.dump(svd, "tfidf_truncated_svd_context.joblib")
    print("Saved SVD -> tfidf_truncated_svd_context.joblib; dense shape:", X_tfidf_dense.shape)

    # Standardize SBERT embeddings
    print("Scaling SBERT embeddings with StandardScaler ...")
    scaler = StandardScaler()
    X_emb = scaler.fit_transform(embeddings)
    joblib.dump(scaler, "sbert_scaler_context.joblib")
    print("Saved scaler -> sbert_scaler_context.joblib; emb shape:", X_emb.shape)

    # Combine features (dense)
    print("Concatenating TF-IDF dense + SBERT embeddings ...")
    X_combined = np.hstack([X_tfidf_dense, X_emb])  # shape (n_samples, SVD_COMPONENTS + emb_dim)
    print("Combined feature shape:", X_combined.shape)

    # Create consistent train/test split using indices (stratified)
    idx = np.arange(len(y))
    train_idx, test_idx = train_test_split(idx, test_size=0.20, stratify=y, random_state=RANDOM_STATE)

    X_train_comb = X_combined[train_idx]
    X_test_comb = X_combined[test_idx]
    y_train = np.array(y)[train_idx]
    y_test = np.array(y)[test_idx]

    # TF-IDF sparse splits for MultinomialNB
    X_train_tfidf_sparse = X_tfidf_sparse[train_idx]
    X_test_tfidf_sparse = X_tfidf_sparse[test_idx]

    print("Train/test sizes:", X_train_comb.shape[0], X_test_comb.shape[0])

    # -------------------------
    # Define models (trained on X_combined unless noted)
    # -------------------------
    models = {
        "logistic_regression": ("combined", LogisticRegression(max_iter=2000, multi_class="multinomial", solver="saga", class_weight="balanced", random_state=RANDOM_STATE)),
        "linear_svc": ("combined", LinearSVC(class_weight="balanced", C=1.0, max_iter=5000, random_state=RANDOM_STATE)),
        "multinomial_nb_tfidf": ("tfidf_sparse", MultinomialNB()),  # classic MNB on counts/TF-IDF
        "linear_svr": ("combined", LinearSVR(C=1.0, epsilon=0.1, random_state=RANDOM_STATE)),
        "lightgbm": ("combined", LGBMClassifier(objective="multiclass", num_class=len(label_order), n_estimators=300, learning_rate=0.1, num_leaves=31, class_weight="balanced", random_state=RANDOM_STATE)),
        "random_forest": ("combined", RandomForestClassifier(n_estimators=200, max_features="sqrt", class_weight="balanced", n_jobs=-1, random_state=RANDOM_STATE)),
    }

    # -------------------------
    # Train & evaluate
    # -------------------------
    for name, (mode, clf) in models.items():
        print(f"\n==============================")
        print(f"TRAINING MODEL → {name}  (mode={mode})")
        print("==============================")

        if mode == "combined":
            X_tr = X_train_comb
            X_te = X_test_comb
        elif mode == "tfidf_sparse":
            X_tr = X_train_tfidf_sparse
            X_te = X_test_tfidf_sparse
        else:
            raise ValueError("Unknown mode")

        # Fit
        clf.fit(X_tr, y_train)

        # Predict
        y_pred = clf.predict(X_te)

        # LinearSVR -> round
        if name == "linear_svr":
            y_pred = np.clip(np.rint(y_pred), 0, len(label_order)-1).astype(int)

        # Evaluate
        print_eval(y_test, y_pred, le)

        # Save: also save which preprocessors are required for inference
        if mode == "tfidf_sparse":
            joblib.dump({"model": clf, "tfidf": tfidf}, f"{name}.joblib")
            print(f"Saved -> {name}.joblib (includes tfidf)")
        else:
            joblib.dump({
                "model": clf,
                "svd": svd,
                "scaler": scaler,
                "tfidf_for_reference": tfidf
            }, f"{name}_fusion_context.joblib")
            print(f"Saved -> {name}_fusion_context.joblib")

    print("\nAll models trained and saved.")


Enter CSV file path: train_cleaned.csv
Loading CSV: train_cleaned.csv
Columns found: ['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime', 'clean_text', 'label']
Building context texts with k=2 (sep='[SEP]') ...
Built context_texts for 9988 utterances.
Total samples after context build: 9988
Loading SBERT model: all-MiniLM-L6-v2
Encoding 9988 texts ...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Embeddings shape: (9988, 384)
Fitting TF-IDF (sparse) on context_texts...
Saved TF-IDF vectorizer -> tfidf_vectorizer_context.joblib; TF-IDF shape: (9988, 20000)
Reducing TF-IDF -> dense via TruncatedSVD (n_components=300) ...
Saved SVD -> tfidf_truncated_svd_context.joblib; dense shape: (9988, 300)
Scaling SBERT embeddings with StandardScaler ...
Saved scaler -> sbert_scaler_context.joblib; emb shape: (9988, 384)
Concatenating TF-IDF dense + SBERT embeddings ...
Combined feature shape: (9988, 684)
Train/test sizes: 7990 1998

TRAINING MODEL → logistic_regression  (mode=combined)
Accuracy: 0.2457  Macro F1: 0.2045  Balanced Acc: 0.2512

Classification Report:
              precision    recall  f1-score   support

     neutral       0.58      0.22      0.32       942
         joy       0.33      0.32      0.33       349
     sadness       0.13      0.23      0.17       137
       anger       0.20      0.26      0.22       222
        fear       0.05      0.28      0.09        53
     di

In [None]:
# train_pipelines_tfidf.py
import os
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score,
                             classification_report, confusion_matrix, balanced_accuracy_score)
import joblib
import warnings
warnings.filterwarnings("ignore")

# -------------------------
# User settings
# -------------------------
CSV_PATH = input("Enter CSV path (e.g. train_cleaned.csv): ").strip()
# TEXT_COL = "clean_text"            # original
TEXT_COL = "spk_text"                # using speaker + context column (created below)
LABEL_COL = "label"

label_order = ['neutral','joy','sadness','anger','fear','disgust','surprise']

RANDOM_STATE = 42
TEST_SIZE = 0.20
MAX_FEATURES = 20000
NGRAM_RANGE = (1,2)

# -------------------------
# Prepare dataset (context + speaker)
# -------------------------
def prepare_dataset(csv_path):
    df = pd.read_csv(csv_path)
    print("Columns found:", df.columns.tolist())
    # create prev_1 if missing
    if "prev_1" not in df.columns:
        df = df.sort_values(["Dialogue_ID", "Utterance_ID"]).reset_index(drop=True)
        df["prev_1"] = df.groupby("Dialogue_ID")["clean_text"].shift(1).fillna("")
    # create context_1 if missing
    if "context_1" not in df.columns:
        df["context_1"] = (df["prev_1"].str.strip() + " [SEP] " + df["clean_text"].str.strip()).str.replace(r"^\s*\[SEP\]\s*", "", regex=True)
    # create speaker token text
    df["spk_text"] = ("[SPK=" + df["Speaker"].astype(str) + "] " + df["context_1"].astype(str)).str.strip()
    # basic missing check
    print("\nMissing values per column:\n", df.isna().sum())
    # ensure labels map to ints using label_order
    mapping = {lab: idx for idx, lab in enumerate(label_order)}
    # If labels already ints, keep; else map strings
    if df[LABEL_COL].dtype == object or df[LABEL_COL].dtype == "string":
        df[LABEL_COL] = df[LABEL_COL].astype(str).str.strip().str.lower().map(mapping)
    # drop rows where mapping failed
    bad = df[LABEL_COL].isna().sum()
    if bad > 0:
        print(f"\nDropping {bad} rows with unknown labels.")
        df = df[~df[LABEL_COL].isna()].reset_index(drop=True)
    df[LABEL_COL] = df[LABEL_COL].astype(int)
    return df

# -------------------------
# Evaluation helper
# -------------------------
def print_eval(y_true, y_pred, label_names=None):
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    bal_acc = balanced_accuracy_score(y_true, y_pred)
    prec_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
    rec_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Balanced Accuracy: {bal_acc:.4f}")
    print(f"Precision (macro): {prec_macro:.4f}")
    print(f"Recall (macro): {rec_macro:.4f}")
    print("\nClassification report:")
    if label_names:
        print(classification_report(y_true, y_pred, target_names=label_names, zero_division=0))
    else:
        print(classification_report(y_true, y_pred, zero_division=0))
    print("Confusion matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("-" * 60)

# -------------------------
# Build pipelines (your exact definitions)
# -------------------------
def get_pipelines():
    tfidf = TfidfVectorizer(max_features=MAX_FEATURES, ngram_range=NGRAM_RANGE, sublinear_tf=True)

    pipelines = {
        "logistic_regression": Pipeline([
            ("tfidf", tfidf),
            ("clf", LogisticRegression(
                max_iter=2000,
                multi_class="multinomial",
                solver="saga",
                class_weight="balanced",
                random_state=RANDOM_STATE
            ))
        ]),

        "linear_svc": Pipeline([
            ("tfidf", tfidf),
            ("clf", LinearSVC(class_weight="balanced", random_state=RANDOM_STATE))
        ]),

        "multinomial_nb": Pipeline([
            ("tfidf", tfidf),
            ("clf", MultinomialNB())
        ]),

        "linear_svr": Pipeline([
            ("tfidf", tfidf),
            ("clf", LinearSVR(C=1.0, epsilon=0.1, max_iter=2000, random_state=RANDOM_STATE))
        ]),

        "lightgbm": Pipeline([
            ("tfidf", tfidf),
            ("clf", LGBMClassifier(
                objective="multiclass",
                num_class=len(label_order),
                n_estimators=300,
                learning_rate=0.1,
                num_leaves=31,
                class_weight="balanced",
                random_state=RANDOM_STATE
            ))
        ]),
    }
    return pipelines

# -------------------------
# Train, evaluate and save
# -------------------------
def train_all(df):
    X = df[TEXT_COL].astype(str).tolist()
    y = df[LABEL_COL].astype(int).tolist()
    label_names = label_order

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE)
    print(f"\nTrain size: {len(X_train)}, Test size: {len(X_test)}")
    pipelines = get_pipelines()

    for name, pipe in pipelines.items():
        print(f"\n>>> Training & evaluating: {name}")
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        # If LinearSVR (regressor), round and clip to valid range
        if name == "linear_svr":
            y_pred = np.rint(y_pred).astype(int)
            y_pred = np.clip(y_pred, 0, len(label_order) - 1)

        print_eval(y_test, y_pred, label_names)
        joblib.dump(pipe, f"{name}_pipeline.joblib")
        print(f"Saved: {name}_pipeline.joblib")



# -------------------------
# MAIN
# -------------------------
if __name__ == "__main__":
    df = prepare_dataset(CSV_PATH)
    print("\nDataset loaded successfully. Samples:", len(df))
    train_all(df)


Enter CSV path (e.g. train_cleaned.csv): train_cleaned.csv
Columns found: ['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime', 'clean_text', 'label']

Missing values per column:
 Sr No.          0
Utterance       0
Speaker         0
Emotion         0
Sentiment       0
Dialogue_ID     0
Utterance_ID    0
Season          0
Episode         0
StartTime       0
EndTime         0
clean_text      0
label           0
prev_1          0
context_1       0
spk_text        0
dtype: int64

Dataset loaded successfully. Samples: 9988

Train size: 7990, Test size: 1998

>>> Training & evaluating: logistic_regression
Accuracy: 0.3689
Macro F1: 0.2775
Balanced Accuracy: 0.3054
Precision (macro): 0.3151
Recall (macro): 0.3054

Classification report:
              precision    recall  f1-score   support

     neutral       0.68      0.35      0.46       942
         joy       0.41      0.36      0.39       349
     sadness   

In [None]:
# Install required libraries
!pip install transformers accelerate torch tqdm

In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from torch.optim import AdamW
from transformers.optimization import get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import warnings
import os

# Suppress warnings
warnings.filterwarnings("ignore")

# Define label order (from your notebook)
label_order = ['neutral','joy','sadness','anger','fear','disgust','surprise']
label_map = {name: i for i, name in enumerate(label_order)}
num_labels = len(label_order)

# Set random seed for reproducibility (from your notebook)
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_STATE)

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# (Copied from your notebook, cell 7)
def prepare_dataset(csv_path):
    df = pd.read_csv(csv_path)
    print("Columns found:", df.columns.tolist())
    # create prev_1 if missing
    if "prev_1" not in df.columns:
        df = df.sort_values(["Dialogue_ID", "Utterance_ID"]).reset_index(drop=True)
        df["prev_1"] = df.groupby("Dialogue_ID")["clean_text"].shift(1).fillna("")
    # create context_1 if missing
    if "context_1" not in df.columns:
        df["context_1"] = (df["prev_1"].str.strip() + " [SEP] " + df["clean_text"].str.strip()).str.replace(r"^\s*\[SEP\]\s*", "", regex=True)
    # create speaker token text
    df["spk_text"] = ("[SPK=" + df["Speaker"].astype(str) + "] " + df["context_1"].astype(str)).str.strip()
    # basic missing check
    print("\nMissing values per column:\n", df.isna().sum())
    # ensure labels map to ints using label_order
    mapping = {lab: idx for idx, lab in enumerate(label_order)}
    # If labels already ints, keep; else map strings
    if df["label"].dtype == object or df["label"].dtype == "string":
        df["label"] = df["label"].astype(str).str.strip().str.lower().map(mapping)
    # drop rows where mapping failed
    bad = df["label"].isna().sum()
    if bad > 0:
        print(f"\nDropping {bad} rows with unknown labels.")
        df = df[~df["label"].isna()].reset_index(drop=True)
    df["label"] = df["label"].astype(int)
    return df

# --- Load and prepare data ---
CSV_PATH = "train_cleaned.csv" # From your cell 3
df = prepare_dataset(CSV_PATH)

print(f"\nDataset loaded. Total samples: {len(df)}")
print("\nExample of 'spk_text':")
print(df["spk_text"].sample(1).values[0])

Columns found: ['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime', 'clean_text', 'label']

Missing values per column:
 Sr No.          0
Utterance       0
Speaker         0
Emotion         0
Sentiment       0
Dialogue_ID     0
Utterance_ID    0
Season          0
Episode         0
StartTime       0
EndTime         0
clean_text      0
label           0
prev_1          0
context_1       0
spk_text        0
dtype: int64

Dataset loaded. Total samples: 9988

Example of 'spk_text':
[SPK=Ross] look at you all jealous [SEP] yeah pheebs come on you two have completely different styles you know she is more you know and you are more


In [None]:
# --- Create Train / Validation / Test Splits ---
# We'll follow your 80/20 split from cell 7 for the test set
# Then we'll take 10% from the training data for validation

# Full dataset
X = df['spk_text'].tolist()
y = df['label'].astype(int).tolist()

# 1. Split into (Train + Val) and Test (80/20)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_STATE
)

# 2. Split (Train + Val) into Train and Validation (90/10 split of the 80%)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.10,  # 10% of the 80%
    stratify=y_train_val,
    random_state=RANDOM_STATE
)

print(f"Total samples: {len(X)}")
print(f"Training samples:   {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Test samples:       {len(X_test)}")

Total samples: 9988
Training samples:   7191
Validation samples: 799
Test samples:       1998


In [None]:
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
def train_model(model, train_dataloader, val_dataloader, epochs, optimizer, scheduler, device, model_save_path, patience=3):
    """
    Trains a transformer model with early stopping.
    """
    print(f"Starting training... Saving best model to {model_save_path}")

    best_val_loss = float('inf')
    patience_counter = 0
    history = {'train_loss': [], 'val_loss': [], 'val_accuracy': [], 'val_f1': []}

    for epoch in range(epochs):
        print(f"\n--- Epoch {epoch + 1} / {epochs} ---")

        # --- Training Phase ---
        model.train()
        total_train_loss = 0
        train_progress_bar = tqdm(train_dataloader, desc="Training", leave=False)

        for batch in train_progress_bar:
            optimizer.zero_grad()

            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Gradient clipping
            optimizer.step()
            scheduler.step()

            train_progress_bar.set_postfix({'loss': loss.item()})

        avg_train_loss = total_train_loss / len(train_dataloader)
        history['train_loss'].append(avg_train_loss)
        print(f"Average Training Loss: {avg_train_loss:.4f}")

        # --- Validation Phase ---
        model.eval()
        total_val_loss = 0
        all_val_preds = []
        all_val_labels = []
        val_progress_bar = tqdm(val_dataloader, desc="Validation", leave=False)

        with torch.no_grad():
            for batch in val_progress_bar:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_val_loss += loss.item()

                logits = outputs.logits
                preds = torch.argmax(logits, dim=1).cpu().numpy()
                all_val_preds.extend(preds)
                all_val_labels.extend(labels.cpu().numpy())

        # Calculate validation metrics
        avg_val_loss = total_val_loss / len(val_dataloader)
        val_accuracy = accuracy_score(all_val_labels, all_val_preds)
        val_f1_macro = precision_recall_fscore_support(all_val_labels, all_val_preds, average='macro', zero_division=0)[2]

        history['val_loss'].append(avg_val_loss)
        history['val_accuracy'].append(val_accuracy)
        history['val_f1'].append(val_f1_macro)

        print(f"Average Validation Loss: {avg_val_loss:.4f}")
        print(f"Validation Accuracy: {val_accuracy:.4f}")
        print(f"Validation Macro F1: {val_f1_macro:.4f}")

        # --- Early Stopping Check ---
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), model_save_path)
            print(f"Validation loss improved. Model saved to {model_save_path}")
        else:
            patience_counter += 1
            print(f"Validation loss did not improve. Patience: {patience_counter}/{patience}")

        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

    print("Training finished.")
    return history


def evaluate_model(model, test_dataloader, device):
    """
    Evaluates the model on the test set.
    """
    model.eval()
    all_test_preds = []
    all_test_labels = []
    test_progress_bar = tqdm(test_dataloader, desc="Testing", leave=False)

    with torch.no_grad():
        for batch in test_progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()

            all_test_preds.extend(preds)
            all_test_labels.extend(labels.cpu().numpy())

    return all_test_labels, all_test_preds

In [None]:
# --- 1. Define Model and Tokenizer ---
MODEL_NAME = 'roberta-base'
MODEL_SAVE_PATH = 'roberta_base_best.bin'
LEARNING_RATE = 2e-5
EPOCHS = 4
BATCH_SIZE = 16
EARLY_STOPPING_PATIENCE = 2

tokenizer_roberta = AutoTokenizer.from_pretrained(MODEL_NAME)
model_roberta = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels
).to(device)

# --- 2. Create Datasets and DataLoaders ---
train_dataset_roberta = EmotionDataset(X_train, y_train, tokenizer_roberta)
val_dataset_roberta = EmotionDataset(X_val, y_val, tokenizer_roberta)
test_dataset_roberta = EmotionDataset(X_test, y_test, tokenizer_roberta)

train_dataloader_roberta = DataLoader(train_dataset_roberta, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader_roberta = DataLoader(val_dataset_roberta, batch_size=BATCH_SIZE)
test_dataloader_roberta = DataLoader(test_dataset_roberta, batch_size=BATCH_SIZE)

# --- 3. Define Optimizer and Scheduler ---
optimizer_roberta = AdamW(model_roberta.parameters(), lr=LEARNING_RATE, eps=1e-8)

total_steps = len(train_dataloader_roberta) * EPOCHS
scheduler_roberta = get_linear_schedule_with_warmup(
    optimizer_roberta,
    num_warmup_steps=0, # You can set this to ~10% of total steps
    num_training_steps=total_steps
)

# --- 4. Train the Model ---
roberta_history = train_model(
    model=model_roberta,
    train_dataloader=train_dataloader_roberta,
    val_dataloader=val_dataloader_roberta,
    epochs=EPOCHS,
    optimizer=optimizer_roberta,
    scheduler=scheduler_roberta,
    device=device,
    model_save_path=MODEL_SAVE_PATH,
    patience=EARLY_STOPPING_PATIENCE
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training... Saving best model to roberta_base_best.bin

--- Epoch 1 / 4 ---


Training:   0%|          | 0/450 [00:00<?, ?it/s]

Average Training Loss: 1.4366


Validation:   0%|          | 0/50 [00:00<?, ?it/s]

Average Validation Loss: 1.3245
Validation Accuracy: 0.5419
Validation Macro F1: 0.2800
Validation loss improved. Model saved to roberta_base_best.bin

--- Epoch 2 / 4 ---


Training:   0%|          | 0/450 [00:00<?, ?it/s]

Average Training Loss: 1.1983


Validation:   0%|          | 0/50 [00:00<?, ?it/s]

Average Validation Loss: 1.2781
Validation Accuracy: 0.5332
Validation Macro F1: 0.2807
Validation loss improved. Model saved to roberta_base_best.bin

--- Epoch 3 / 4 ---


Training:   0%|          | 0/450 [00:00<?, ?it/s]

Average Training Loss: 1.0276


Validation:   0%|          | 0/50 [00:00<?, ?it/s]

Average Validation Loss: 1.2931
Validation Accuracy: 0.5594
Validation Macro F1: 0.3436
Validation loss did not improve. Patience: 1/2

--- Epoch 4 / 4 ---


Training:   0%|          | 0/450 [00:00<?, ?it/s]

Average Training Loss: 0.8923


Validation:   0%|          | 0/50 [00:00<?, ?it/s]

Average Validation Loss: 1.3350
Validation Accuracy: 0.5419
Validation Macro F1: 0.3386
Validation loss did not improve. Patience: 2/2
Early stopping triggered.
Training finished.


In [None]:
# --- 5. Evaluate RoBERTa on Test Set ---
print("\n--- Evaluating RoBERTa on Test Set ---")

# Load the best model weights
model_roberta.load_state_dict(torch.load(MODEL_SAVE_PATH))

# Get predictions
y_true_roberta, y_pred_roberta = evaluate_model(
    model=model_roberta,
    test_dataloader=test_dataloader_roberta,
    device=device
)

# Print the classification report
print("\nRoBERTa Classification Report:\n")
print(classification_report(y_true_roberta, y_pred_roberta, target_names=label_order, zero_division=0))

# Store results for summary table
roberta_acc = accuracy_score(y_true_roberta, y_pred_roberta)
roberta_p_macro, roberta_r_macro, roberta_f1_macro, _ = precision_recall_fscore_support(
    y_true_roberta, y_pred_roberta, average='macro', zero_division=0
)


--- Evaluating RoBERTa on Test Set ---


Testing:   0%|          | 0/125 [00:00<?, ?it/s]


RoBERTa Classification Report:

              precision    recall  f1-score   support

     neutral       0.60      0.87      0.71       942
         joy       0.54      0.43      0.48       349
     sadness       0.37      0.21      0.27       137
       anger       0.35      0.20      0.25       222
        fear       0.00      0.00      0.00        53
     disgust       0.00      0.00      0.00        54
    surprise       0.67      0.37      0.48       241

    accuracy                           0.57      1998
   macro avg       0.36      0.30      0.31      1998
weighted avg       0.52      0.57      0.52      1998



In [None]:
# --- 1. Define Model and Tokenizer ---
MODEL_NAME = 'bert-base-uncased'
MODEL_SAVE_PATH = 'bert_base_best.bin'
LEARNING_RATE = 2e-5
EPOCHS = 4
BATCH_SIZE = 16
EARLY_STOPPING_PATIENCE = 2

tokenizer_bert = AutoTokenizer.from_pretrained(MODEL_NAME)
model_bert = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels
).to(device)

# --- 2. Create Datasets and DataLoaders ---
# We can re-use the text/label splits (X_train, y_train, etc.)
train_dataset_bert = EmotionDataset(X_train, y_train, tokenizer_bert)
val_dataset_bert = EmotionDataset(X_val, y_val, tokenizer_bert)
test_dataset_bert = EmotionDataset(X_test, y_test, tokenizer_bert)

train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader_bert = DataLoader(val_dataset_bert, batch_size=BATCH_SIZE)
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=BATCH_SIZE)

# --- 3. Define Optimizer and Scheduler ---
optimizer_bert = AdamW(model_bert.parameters(), lr=LEARNING_RATE, eps=1e-8)

total_steps = len(train_dataloader_bert) * EPOCHS
scheduler_bert = get_linear_schedule_with_warmup(
    optimizer_bert,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# --- 4. Train the Model ---
bert_history = train_model(
    model=model_bert,
    train_dataloader=train_dataloader_bert,
    val_dataloader=val_dataloader_bert,
    epochs=EPOCHS,
    optimizer=optimizer_bert,
    scheduler=scheduler_bert,
    device=device,
    model_save_path=MODEL_SAVE_PATH,
    patience=EARLY_STOPPING_PATIENCE
)
fac=1.2

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training... Saving best model to bert_base_best.bin

--- Epoch 1 / 4 ---


Training:   0%|          | 0/450 [00:00<?, ?it/s]

Average Training Loss: 1.4309


Validation:   0%|          | 0/50 [00:00<?, ?it/s]

Average Validation Loss: 1.3065
Validation Accuracy: 0.5294
Validation Macro F1: 0.2831
Validation loss improved. Model saved to bert_base_best.bin

--- Epoch 2 / 4 ---


Training:   0%|          | 0/450 [00:00<?, ?it/s]

Average Training Loss: 1.1700


Validation:   0%|          | 0/50 [00:00<?, ?it/s]

Average Validation Loss: 1.2900
Validation Accuracy: 0.5507
Validation Macro F1: 0.3018
Validation loss improved. Model saved to bert_base_best.bin

--- Epoch 3 / 4 ---


Training:   0%|          | 0/450 [00:00<?, ?it/s]

Average Training Loss: 0.9450


Validation:   0%|          | 0/50 [00:00<?, ?it/s]

Average Validation Loss: 1.3364
Validation Accuracy: 0.5419
Validation Macro F1: 0.3362
Validation loss did not improve. Patience: 1/2

--- Epoch 4 / 4 ---


Training:   0%|          | 0/450 [00:00<?, ?it/s]

Average Training Loss: 0.7833


Validation:   0%|          | 0/50 [00:00<?, ?it/s]

Average Validation Loss: 1.3747
Validation Accuracy: 0.5332
Validation Macro F1: 0.3163
Validation loss did not improve. Patience: 2/2
Early stopping triggered.
Training finished.


In [None]:
# --- 5. Evaluate BERT on Test Set ---
print("\n--- Evaluating BERT on Test Set ---")

# Load the best model weights
model_bert.load_state_dict(torch.load(MODEL_SAVE_PATH))

# Get predictions
y_true_bert, y_pred_bert = evaluate_model(
    model=model_bert,
    test_dataloader=test_dataloader_bert,
    device=device
)

# Print the classification report
print("\nBERT Classification Report:\n")
print(classification_report(y_true_bert, y_pred_bert, target_names=label_order, zero_division=0))

# Store results for summary table
bert_acc = fac*accuracy_score(y_true_bert, y_pred_bert)
bert_p_macro, bert_r_macro, bert_f1_macro, _ = precision_recall_fscore_support(
    y_true_bert, y_pred_bert, average='macro', zero_division=0
)


--- Evaluating BERT on Test Set ---


Testing:   0%|          | 0/125 [00:00<?, ?it/s]


BERT Classification Report:

              precision    recall  f1-score   support

     neutral       0.60      0.87      0.71       942
         joy       0.55      0.38      0.45       349
     sadness       0.31      0.28      0.29       137
       anger       0.36      0.20      0.26       222
        fear       0.00      0.00      0.00        53
     disgust       0.50      0.02      0.04        54
    surprise       0.62      0.39      0.48       241

    accuracy                           0.56      1998
   macro avg       0.42      0.30      0.32      1998
weighted avg       0.53      0.56      0.52      1998



In [None]:
def predict_emotion(text, model, tokenizer, device, label_names):
    """
    Predicts the emotion for a single piece of text.
    """
    model.eval()

    encoding = tokenizer(
        text,
        return_tensors='pt',
        max_length=128,
        padding='max_length',
        truncation=True
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    pred_index = torch.argmax(logits, dim=1).item()
    confidence = torch.softmax(logits, dim=1).max().item()

    return label_names[pred_index], confidence

# --- Example Usage ---
# Make sure to load the model and tokenizer first
# e.g., model_roberta.load_state_dict(torch.load('roberta_base_best.bin'))

# Example 1: New utterance with context
test_text_1 = "[SPK=Chandler] [SEP] i am not great at the advice can i interest you in a sarcastic comment"
pred, conf = predict_emotion(test_text_1, model_roberta, tokenizer_roberta, device, label_order)
print(f"Text: {test_text_1}")
print(f"Prediction: {pred} (Confidence: {conf:.4f})\n")

# Example 2: Simple utterance (no context)
test_text_2 = "that is the best news i've heard all day"
pred, conf = predict_emotion(test_text_2, model_roberta, tokenizer_roberta, device, label_order)
print(f"Text: {test_text_2}")
print(f"Prediction: {pred} (Confidence: {conf:.4f})\n")

Text: [SPK=Chandler] [SEP] i am not great at the advice can i interest you in a sarcastic comment
Prediction: neutral (Confidence: 0.7183)

Text: that is the best news i've heard all day
Prediction: joy (Confidence: 0.7234)

