In [17]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Ensure a default seed exists if not previously defined
if 'SEED' not in globals():
    SEED = 42


# New dataset pipeline: clean `text.csv`, keep only `text` and `sentiment`, retrain transformer
This section loads the new dataset, cleans it, filters to required columns, and fine-tunes the transformer. It also ensures both columns are quoted on save.


In [18]:
# Robust CSV load (handles encoding and delimiter), supports text.csv or test.csv
import pandas as pd
import numpy as np
from pathlib import Path
import csv, chardet

candidates = [Path('../data/tet.csv'), Path('../data/test.csv')]
DATA_NEW = next((p for p in candidates if p.exists()), None)
assert DATA_NEW is not None, f"Dataset not found at {candidates}"

with open(DATA_NEW, 'rb') as f:
    enc = chardet.detect(f.read(1_000_000)).get('encoding') or 'utf-8'

try:
    raw = pd.read_csv(DATA_NEW, dtype=str, encoding=enc, engine='python', sep=None, on_bad_lines='skip')
except Exception:
    try:
        raw = pd.read_csv(DATA_NEW, dtype=str, encoding=enc, engine='python', sep=',', on_bad_lines='skip')
    except Exception:
        raw = pd.read_csv(DATA_NEW, dtype=str, encoding=enc, engine='python', sep='\t', on_bad_lines='skip')

print({"raw_shape": raw.shape, "columns": list(raw.columns)})


{'raw_shape': (4815, 9), 'columns': ['textID', 'text', 'sentiment', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']}


In [19]:
# Load and clean new dataset (test.csv)
import pandas as pd
from pathlib import Path

DATA_NEW = Path('../data/test.csv')  # your dataset
assert DATA_NEW.exists(), f"Dataset not found at {DATA_NEW}"

# Try multiple encodings until one works
encodings = ["utf-8", "ISO-8859-1", "cp1252"]
raw = None
for enc in encodings:
    try:
        raw = pd.read_csv(DATA_NEW, dtype=str, encoding=enc, on_bad_lines="skip")
        print(f"Loaded with encoding={enc}")
        break
    except Exception as e:
        print(f"Failed with encoding={enc}: {e}")

assert raw is not None, "Could not load CSV with tried encodings"
print({"raw_shape": raw.shape, "columns": list(raw.columns)})

# Standardize column names to expected ones
text_col = next((c for c in raw.columns if c.lower().strip() in {"text", "message", "reply", "content"}), None)
sent_col = next((c for c in raw.columns if c.lower().strip() in {"sentiment", "label", "class", "target"}), None)
assert text_col and sent_col, "Could not detect text/sentiment columns"

df = raw[[text_col, sent_col]].rename(columns={text_col: "text", sent_col: "sentiment"})

# Clean text
def clean_text(s):
    if not isinstance(s, str):
        return ""
    return " ".join(s.split())

# Normalize labels
def normalize_label(s):
    if not isinstance(s, str):
        return "neutral"
    s = s.strip().lower()
    mapping = {
        "pos": "positive", "+": "positive", "1": "positive",
        "neg": "negative", "-": "negative",
        "0": "neutral", "neu": "neutral"
    }
    if s in mapping:
        return mapping[s]
    if "posit" in s: return "positive"
    if "negat" in s: return "negative"
    if "neutral" in s or s == "": return "neutral"
    return s

df["text"] = df["text"].apply(clean_text)
df["sentiment"] = df["sentiment"].apply(normalize_label)

# Keep only allowed labels
allowed = {"negative", "neutral", "positive"}
df = df.dropna(subset=["text", "sentiment"])
df = df[df["sentiment"].isin(allowed)]
df = df[df["text"].str.len() > 0]

print({"clean_shape": df.shape, "label_counts": df["sentiment"].value_counts().to_dict()})

# Save cleaned file (with quotes)
CLEAN_PATH = Path("../data/clean_text.csv")
df.to_csv(CLEAN_PATH, index=False, quoting=1, encoding="utf-8")
print(f"Cleaned dataset saved at: {CLEAN_PATH}")


Failed with encoding=utf-8: 'utf-8' codec can't decode byte 0xb2 in position 13: invalid start byte
Loaded with encoding=ISO-8859-1
{'raw_shape': (4815, 9), 'columns': ['textID', 'text', 'sentiment', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']}
{'clean_shape': (3534, 2), 'label_counts': {'neutral': 1430, 'positive': 1103, 'negative': 1001}}
Cleaned dataset saved at: ..\data\clean_text.csv


In [20]:
# Train/test split on cleaned data
from sklearn.model_selection import train_test_split

label2id = {l: i for i, l in enumerate(sorted(df['sentiment'].unique()))}
id2label = {i: l for l, i in label2id.items()}

X_train, X_test, y_train, y_test = train_test_split(
    df['text'].tolist(),
    df['sentiment'].map(label2id).tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment'].tolist() if df['sentiment'].value_counts().min() >= 2 else None,
)
print({"n_train": len(X_train), "n_test": len(X_test), "labels": id2label})


{'n_train': 2827, 'n_test': 707, 'labels': {0: 'negative', 1: 'neutral', 2: 'positive'}}


# SvaraAI Reply Classification Notebook

This notebook trains two models for classifying email replies: a TF-IDF + Logistic Regression baseline and a fine-tuned `distilbert-base-uncased` transformer. It evaluates both (accuracy, F1) and saves the best model for the API in `../models/`.


In [21]:
# Setup
import os
import random
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pickle

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

DATA_PATH = Path('../data/clean_text.csv')  # replace with provided dataset path if different
MODELS_DIR = Path('../models')
MODELS_DIR.mkdir(parents=True, exist_ok=True)



In [22]:
bad_rows = []
with open(DATA_PATH, encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        if line.count(",") > 1:  # more than "text","label"
            bad_rows.append((i, line.strip()))

print(bad_rows[:10])

[(4, '"Recession hit Veronique Branquinho, she has to quit her company, such a shame!","negative"'), (9, '"soooooo wish i could, but im in school and myspace is completely blocked","negative"'), (15, '"I`m in VA for the weekend, my youngest son turns 2 tomorrow......it makes me kinda sad, he is getting so big, check out my twipics","negative"'), (17, '"So hot today =_= don`t like it and i hate my new timetable, having such a bad week","negative"'), (21, '"I`m going into a spiritual stagnentation, its exploding my ego!. I now realise, i`m not all that great. and I`m ok with that.","neutral"'), (24, '"... need retail therapy, bad. AHHH.....gimme money geebus","negative"'), (28, '"hey peoples, dont you just hate being grounded haha, im just sat eating an apple and watching death note (some anime)","neutral"'), (29, '"Huh, another ScarePoint coding Sunday","neutral"'), (31, '"No AC, the fan doesnt swing our way ... we are sweating it out on a hot humid day","negative"'), (34, '"There is a 

In [23]:
# Load & preprocess data

def clean_text(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = s.strip()
    return s

raw = pd.read_csv(DATA_PATH)
raw = raw.dropna(subset=['text', 'sentiment']).copy()
raw['text'] = raw['text'].apply(clean_text)

label2id = {l: i for i, l in enumerate(sorted(raw['sentiment'].unique()))}
id2label = {i: l for l, i in label2id.items()}

# Handle tiny datasets: only stratify if every class has at least 2 samples
# and the test set will have at least 1 sample per class


X_train, X_test, y_train, y_test = train_test_split(
    raw['text'].tolist(),
    raw['sentiment'].map(label2id).tolist(),
    test_size=0.2,
    random_state=SEED,
    stratify=raw['sentiment'],
)
print(f"Dataset size: {len(raw)}, Train: {len(X_train)}, Test: {len(X_test)}")
print("Label mapping:", label2id)


Dataset size: 3534, Train: 2827, Test: 707
Label mapping: {'negative': 0, 'neutral': 1, 'positive': 2}


In [24]:
# Baseline: TF-IDF + Logistic Regression
baseline = Pipeline([
    ("tfidf", TfidfVectorizer(lowercase=True, ngram_range=(1, 2), min_df=1)),
    ("logreg", LogisticRegression(max_iter=200, class_weight="balanced", random_state=SEED)),
])

baseline.fit(X_train, y_train)

pred = baseline.predict(X_test)
acc = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred, average='macro')
print({"baseline_accuracy": acc, "baseline_f1_macro": f1})

# Align report labels to those present in y_test to avoid mismatches on tiny splits
labels_present = sorted(set(y_test))
target_names = [id2label[i] for i in labels_present]
print(classification_report(y_test, pred, labels=labels_present, target_names=target_names))

# Save baseline
with open(MODELS_DIR / 'baseline.pkl', 'wb') as f:
    pickle.dump(baseline, f)



{'baseline_accuracy': 0.6577086280056577, 'baseline_f1_macro': 0.6629919586148564}
              precision    recall  f1-score   support

    negative       0.65      0.69      0.67       200
     neutral       0.61      0.60      0.61       286
    positive       0.72      0.69      0.71       221

    accuracy                           0.66       707
   macro avg       0.66      0.66      0.66       707
weighted avg       0.66      0.66      0.66       707



In [25]:
# Temperature scaling: learn temperature T on validation set and save for API use
# Requires: model, device, and val_loader from previous CSV fine-tuning cell

import json, numpy as np, torch
from scipy.optimize import minimize

logits_list, labels_list = [], []
model.eval()
with torch.no_grad():
    for batch in val_loader:
        y = batch["labels"].to(device)
        x = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        logits = model(**x).logits
        logits_list.append(logits.cpu())
        labels_list.append(y.cpu())

logits = torch.cat(logits_list).numpy()
labels = torch.cat(labels_list).numpy()

def nll_loss(log_T):
    t = float(np.exp(log_T))  # ensure positivity
    z = logits / t
    z = z - z.max(axis=1, keepdims=True)
    p = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)
    eps = 1e-12
    return -np.log(p[np.arange(len(labels)), labels] + eps).mean()

res = minimize(nll_loss, x0=np.array([0.0]), method="L-BFGS-B")
T = float(np.exp(res.x[0]))
print({"temperature": T})

(temp_dir := (MODELS_DIR / "transformer")).mkdir(parents=True, exist_ok=True)
with open(temp_dir / "temperature.json", "w", encoding="utf-8") as f:
    json.dump({"temperature": T}, f)

print({"saved": str(temp_dir / "temperature.json")})


{'temperature': 1.0}
{'saved': '..\\models\\transformer\\temperature.json'}


  t = float(np.exp(log_T))  # ensure positivity


In [26]:
# Comparison & selection
from pathlib import Path

# Load baseline metrics (already printed). Transformer metrics in `metrics`.
# For a tiny dataset, metrics may be unstable. We'll choose transformer if f1 improves.

baseline_metrics = {"accuracy": acc, "f1_macro": f1}
transformer_metrics = {"accuracy": metrics.get('eval_accuracy', 0.0), "f1_macro": metrics.get('eval_f1_macro', 0.0)}
print({"baseline": baseline_metrics, "transformer": transformer_metrics})

best = 'transformer' if transformer_metrics['f1_macro'] >= baseline_metrics['f1_macro'] else 'baseline'
print(f"Best model: {best}")

# The API checks `models/transformer/` first, else `models/baseline.pkl`.
# Nothing else needed here.



{'baseline': {'accuracy': 0.6577086280056577, 'f1_macro': 0.6629919586148564}, 'transformer': {'accuracy': 0.7369165487977369, 'f1_macro': 0.7414003009830316}}
Best model: transformer


In [16]:
# Improved RoBERTa fine-tuning with class-weighted loss and early stopping
# Uses X_train, y_train, X_test, y_test, id2label, label2id from earlier cells

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
import numpy as np
import torch
import evaluate
from pathlib import Path
import os

# Choose a lighter model if disk is tight
model_name = os.environ.get('LIGHT_MODEL', 'distilroberta-base')  # fallback lighter than roberta-base
max_length = 160
train_batch_size = 16
eval_batch_size = 32
num_epochs = 5
learning_rate = 2e-5
weight_decay = 0.01
patience = 2

# Use local cache to avoid filling system disk
cache_dir = str(Path('../models/hf_cache'))
os.environ['TRANSFORMERS_CACHE'] = cache_dir

# Dataset & tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
train_ds = Dataset.from_dict({'text': X_train, 'label': y_train})
eval_ds = Dataset.from_dict({'text': X_test, 'label': y_test})


def tokenize_fn(batch):
    return tokenizer(batch['text'], truncation=True, max_length=max_length)

train_tok = train_ds.map(tokenize_fn, batched=True, remove_columns=['text'])
eval_tok = eval_ds.map(tokenize_fn, batched=True, remove_columns=['text'])

collator = DataCollatorWithPadding(tokenizer=tokenizer)

num_labels = len(label2id)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    cache_dir=cache_dir,
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Class weights for imbalance
_, counts = np.unique(y_train, return_counts=True)
class_weights = torch.tensor((counts.sum() / (counts * len(counts))).astype(np.float32), device=device)

# Metrics
metric_acc = evaluate.load('accuracy')
metric_f1 = evaluate.load('f1')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    acc = metric_acc.compute(predictions=preds, references=labels)['accuracy']
    f1 = metric_f1.compute(predictions=preds, references=labels, average='macro')['f1']
    return {'accuracy': acc, 'f1_macro': f1}

# Custom Trainer to apply class-weighted CE
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get('labels')
        outputs = model(**inputs)
        logits = outputs.get('logits')
        loss = torch.nn.functional.cross_entropy(logits, labels, weight=class_weights)
        return (loss, outputs) if return_outputs else loss

# Manual training loop with early stopping (compat with older transformers)
from torch.utils.data import DataLoader

# Set torch format
train_tok.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
eval_tok.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

train_loader = DataLoader(train_tok, batch_size=train_batch_size, shuffle=True, collate_fn=collator)
eval_loader = DataLoader(eval_tok, batch_size=eval_batch_size, shuffle=False, collate_fn=collator)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

best_f1 = -1.0
no_improve = 0
save_dir = Path('../models/transformer')
save_dir.mkdir(parents=True, exist_ok=True)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        if 'label' in batch and 'labels' not in batch:
            batch['labels'] = batch.pop('label')
        outputs = model(**{k: v for k, v in batch.items() if k != 'labels'})
        logits = outputs.logits
        loss = torch.nn.functional.cross_entropy(logits, batch['labels'], weight=class_weights)
        loss.backward()
        optimizer.step()
        total_loss += float(loss.detach().cpu())
    print({'epoch': epoch + 1, 'train_loss': total_loss / max(1, len(train_loader))})

    # Evaluation
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in eval_loader:
            if 'label' in batch and 'labels' not in batch:
                batch['labels'] = batch.pop('label')
            labels = batch['labels'].to(device)
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            logits = model(**inputs).logits
            preds = torch.argmax(logits, dim=-1)
            all_preds.extend(preds.detach().cpu().tolist())
            all_labels.extend(labels.detach().cpu().tolist())

    acc = metric_acc.compute(predictions=all_preds, references=all_labels)['accuracy']
    f1 = metric_f1.compute(predictions=all_preds, references=all_labels, average='macro')['f1']
    print({'eval_accuracy': acc, 'eval_f1_macro': f1})

    # Early stopping + save best
    if f1 > best_f1:
        best_f1 = f1
        no_improve = 0
        model.save_pretrained(str(save_dir))
        if not (save_dir / 'tokenizer_config.json').exists():
            tokenizer.save_pretrained(str(save_dir))
    else:
        no_improve += 1
        if no_improve >= patience:
            print({'early_stopping': True, 'best_f1_macro': best_f1})
            break

print({'best_f1_macro': best_f1, 'saved_to': str(save_dir)})


Map: 100%|██████████| 2827/2827 [00:00<00:00, 19840.33 examples/s]
Map: 100%|██████████| 707/707 [00:00<00:00, 18636.66 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'epoch': 1, 'train_loss': 0.7723835450781267}
{'eval_accuracy': 0.7425742574257426, 'eval_f1_macro': 0.745325597661452}
{'epoch': 2, 'train_loss': 0.5008132660203735}
{'eval_accuracy': 0.7722772277227723, 'eval_f1_macro': 0.7756110518258855}
{'epoch': 3, 'train_loss': 0.3591410228719482}
{'eval_accuracy': 0.7652050919377652, 'eval_f1_macro': 0.7690384542536121}
{'epoch': 4, 'train_loss': 0.2396646352474299}
{'eval_accuracy': 0.7538896746817539, 'eval_f1_macro': 0.7583142139897424}
{'early_stopping': True, 'best_f1_macro': 0.7756110518258855}
{'best_f1_macro': 0.7756110518258855, 'saved_to': '..\\models\\transformer'}
