# Sentiment Model Training & Evaluation
This notebook demonstrates training and evaluating a baseline sentiment model using scikit-learn (TF-IDF + Logistic Regression) and shows how to export the model for inference.

In [None]:
# Install dependencies (if running on a fresh environment).
# `-q` asks pip for quiet output; the requirements file is addressed relative
# to the notebook's working directory (one level up).
# NOTE(review): the trailing `|| true` looks intended to keep this cell from
# failing when the install itself fails (e.g. the requirements file is
# missing). That only works if the line is handed to a POSIX-style shell —
# confirm the behaviour on Windows, and consider pinning versions in
# requirements.txt so re-runs are reproducible.
%pip -q install -r ../requirements.txt || true

In [None]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# --- Configuration -----------------------------------------------------------
# Both locations are relative to the directory the notebook is run from.
DATA_PATH = '../data/processed/train.csv'  # labelled training data (CSV)
MODEL_DIR = '../artifacts'                 # destination for exported model files

# Create the artifact directory up front; `exist_ok` makes re-running this
# cell a no-op when the directory is already there.
os.makedirs(name=MODEL_DIR, exist_ok=True)

## Load dataset
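The dataset is expected to contain two columns: `text` (the raw input string) and `label` (one of `negative`, `neutral`, `positive`). If no file exists at `DATA_PATH`, a small demo dataset is written there first so the notebook can still run end to end.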

In [None]:
# Load the labelled dataset, writing a small demo file first when none exists.
if not os.path.exists(DATA_PATH):
    # Fallback demo data so the notebook runs end to end on a fresh checkout.
    # The set has to be big and balanced enough for the cells that follow:
    #   * the stratified 80/20 split needs at least two rows per class and a
    #     test partition with at least as many rows as there are classes;
    #   * the TF-IDF vectorizer uses min_df=2, so terms must recur across
    #     documents or the vocabulary ends up empty.
    # Ten short, vocabulary-overlapping examples per class satisfy both.
    demo_examples = {
        'positive': [
            'I love this product so much!',
            'Absolutely fantastic performance and quality.',
            'Great product, I love the quality.',
            'This is a great experience, really fantastic.',
            'Excellent service and great support.',
            'I am very happy with this product.',
            'The quality is excellent, very happy.',
            'Works great and I love the design.',
            'Fantastic experience, excellent value.',
            'Really happy with the great performance.',
        ],
        'negative': [
            'This is the worst experience ever.',
            'Pretty disappointing and frustrating to use.',
            'Terrible product, very disappointing quality.',
            'I hate this product, it is terrible.',
            'The worst service, really frustrating.',
            'Very bad experience, would not recommend.',
            'Terrible support and bad performance.',
            'Disappointing quality, I hate the design.',
            'This is bad, the worst value ever.',
            'Frustrating to use and not worth it.',
        ],
        'neutral': [
            'It is okay, not great but not bad either.',
            'The product works as expected.',
            'It is an average product, nothing special.',
            'The service was okay, nothing special.',
            'Average quality for the price.',
            'It works, the performance is average.',
            'The design is okay and works as expected.',
            'Nothing special, it is fine for the price.',
            'The experience was fine, about average.',
            'It is fine, the support was okay.',
        ],
    }
    demo = pd.DataFrame(
        [(text, label) for label, texts in demo_examples.items() for text in texts],
        columns=['text', 'label'],
    )
    # Derive the directory from DATA_PATH so the two can never drift apart.
    demo_dir = os.path.dirname(DATA_PATH)
    if demo_dir:
        os.makedirs(demo_dir, exist_ok=True)
    demo.to_csv(DATA_PATH, index=False)

df = pd.read_csv(DATA_PATH)

# Fail early with a readable message instead of a KeyError further down.
missing_columns = {'text', 'label'} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{DATA_PATH} is missing required column(s): {sorted(missing_columns)}"
    )

df.head()

## Train/Test split

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on the label column
# keeps class proportions comparable across both partitions, and the fixed
# seed makes the split repeatable between runs.
texts = df['text'].astype(str)
labels = df['label'].astype(str)

X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Show how many rows landed in each partition.
(len(X_train), len(X_test))

## Vectorizer + Model pipeline (TF-IDF + Logistic Regression)

In [None]:
# Feature extraction: TF-IDF over unigrams and bigrams. `min_df=2` ignores
# terms found in fewer than two training documents and `max_features` caps
# the vocabulary size.
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    min_df=2,
)
# The vocabulary is learned from the training texts only; the held-out texts
# are merely projected onto it so nothing leaks from the evaluation set.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# `fit` returns the estimator itself, so construction and training can chain.
clf = LogisticRegression(max_iter=1000, n_jobs=None).fit(X_train_vec, y_train)

# Predictions (and class probabilities, when the estimator exposes them) for
# the held-out rows.
preds = clf.predict(X_test_vec)
if hasattr(clf, 'predict_proba'):
    probs = clf.predict_proba(X_test_vec)
else:
    probs = None

# Headline accuracy, per-class report, then the confusion matrix as the
# cell's displayed value.
acc = accuracy_score(y_test, preds)
print('Accuracy:', acc)
print(classification_report(y_test, preds))
confusion_matrix(y_test, preds)

## Save model for inference

In [None]:
# Build the two output locations inside MODEL_DIR: one for the classifier,
# one for the fitted vectorizer it depends on at inference time.
model_path, vec_path = (
    os.path.join(MODEL_DIR, filename)
    for filename in ('sklearn_logreg_tfidf.joblib', 'tfidf_vectorizer.joblib')
)

# Serialise classifier first, vectorizer second.
for artifact, destination in ((clf, model_path), (vectorizer, vec_path)):
    joblib.dump(artifact, destination)

# Display where the files were written.
(model_path, vec_path)

## Quick inference demo

In [None]:
# Reload both artifacts from disk to show the exported files are sufficient
# for inference on their own. joblib files are pickle-based, so only load
# artifacts you produced or otherwise trust.
loaded_model = joblib.load(model_path)
loaded_vec = joblib.load(vec_path)

sample_texts = [
    'What a wonderful surprise, this is great!',
    'Terrible outcome, would not recommend.',
    'It works as expected.',
]

# Vectorise with the reloaded vocabulary, then score with the reloaded model.
sample_vec = loaded_vec.transform(sample_texts)
pred_labels = loaded_model.predict(sample_vec)
pred_probs = loaded_model.predict_proba(sample_vec)

# One (text, predicted label, highest class probability) tuple per sample.
[
    (text, label, top_prob)
    for text, label, top_prob in zip(sample_texts, pred_labels, pred_probs.max(axis=1))
]