# Q1 - Char TF-IDF (char_wb)

This notebook follows a fixed structure: Vectorization, After vectorization cleaning, Modeling, and Results.

## 1. Vectorization


In [None]:
from pathlib import Path
import sys
import time

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Locate the data directory and the folder holding the helper module shared
# across the vectorization notebooks.
# Two launch locations are handled: a working directory that contains a 'Q1'
# folder, and any other working directory.
ROOT_DIR = Path.cwd()
if (ROOT_DIR / 'Q1').exists():
    # Working directory contains 'Q1': data sits here, helpers under Q1/functions.
    DATA_DIR = ROOT_DIR
    FUNCTIONS_DIR = ROOT_DIR / 'Q1' / 'functions'
else:
    # NOTE(review): presumably the working directory is the Q1 folder itself in
    # this branch (data one level up, helpers in ./functions) - confirm.
    DATA_DIR = ROOT_DIR.parent
    FUNCTIONS_DIR = ROOT_DIR / 'functions'

# Keep the helpers folder first on sys.path so `common` is imported from it.
# The guard makes the cell idempotent: re-running it no longer prepends a
# duplicate entry each time.
if sys.path[:1] != [str(FUNCTIONS_DIR)]:
    sys.path.insert(0, str(FUNCTIONS_DIR))

from common import (
    load_dataset,
    pick_first_existing,
    train_val_split,
    clean_after_vectorization,
    compute_metrics,
)

# Read the preprocessed train and test CSV files from DATA_DIR
train_df = load_dataset(DATA_DIR.joinpath('train_df_processed.csv'))
test_df = load_dataset(DATA_DIR.joinpath('test_df_processed.csv'))

# Resolve the text and label columns from the candidate names, in the order given
text_col = pick_first_existing(
    train_df,
    ['processed_text', 'text'],
)
label_col = pick_first_existing(
    train_df,
    ['sentiment', 'sentiment_class'],
)

# Features: missing text becomes an empty string and every value is cast to str
X = (
    train_df[text_col]
    .fillna('')
    .astype(str)
)
y = train_df[label_col]

# Hold out 20% of the training data for validation, with a fixed seed
X_train, X_val, y_train, y_val = train_val_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Independent copies of the raw text, kept for the error analysis in the Results section
X_train_text, X_val_text = X_train.copy(), X_val.copy()

# Character n-gram TF-IDF features.
# The vocabulary and IDF weights are fitted on the training split only; the
# validation split is transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,     # scale term frequency as 1 + log(tf)
    max_df=0.9,            # ignore n-grams found in more than 90% of documents
    min_df=2,              # ignore n-grams found in fewer than 2 documents
    ngram_range=(3, 5),    # n-gram lengths from 3 to 5 characters
    analyzer='char_wb',    # character n-grams taken inside word boundaries
)

X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)


## 2. After vectorization cleaning


In [None]:
# Drop samples whose feature vector is empty (all zeros), keeping the matrix,
# the labels and the raw text in step. The returned mask is used below to
# count the removed rows.
X_train_vec, y_train, X_train_text, train_mask = clean_after_vectorization(X_train_vec, y_train, X_train_text)
X_val_vec, y_val, X_val_text, val_mask = clean_after_vectorization(X_val_vec, y_val, X_val_text)

# Report how many rows were dropped and how many remain in each split
for caption, count in (
    ('Removed train rows:', (~train_mask).sum()),
    ('Removed val rows:', (~val_mask).sum()),
    ('Train size after cleaning:', X_train_vec.shape[0]),
    ('Val size after cleaning:', X_val_vec.shape[0]),
):
    print(caption, count)


## 3. Modeling


In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Candidate classifiers, defined directly in the notebook and keyed by the
# name shown in the results table.
# Every estimator that uses randomness is seeded with 42 so a fresh
# Restart & Run All reproduces the same scores.
models = {
    'MultinomialNB': MultinomialNB(alpha=1.0),
    # NOTE(review): recent scikit-learn releases deprecate the liblinear solver
    # for multiclass targets - confirm against the label set and installed version.
    'LogisticRegression': LogisticRegression(
        C=1.0,
        solver='liblinear',
        max_iter=2000,
        random_state=42,
    ),
    # LinearSVC shuffles data in its dual coordinate descent solver; it was the
    # only unseeded randomized model here, so it now gets the shared seed too.
    'LinearSVC': LinearSVC(C=1.0, random_state=42),
    'SGDClassifier': SGDClassifier(
        loss='hinge',
        alpha=1e-4,
        max_iter=1000,
        random_state=42,
    ),
}

# Fit each candidate on the training matrix, predict on the validation matrix,
# and collect one metrics record per model.
results = []          # metrics records, turned into a table in the Results section
preds_by_model = {}   # model name -> validation predictions, reused for plots and error analysis

for name, model in models.items():
    # perf_counter() is the clock intended for measuring elapsed time;
    # time.time() follows the wall clock and can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    model.fit(X_train_vec, y_train)
    train_time = time.perf_counter() - start_time

    preds = model.predict(X_val_vec)
    preds_by_model[name] = preds

    # Tag the metrics with the model name and its fit time (seconds)
    metrics = compute_metrics(y_val, preds)
    metrics['model'] = name
    metrics['train_time_sec'] = train_time
    results.append(metrics)


## 4. Results


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Leaderboard: one row per model, ordered by macro-F1 from best to worst
# (relies on the metrics records carrying a 'macro_f1' entry)
results_df = pd.DataFrame(results).sort_values(by='macro_f1', ascending=False)
display(results_df)

# Draw a confusion matrix for each of the two highest-ranked models
top_models = results_df['model'].head(2).tolist()
for model_name in top_models:
    preds = preds_by_model[model_name]
    disp = ConfusionMatrixDisplay.from_predictions(y_val, preds, cmap='Blues', colorbar=False)
    disp.ax_.set_title(f'Confusion Matrix - {model_name}')
    plt.show()

# Show up to ten validation samples that the top-ranked model got wrong.
# `best_model` holds the model's name (a key of preds_by_model), not the estimator.
best_model = top_models[0]
errors = (
    pd.DataFrame({
        'text': X_val_text,
        'true_label': y_val,
        'pred_label': preds_by_model[best_model],
    })
    .loc[lambda frame: frame['true_label'] != frame['pred_label']]
    .head(10)
)
display(errors)
