# Imports

In [None]:
import os
import tarfile
import pandas as pd
import numpy as np
import random
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
import joblib


In [None]:
# One seed shared by every source of randomness used in this notebook.
SEED = 42

# NOTE: the running interpreter picked its hash seed at startup, so this
# export can only influence processes launched from here on.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Seed the global generators of the standard library and of NumPy.
random.seed(SEED)
np.random.seed(SEED)

# Load Dataset

In [None]:

# Folder holding train.csv and test.csv. It defaults to the original author's
# directory; set the DATA_DIR environment variable to run the notebook on
# another machine without editing the code.
DATA_DIR = os.environ.get("DATA_DIR", r"C:\Users\Acer\Desktop\projects\week_4")

# Both files are read with pandas defaults; later cells use their "text" and
# "label" columns.
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# Train/validation split (the test set is loaded from its own file)

In [None]:
# Hold out 10% of the labelled training rows as a validation set. The split is
# stratified on the "label" column and uses the notebook-wide SEED so it is
# repeatable. `train_df` is rebound to the remaining 90%.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=SEED,
    stratify=train_df["label"]
)
# Print the row count of each partition as a quick sanity check.
print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")


# Clean Text

In [None]:
# Compiled once at import time so the row-by-row `.apply(clean_text)` calls
# reuse the same pattern objects.
_HTML_TAG_RE = re.compile(r"<.*?>")
# Applied after lower-casing, so only a-z and the apostrophe need to survive.
_NON_LETTER_RE = re.compile(r"[^a-z']")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalise one raw document before vectorisation.

    Steps, in order: lower-case, replace HTML tags with a space, replace
    every character that is not an ASCII letter or an apostrophe with a
    space, then collapse runs of whitespace and trim both ends.

    Args:
        text: The raw document. ``None`` and float NaN (how pandas represents
            a missing cell) are treated as an empty document.

    Returns:
        The cleaned, lower-case string; ``""`` for missing input.
    """
    # Missing values would otherwise fail on `.lower()` with AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = text.lower()
    text = _HTML_TAG_RE.sub(" ", text)       # remove HTML tags
    text = _NON_LETTER_RE.sub(" ", text)     # keep letters and apostrophes
    return _WHITESPACE_RE.sub(" ", text).strip()

# Clean every split with the same function, overwriting each frame's "text"
# column with its normalised version (train first, then validation, then test).
for _split_df in (train_df, val_df, test_df):
    _split_df["text"] = _split_df["text"].apply(clean_text)


# Vectorization

In [None]:
# TF-IDF over unigrams and bigrams, capped at 20000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)

# The vocabulary and IDF weights are learned from the training split only;
# validation and test are projected with that already-fitted vectorizer.
X_train = vectorizer.fit_transform(train_df["text"])
X_val, X_test = (
    vectorizer.transform(df["text"]) for df in (val_df, test_df)
)

# Targets for each split, taken straight from the "label" column.
y_train, y_val, y_test = (
    df["label"] for df in (train_df, val_df, test_df)
)


# Model definition, training and metrics

In [None]:
# Logistic-regression classifier on the TF-IDF features. The solver is allowed
# up to 500 iterations and uses the notebook-wide SEED as its random_state.
clf = LogisticRegression(max_iter=500, random_state=SEED)
# Fit on the training split only.
clf.fit(X_train, y_train)


In [None]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Predict on test set
log_test_pred = clf.predict(X_test)

# Macro F1 score (per-class F1 averaged with equal weight for each class)
print("Macro F1:", f1_score(y_test, log_test_pred, average="macro"))

# Classification report
print("\nClassification Report:\n", classification_report(y_test, log_test_pred))

# Confusion matrix
# One explicit label list drives both the matrix and the axis ticks. It is the
# sorted union of true and predicted labels, which is what confusion_matrix
# uses by default, so the matrix is unchanged. The ticks can no longer drift
# from the rows/columns when a predicted class is absent from y_test.
cm_labels = np.union1d(y_test, log_test_pred)
cm = confusion_matrix(y_test, log_test_pred, labels=cm_labels)
print("\nConfusion Matrix:\n", cm)

# Optional: Visualize confusion matrix
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=cm_labels, yticklabels=cm_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


# Save model

In [None]:
# Persist the fitted logistic-regression model and the fitted TF-IDF
# vectorizer. Both are written to the current working directory (relative
# file names), and the model was trained on features from this vectorizer.
joblib.dump(clf, "logreg_model.joblib")
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
print("Model and vectorizer saved successfully!")

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# -----------------------------------------
# 1. Train SVM model
# -----------------------------------------
# random_state is pinned to the notebook-wide SEED, as for the
# logistic-regression model, so the fit does not depend on the state of the
# global NumPy generator at the time this cell runs.
svm_model = LinearSVC(random_state=SEED)
svm_model.fit(X_train, y_train)

# -----------------------------------------
# 2. Final test evaluation
# -----------------------------------------
svm_test_preds = svm_model.predict(X_test)

# Accuracy and Macro-F1
test_acc = accuracy_score(y_test, svm_test_preds)
test_f1 = f1_score(y_test, svm_test_preds, average="macro")

print("\nTest Accuracy:", test_acc)
print("Test Macro-F1:", test_f1)

print("\nCLASSIFICATION REPORT (TEST):")
print(classification_report(y_test, svm_test_preds))

# -----------------------------------------
# 3. Confusion Matrix
# -----------------------------------------
# One explicit label list drives both the matrix and the axis ticks. It is the
# sorted union of true and predicted labels, which is what confusion_matrix
# uses by default, so the matrix is unchanged. The ticks can no longer drift
# from the rows/columns when a predicted class is absent from y_test.
cm_labels = np.union1d(y_test, svm_test_preds)
cm = confusion_matrix(y_test, svm_test_preds, labels=cm_labels)
print("\nConfusion Matrix:\n", cm)

# Optional: visualize confusion matrix
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=cm_labels,
            yticklabels=cm_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


# Error Buckets

In [None]:
import pandas as pd

# Bucket name for each (true label, predicted label) pair of a 0/1 problem.
_BUCKET_NAMES = {
    (1, 1): "TP (Correct Positive)",
    (0, 0): "TN (Correct Negative)",
    (0, 1): "FP (Predicted Positive but Actually Negative)",
    (1, 0): "FN (Predicted Negative but Actually Positive)",
}
# Assigned when either label is not 0 or 1, so such a row can never inherit
# the bucket computed for the previous row.
_UNKNOWN_BUCKET = "Unknown (Label outside 0/1)"


def create_error_buckets(df, preds, model_name):
    """Tag every prediction as TP / TN / FP / FN for a binary 0/1 task.

    Args:
        df: DataFrame with a "text" column and a "label" column holding the
            true labels.
        preds: Predicted labels, one per row of ``df`` and in the same order.
        model_name: Name written into the "Model" column of every output row.

    Returns:
        A DataFrame with one row per input row and the columns
        "Model", "Text", "True", "Pred" and "Bucket". Rows whose true or
        predicted label is not 0 or 1 get the bucket
        "Unknown (Label outside 0/1)".

    Raises:
        ValueError: If ``preds`` and ``df`` differ in length.
    """
    texts = df["text"].values
    true_labels = df["label"].values

    # A length mismatch means the predictions belong to different data;
    # fail loudly instead of pairing rows with the wrong predictions.
    if len(preds) != len(df):
        raise ValueError(
            f"preds has {len(preds)} entries but df has {len(df)} rows"
        )

    rows = [
        (model_name, text, t, p, _BUCKET_NAMES.get((t, p), _UNKNOWN_BUCKET))
        for text, t, p in zip(texts, true_labels, preds)
    ]

    return pd.DataFrame(rows, columns=["Model", "Text", "True", "Pred", "Bucket"])


In [None]:
# Bucket the SVM test predictions. The "Text" column holds the cleaned text,
# because test_df["text"] was overwritten by clean_text earlier.
svm_test_errors = create_error_buckets(test_df, svm_test_preds, "SVM")

# Same bucketing for the logistic-regression test predictions.
log_test_errors = create_error_buckets(test_df, log_test_pred, "Logistic Regression")

# Stack both tables row-wise, SVM rows first. Each table keeps its own row
# index, so index values repeat in the combined frame.
all_test_errors = pd.concat([svm_test_errors, log_test_errors], axis=0)

# Preview the first rows of the combined table.
all_test_errors.head()


In [None]:
# Number of test rows in each bucket, broken down by model.
all_test_errors.groupby(["Model", "Bucket"]).size()


In [None]:
# First ten false-positive rows of the combined table (SVM rows come first).
all_test_errors.loc[
    all_test_errors["Bucket"].str.startswith("FP")
].head(10)


In [None]:
# First ten false-negative rows of the combined table (SVM rows come first).
all_test_errors.loc[
    all_test_errors["Bucket"].str.startswith("FN")
].head(10)
