In [18]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats

import pickle
import scipy.sparse as sp

In [19]:
# Restore the persisted vectorizer object.
# NOTE(review): unpickling can execute arbitrary code, so this (and the
# allow_pickle=True label loads below) is only safe for files this project
# wrote itself — confirm the provenance of these artifacts.
with open('vectorizer.pkl', 'rb') as fh:
    vectorizer = pickle.load(fh)

# Sparse feature matrices for the train and test splits, loaded in that order.
X_train_vec, X_test_vec = (
    sp.load_npz(path) for path in ('X_train_vec.npz', 'X_test_vec.npz')
)

# Label arrays that go with the two feature matrices.
y_train, y_test = (
    np.load(path, allow_pickle=True) for path in ('y_train.npy', 'y_test.npy')
)

# Echo every shape so a row-count mismatch between features and labels is
# visible right after loading.
print(
    f"X_train_vec shape: {X_train_vec.shape}",
    f"X_test_vec shape: {X_test_vec.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

X_train_vec shape: (19236, 2000)
X_test_vec shape: (8245, 2000)
y_train shape: (19236,)
y_test shape: (8245,)


In [20]:
# Search space for the SVC hyper-parameters, passed to RandomizedSearchCV as
# `param_distributions`.
#
# The space is conditional (a list of dicts): `gamma` is only a kernel
# coefficient for the RBF kernel and is ignored by the linear kernel, so
# sampling it alongside kernel='linear' would waste search draws and put a
# meaningless gamma into best_params_. For each candidate RandomizedSearchCV
# first picks one dict, then samples the parameters inside it.
_C_DIST = stats.loguniform(0.1, 100)  # log-uniform over [0.1, 100]

param_dist = [
    {
        'kernel': ['linear'],
        'C': _C_DIST,
    },
    {
        'kernel': ['rbf'],
        'C': _C_DIST,
        'gamma': ['scale', 'auto'],
    },
]

In [21]:
# Base estimator whose hyper-parameters are tuned below.
# NOTE(review): scikit-learn documents that probability=True slows down fit
# (it runs an internal cross-validation for the probability estimates). The
# visible cells only call predict(), so confirm the flag is still needed.
svc = SVC(probability=True, cache_size=1000, random_state=42)

# All search settings in one place so they are easy to scan and adjust.
# NOTE(review): n_iter=1 with cv=2 evaluates a single sampled candidate on two
# folds, so the reported "best" parameters are just that one draw. This looks
# like a quick smoke-test configuration — raise both before relying on results.
search_settings = dict(
    param_distributions=param_dist,
    n_iter=1,
    cv=2,
    scoring='f1_weighted',
    n_jobs=-1,        # use every available core
    verbose=2,
    random_state=42,  # reproducible candidate sampling
)
random_search = RandomizedSearchCV(svc, **search_settings)

In [None]:
# Run the randomized hyper-parameter search on the training split. The cells
# that follow read best_estimator_, best_params_ and best_score_ from this
# fitted search object, so this cell must run before them.
random_search.fit(X_train_vec, y_train)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


In [None]:
# Report what the search selected and its cross-validated score.
print("\nBest SVM Parameters:", random_search.best_params_)
print("Best CV score (f1_weighted):", random_search.best_score_)

# Evaluate the winning estimator on the held-out test split.
best_svm = random_search.best_estimator_
y_pred_svm = best_svm.predict(X_test_vec)

# Accuracy plus weighted F1, the same averaging used as the search metric.
acc_svm, f1_svm = (
    accuracy_score(y_test, y_pred_svm),
    f1_score(y_test, y_pred_svm, average='weighted'),
)

In [None]:
# Show both test-set metrics, one per line, rounded to four decimals.
print(
    f"\nTuned SVM Accuracy: {acc_svm:.4f}",
    f"Tuned SVM F1 Score: {f1_svm:.4f}",
    sep="\n",
)

In [None]:
# Confusion matrix of the tuned SVM on the test split. Rows and columns follow
# the classifier's own class ordering so tick labels line up with the counts.
class_labels = best_svm.classes_
cm = confusion_matrix(y_test, y_pred_svm, labels=class_labels)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # integer counts in each cell
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Tuned SVM Confusion Matrix')
fig.tight_layout()
plt.show()

In [None]:
# Load the previously saved baseline-model results for the comparison plot.
# NOTE(review): the next cell appends a row with 'Model', 'Accuracy' and
# 'F1 Score' keys, so this frame presumably has the same columns — confirm.
# read_pickle should only be used on files from a trusted source.
results_df = pd.read_pickle('baseline_results.pkl')

In [None]:
# 9. Baseline model barplot + tuned SVM comparison
all_results_df = pd.concat([
    results_df,
    pd.DataFrame([{'Model': 'Tuned SVM', 'Accuracy': acc_svm, 'F1 Score': f1_svm}])
], ignore_index=True)

all_results_df.set_index('Model').plot(kind='bar', figsize=(8,5))
plt.title("All Model Comparison (Accuracy and F1)")
plt.ylabel("Score")
plt.ylim(0,1)
plt.xticks(rotation=15)
plt.show()