In [1]:
# --- 0. IMPORTS & SETUP ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import uniform, loguniform

# New library for SMOTE
from imblearn.over_sampling import SMOTE
from collections import Counter

# --- 1. DATA LOADING & CLEANING ---
print("Loading and preparing data...")
# Replace with your actual file path
df = pd.read_csv("7282_1.csv", sep=',')

# Keep only relevant columns
df = df[['reviews.text', 'reviews.rating']].copy()

# Ensure ratings are numeric
df['reviews.rating'] = pd.to_numeric(df['reviews.rating'], errors='coerce')

# Define Binary Labels:
# 0 (Negative) for ratings 1 & 2
# 1 (Positive) for ratings 4 & 5
# -1 for others (to be removed)
df['sentiment'] = np.where(df['reviews.rating'].isin([1, 2]), 0, 
                  np.where(df['reviews.rating'].isin([4, 5]), 1, -1))

# Filter out neutrals/NaNs
df_clean = df[df['sentiment'] != -1].copy()

print(f"Original row count: {len(df)}")
print(f"Cleaned row count: {len(df_clean)}")
print(f"Class balance before split:\n{df_clean['sentiment'].value_counts()}")

# Define Features (X) and Labels (y)
X = df_clean['reviews.text']
y = df_clean['sentiment']

# --- 2. TRAIN/TEST SPLIT ---
# Stratify ensures the test set keeps the original imbalance for a fair evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# --- 3. TF-IDF VECTORIZATION ---
print("\nVectorizing text data...")
# Limiting to 5000 features to keep SMOTE fast
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))

# Fit on train, transform both
X_train_vec = vectorizer.fit_transform(X_train.astype(str))
X_test_vec = vectorizer.transform(X_test.astype(str))

# --- 4. APPLYING SMOTE (The Fix) ---
print("\n--- Applying SMOTE Oversampling ---")
print(f"Original Training Counts: {Counter(y_train)}")

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vec, y_train)

print(f"Resampled Training Counts: {Counter(y_train_resampled)}")
print("Note: The 'Negative' class has been upsampled to match the 'Positive' class.")

# --- 5. HYPERPARAMETER TUNING (Randomized Search) ---
print("\n--- Starting Randomized Search on Balanced Data ---")

# Parameter grid to search
param_distributions = {
    'C': loguniform(0.1, 100), 
    'kernel': ['linear', 'rbf'], 
    'gamma': ['scale', 'auto'],
}

# Base SVM
# Note: removed class_weight='balanced' because SMOTE already balanced the data
base_svm = SVC(random_state=42, probability=True)

# Randomized Search Setup
random_search = RandomizedSearchCV(
    estimator=base_svm, 
    param_distributions=param_distributions, 
    n_iter=10,            # You can increase this to 20 or 30 if you have time
    scoring='f1',         # Optimizing for F1 Score
    cv=3, 
    verbose=1, 
    random_state=42, 
    n_jobs=-1             # Use all CPU cores
)

# Fit on the RESAMPLED training data
random_search.fit(X_train_resampled, y_train_resampled)

print(f"\nBest CV F1-Score: {random_search.best_score_:.4f}")
print(f"Best Hyperparameters: {random_search.best_params_}")

# --- 6. FINAL EVALUATION ---
# We evaluate on the ORIGINAL Test set (X_test_vec), not a resampled one.
# This ensures we test on "real" data distributions.

best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_vec)

print("\n" + "="*60)
print("FINAL MODEL PERFORMANCE (TRAINED WITH SMOTE)")
print("="*60)
print(classification_report(y_test, y_pred, target_names=['Negative (0)', 'Positive (1)']))

# Check Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {acc:.2%}")

Loading and preparing data...
Original row count: 35912
Cleaned row count: 28473
Class balance before split:
sentiment
1    22426
0     6047
Name: count, dtype: int64

Training set size: 19931
Testing set size: 8542

Vectorizing text data...

--- Applying SMOTE Oversampling ---
Original Training Counts: Counter({1: 15698, 0: 4233})
Resampled Training Counts: Counter({1: 15698, 0: 15698})
Note: The 'Negative' class has been upsampled to match the 'Positive' class.

--- Starting Randomized Search on Balanced Data ---
Fitting 3 folds for each of 10 candidates, totalling 30 fits

Best CV F1-Score: 0.9151
Best Hyperparameters: {'C': 6.358358856676251, 'gamma': 'auto', 'kernel': 'linear'}

FINAL MODEL PERFORMANCE (TRAINED WITH SMOTE)
              precision    recall  f1-score   support

Negative (0)       0.64      0.78      0.70      1814
Positive (1)       0.94      0.88      0.91      6728

    accuracy                           0.86      8542
   macro avg       0.79      0.83      0.81 