# SVM Training – Class Weighted Approach

This notebook contains:

* Data preprocessing
* Class imbalance handling
* Model training
* Evaluation metrics
* Confusion matrix



In [None]:
!pip install -q scikit-learn pandas numpy matplotlib seaborn joblib

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from google.colab import files


# Load the review export and report its raw shape and columns.
df = pd.read_csv("clean_amazon_reviews.csv")
print("\nDataset shape:", df.shape)
print("Columns:", df.columns.tolist())

# Keep only the text and target columns; drop rows missing either one and
# renumber the index so it is contiguous again.
required_columns = ['clean_review', 'sentiment']
df = df[required_columns].dropna().reset_index(drop=True)

print(f"\nCleaned dataset shape: {df.shape}")

# Per-class row counts, printed first as a Series and then as percentages
# of the cleaned dataset.
sentiment_counts = df['sentiment'].value_counts()
print("\nOriginal sentiment distribution:")
print(sentiment_counts)

print(f"\nClass distribution percentages:")
for label, count in sentiment_counts.items():
    print(f"  {label}: {count} ({count/len(df)*100:.1f}%)")


# Encode the sentiment column as integer ids stored in a new 'label' column.
le = LabelEncoder()
df['label'] = le.fit_transform(df['sentiment'])

# Original class name -> encoded id, for display and for the saved metadata.
encoded_ids = le.transform(le.classes_)
label_mapping = dict(zip(le.classes_, encoded_ids))
print(f"\nLabel mapping: {label_mapping}")


# 'balanced' weights computed over the whole cleaned dataset.
# These are reported and stored in the metadata only; the classifier itself
# is configured with class_weight='balanced' further down.
encoded_labels = df['label']
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(encoded_labels),
    y=encoded_labels,
)

# Position in the weight array -> weight.
class_weight_dict = dict(enumerate(class_weights))
print(f"\nCalculated class weights: {class_weight_dict}")


# 80/20 split with a fixed seed, stratified on the encoded label so both
# partitions keep the class proportions.
split_options = {
    'test_size': 0.2,
    'random_state': 42,
    'stratify': df['label'],
}
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_review'],
    df['label'],
    **split_options,
)

print(f"\nTrain samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

# Per-class counts of each partition, ordered by encoded label id.
train_dist, test_dist = (
    pd.Series(part).value_counts().sort_index()
    for part in (y_train, y_test)
)
print(f"\nTrain distribution: {train_dist.to_dict()}")
print(f"Test distribution: {test_dist.to_dict()}")


print("\nðŸ”„ Vectorizing text data with TF-IDF...")

# Vectorizer configuration kept in one place: unigrams + bigrams, capped
# vocabulary, document-frequency cut-offs and sublinear term frequency.
tfidf_settings = {
    'max_features': 15000,
    'ngram_range': (1, 2),
    'min_df': 5,
    'max_df': 0.8,
    'sublinear_tf': True,
}
tfidf = TfidfVectorizer(**tfidf_settings)

# Fit on the training text only; the test text is transformed with the
# already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"TF-IDF shape: {X_train_tfidf.shape}")
print(f"Vocabulary size: {len(tfidf.vocabulary_)}")


print("\nðŸš€ Training SVM with GridSearchCV and class weights...")

# Candidate settings explored by the search (3 x 2 combinations).
param_grid = {"C": [0.1, 1, 10], "max_iter": [2000, 5000]}

# Linear SVM with balanced class weights and a fixed seed.
base_svm = LinearSVC(class_weight='balanced', random_state=42, dual=False)

# 3-fold cross-validated search, scored by weighted F1, using all cores.
grid = GridSearchCV(
    estimator=base_svm,
    param_grid=param_grid,
    cv=3,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train_tfidf, y_train)

print(f"\nâœ… Best parameters: {grid.best_params_}")
print(f"Best cross-validation F1 score: {grid.best_score_:.4f}")

# Estimator for the winning parameter combination.
best_svm = grid.best_estimator_


print("\nðŸ”§ Calibrating SVM for probability estimates...")

# Wrap the tuned SVM in a sigmoid calibrator (3-fold) so that
# predict_proba is available downstream; fit() returns the fitted wrapper.
calibrated_svm = CalibratedClassifierCV(
    best_svm,
    cv=3,
    method='sigmoid',
).fit(X_train_tfidf, y_train)

print("âœ… Calibration complete!")


print("\nðŸ“Š Evaluating model on test set...")

# Hard predictions and per-class probabilities from the calibrated model.
y_pred = calibrated_svm.predict(X_test_tfidf)
y_pred_proba = calibrated_svm.predict_proba(X_test_tfidf)

# Aggregate metrics on the held-out test split. "weighted" averages the
# per-class score by class support; "macro" treats every class equally.
accuracy = accuracy_score(y_test, y_pred)
f1_weighted = f1_score(y_test, y_pred, average='weighted')
f1_macro = f1_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("\nTest Set Metrics:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  F1 (weighted): {f1_weighted:.4f}")
print(f"  F1 (macro): {f1_macro:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall: {recall:.4f}")

# Convert encoded ids back to the original class names for reporting.
y_true_labels = le.inverse_transform(y_test)
y_pred_labels = le.inverse_transform(y_pred)

print("\nðŸ“‹ Detailed Classification Report:")
# Pass the label set explicitly so the rows always line up with
# target_names, even if a rare class is missing from both the test split
# and the predictions (otherwise the inferred label count would not match
# target_names). Display names are stringified so non-string class values
# can still be formatted.
print(classification_report(
    y_true_labels,
    y_pred_labels,
    labels=le.classes_,
    target_names=[str(class_name) for class_name in le.classes_],
))


# Confusion matrix over the original class names, in label-encoder order,
# wrapped in a DataFrame so the heatmap axes carry the names.
class_names = le.classes_
cm = confusion_matrix(y_true_labels, y_pred_labels, labels=class_names)
df_cm = pd.DataFrame(data=cm, index=class_names, columns=class_names)

# Annotated heatmap of the raw counts.
heatmap_options = {
    'annot': True,
    'fmt': "d",
    'cmap': "Blues",
    'cbar_kws': {'label': 'Count'},
}
plt.figure(figsize=(8, 6))
sns.heatmap(df_cm, **heatmap_options)
plt.title("Confusion Matrix - SVM Class Weighted Model")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.tight_layout()
plt.show()


print("\nðŸ“ˆ Top predictive features per class:")

# Vocabulary terms, indexable by TF-IDF column position.
feature_names = np.array(tfidf.get_feature_names_out())

if hasattr(best_svm, 'coef_'):
    coef = best_svm.coef_

    # A two-class linear model stores a single weight row (positive weights
    # favour the second class), so indexing coef[1] would raise IndexError.
    # Expand it to one row per class: the first class is scored by the
    # negated weights, the second by the weights as stored.
    if coef.shape[0] == 1 and len(le.classes_) == 2:
        per_class_coef = np.vstack([-coef[0], coef[0]])
    else:
        per_class_coef = coef

    for idx, class_name in enumerate(le.classes_):
        # str() keeps this working when class names are not strings.
        print(f"\n{str(class_name).upper()}:")
        # Ten largest weights for this class, highest first.
        top_indices = np.argsort(per_class_coef[idx])[-10:][::-1]
        top_features = feature_names[top_indices]
        top_scores = per_class_coef[idx][top_indices]

        for feature, score in zip(top_features, top_scores):
            print(f"  {feature}: {score:.4f}")


print("\nðŸ’¾ Saving model and components...")
model_save_path = "svm_sentiment_weighted"
os.makedirs(model_save_path, exist_ok=True)

joblib.dump(calibrated_svm, f"{model_save_path}/svm_model.pkl")
joblib.dump(tfidf, f"{model_save_path}/tfidf.pkl")
joblib.dump(le, f"{model_save_path}/label_encoder.pkl")

metadata = {
    'class_weights': class_weight_dict,
    'label_mapping': label_mapping,
    'num_classes': len(le.classes_),
    'classes': le.classes_.tolist(),
    'best_params': grid.best_params_,
    'vocabulary_size': len(tfidf.vocabulary_),
    'test_accuracy': accuracy,
    'test_f1': f1_weighted
}
joblib.dump(metadata, f"{model_save_path}/model_metadata.pkl")

print("âœ… Model saved successfully!")

!zip -r svm_sentiment_weighted.zip svm_sentiment_weighted

print("ðŸ“¥ Downloading model...")
files.download("svm_sentiment_weighted.zip")


class SVMSentimentPredictor:
    """Load the saved sentiment artifacts and classify raw review text.

    The predictor reads four joblib files from a directory: the classifier,
    the TF-IDF vectorizer, the label encoder and a metadata dict.
    """

    def __init__(self, model_path="svm_sentiment_weighted"):
        """Load every artifact from ``model_path`` and print the class list.

        Args:
            model_path: Directory containing ``svm_model.pkl``,
                ``tfidf.pkl``, ``label_encoder.pkl`` and
                ``model_metadata.pkl``.
        """
        # NOTE(security): joblib.load unpickles its input, which can execute
        # arbitrary code. Only load artifact directories you trust.
        self.model = joblib.load(f"{model_path}/svm_model.pkl")
        self.tfidf = joblib.load(f"{model_path}/tfidf.pkl")
        self.label_encoder = joblib.load(f"{model_path}/label_encoder.pkl")
        self.metadata = joblib.load(f"{model_path}/model_metadata.pkl")

        print(f"âœ… SVM Sentiment predictor loaded successfully!")
        print(f"   Classes: {self.metadata['classes']}")

    def predict(self, texts):
        """Classify one text or an iterable of texts.

        Args:
            texts: A single string, or any iterable of strings (list, tuple,
                generator, ...).

        Returns:
            For each text a dict with keys ``text``, ``predicted_label``,
            ``confidence`` (the highest class probability) and
            ``probabilities`` (class name -> probability). A single string,
            or an iterable holding exactly one text, yields that dict
            directly; several texts yield a list of dicts; an empty
            iterable yields an empty list.
        """
        if isinstance(texts, str):
            texts = [texts]
        else:
            # Materialise once: the input is walked several times below, and
            # a generator would be exhausted by the vectorizer and has no len().
            texts = list(texts)

        # Nothing to classify; avoid handing an empty batch to the model.
        if not texts:
            return []

        X_vec = self.tfidf.transform(texts)

        preds = self.model.predict(X_vec)
        probs = self.model.predict_proba(X_vec)

        labels = self.label_encoder.inverse_transform(preds)
        class_names = self.label_encoder.classes_

        results = [
            {
                "text": text,
                "predicted_label": label,
                "confidence": float(np.max(row)),
                # Probability columns are paired with the encoder's classes
                # by position.
                "probabilities": {
                    name: float(prob) for name, prob in zip(class_names, row)
                },
            }
            for text, label, row in zip(texts, labels, probs)
        ]

        # Kept for backward compatibility: a one-element batch is unwrapped.
        return results[0] if len(texts) == 1 else results

# Reload the artifacts from disk through the predictor wrapper.
predictor = SVMSentimentPredictor()


print("\nðŸ§ª Testing the trained SVM model:")

# Hand-written sample reviews used as a smoke test.
test_cases = [
    "This product is amazing and works perfectly!",
    "Worst purchase ever, totally disappointed",
    "Quality is okay, nothing special",
    "The quality was bad but delivery was okay",
    "Excellent service and fast shipping",
    "Not worth the money, poor quality"
]

# One call per text: each call returns a single result dict.
print("\nSingle predictions:")
for text, result in zip(test_cases, map(predictor.predict, test_cases)):
    print(f"Text: '{text}'")
    print(f"Prediction: {result['predicted_label']} (confidence: {result['confidence']:.3f})")
    print(f"Probabilities: {result['probabilities']}")
    print("-" * 50)

# One call for the whole list: returns a list of result dicts.
print("\nBatch prediction:")
batch_results = predictor.predict(test_cases)
for result in batch_results:
    print(f"{result['predicted_label']}: {result['text']}")

print("\nðŸŽ‰ Complete! Your optimized SVM class-weighted sentiment model is ready!")
