In [None]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE

In [None]:
# Read the labelled reviews CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/labelled_data.csv')

In [None]:
# Fail fast unless both the text column and the label column are present
if not ('review_text' in df.columns and 'majority_vote' in df.columns):
    raise ValueError("Dataset must contain 'review_text' and 'majority_vote' columns.")

In [None]:
# Features (X): raw review text; labels (y): the 'majority_vote' column
X = df['review_text']
raw_labels = df['majority_vote']

# Map string labels to the integer codes used by the classifiers
label_mapping = {'positive': 1, 'negative': 0, 'neutral': 2}
y = raw_labels.map(label_mapping)

# Series.map yields NaN for every value that is not a key of label_mapping
# (including missing labels). Surface that here with a clear message instead
# of letting NaN labels flow into the split and fail later at model fitting.
unmapped = y.isna()
if unmapped.any():
    unexpected = sorted(raw_labels[unmapped].astype(str).unique())
    raise ValueError(
        f"{int(unmapped.sum())} rows have labels outside "
        f"{sorted(label_mapping)}: {unexpected}"
    )

# Check class distribution
print("Class Distribution:")
print(y.value_counts())

Class Distribution:
majority_vote
1    11227
0      565
2       16
Name: count, dtype: int64


In [None]:
# 1. Count Vectorizer
count_vectorizer = CountVectorizer(max_features=3000, min_df=5, max_df=0.8)

# Learn the vocabulary (and the min_df/max_df filtering) from training rows
# only, so no information from the held-out reviews leaks into the features.
# train_test_split's shuffle depends only on the number of rows and the seed,
# so these arguments MUST stay identical to the train/test split applied to
# X_count later in the notebook for the same rows to land in the training set.
X_train_text = train_test_split(X, test_size=0.2, random_state=42)[0]
count_vectorizer.fit(X_train_text)

# Transform every row with the train-fitted vocabulary; the later split cell
# slices this matrix into train and test parts.
X_count = count_vectorizer.transform(X)

In [None]:
# 2. TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=3000, min_df=5, max_df=0.8)

# Learn the vocabulary and IDF weights from training rows only, so document
# frequencies of held-out reviews do not leak into the features.
# train_test_split's shuffle depends only on the number of rows and the seed,
# so these arguments MUST stay identical to the train/test split applied to
# X_tfidf later in the notebook for the same rows to land in the training set.
X_train_text = train_test_split(X, test_size=0.2, random_state=42)[0]
tfidf_vectorizer.fit(X_train_text)

# Transform every row with the train-fitted vocabulary/IDF; the later split
# cell slices this matrix into train and test parts.
X_tfidf = tfidf_vectorizer.transform(X)

In [None]:
# Display shapes of feature matrices
print(f"Count Vectorizer Shape: {X_count.shape}")
print(f"TF-IDF Vectorizer Shape: {X_tfidf.shape}")

# Train-Test Split
# A single call applies one shuffle to both feature matrices and the labels,
# so row alignment between X_*_count, X_*_tfidf and y_* is guaranteed by
# construction rather than by two calls happening to share a seed. It also
# avoids assigning to `_`, which IPython uses for the last cell output.
# The resulting partition is the same as splitting each matrix separately
# with test_size=0.2 and random_state=42.
(X_train_count, X_test_count,
 X_train_tfidf, X_test_tfidf,
 y_train, y_test) = train_test_split(
    X_count, X_tfidf, y, test_size=0.2, random_state=42
)


Count Vectorizer Shape: (11808, 3000)
TF-IDF Vectorizer Shape: (11808, 3000)


In [None]:
# Baseline model: linear-kernel SVM with no class-imbalance handling
print("\n=== Baseline SVM (No Balancing) ===")
svm_baseline = SVC(
    random_state=42,
    kernel='linear',
)


=== Baseline SVM (No Balancing) ===


In [None]:
# Train and Evaluate on Count Vectorizer Features
svm_baseline.fit(X_train_count, y_train)
y_pred_count = svm_baseline.predict(X_test_count)
print("\nPerformance with Count Vectorizer:")
# Precision is undefined for a class the model never predicts;
# zero_division=0 reports it as 0 explicitly instead of emitting a warning.
print(classification_report(y_test, y_pred_count, zero_division=0))
print(f"Accuracy: {accuracy_score(y_test, y_pred_count):.2f}")


Performance with Count Vectorizer:
              precision    recall  f1-score   support

           0       0.65      0.72      0.68        92
           1       0.99      0.99      0.99      2266
           2       0.00      0.00      0.00         4

    accuracy                           0.97      2362
   macro avg       0.55      0.57      0.56      2362
weighted avg       0.97      0.97      0.97      2362

Accuracy: 0.97


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Train and Evaluate on TF-IDF Features
# Note: this refits the same svm_baseline estimator, replacing the model
# previously fitted on the Count Vectorizer features.
svm_baseline.fit(X_train_tfidf, y_train)
y_pred_tfidf = svm_baseline.predict(X_test_tfidf)
print("\nPerformance with TF-IDF Vectorizer:")
# Precision is undefined for a class the model never predicts;
# zero_division=0 reports it as 0 explicitly instead of emitting a warning.
print(classification_report(y_test, y_pred_tfidf, zero_division=0))
print(f"Accuracy: {accuracy_score(y_test, y_pred_tfidf):.2f}")


Performance with TF-IDF Vectorizer:
              precision    recall  f1-score   support

           0       0.77      0.62      0.69        92
           1       0.98      0.99      0.99      2266
           2       0.00      0.00      0.00         4

    accuracy                           0.98      2362
   macro avg       0.58      0.54      0.56      2362
weighted avg       0.97      0.98      0.98      2362

Accuracy: 0.98


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Linear-kernel SVM using class_weight='balanced' to counter class imbalance
print("\n=== SVM with Class Weight Balancing ===")
svm_weighted = SVC(
    class_weight='balanced',
    random_state=42,
    kernel='linear',
)



=== SVM with Class Weight Balancing ===


In [None]:
# Train and Evaluate on Count Vectorizer Features
svm_weighted.fit(X_train_count, y_train)
y_pred_weighted_count = svm_weighted.predict(X_test_count)
print("\nPerformance with Count Vectorizer (Balanced Weights):")
# Precision is undefined for a class the model never predicts;
# zero_division=0 reports it as 0 explicitly instead of emitting a warning.
print(classification_report(y_test, y_pred_weighted_count, zero_division=0))
print(f"Accuracy: {accuracy_score(y_test, y_pred_weighted_count):.2f}")



Performance with Count Vectorizer (Balanced Weights):
              precision    recall  f1-score   support

           0       0.66      0.73      0.69        92
           1       0.99      0.99      0.99      2266
           2       0.00      0.00      0.00         4

    accuracy                           0.97      2362
   macro avg       0.55      0.57      0.56      2362
weighted avg       0.97      0.97      0.97      2362

Accuracy: 0.97


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Train and Evaluate on TF-IDF Features
# Note: this refits the same svm_weighted estimator, replacing the model
# previously fitted on the Count Vectorizer features.
svm_weighted.fit(X_train_tfidf, y_train)
y_pred_weighted_tfidf = svm_weighted.predict(X_test_tfidf)
print("\nPerformance with TF-IDF Vectorizer (Balanced Weights):")
# Precision is undefined for a class the model never predicts;
# zero_division=0 reports it as 0 explicitly instead of emitting a warning.
print(classification_report(y_test, y_pred_weighted_tfidf, zero_division=0))
print(f"Accuracy: {accuracy_score(y_test, y_pred_weighted_tfidf):.2f}")



Performance with TF-IDF Vectorizer (Balanced Weights):
              precision    recall  f1-score   support

           0       0.55      0.84      0.67        92
           1       0.99      0.97      0.98      2266
           2       0.00      0.00      0.00         4

    accuracy                           0.97      2362
   macro avg       0.52      0.60      0.55      2362
weighted avg       0.97      0.97      0.97      2362

Accuracy: 0.97


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# SMOTE Oversampling
print("\n=== SVM with SMOTE Oversampling ===")
# Oversample only the training split of the Count Vectorizer features;
# the test split is left untouched for evaluation.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_count, y_train)

# Report the per-class sample counts of the resampled training labels
smote_class_counts = pd.Series(y_train_smote).value_counts()
print("Class distribution after SMOTE:")
print(smote_class_counts)



=== SVM with SMOTE Oversampling ===
Class distribution after SMOTE:
majority_vote
1    8961
0    8961
2    8961
Name: count, dtype: int64


In [None]:
# Train and Evaluate on SMOTE-balanced data
# Fit on the oversampled training set, evaluate on the original test split.
svm_smote = SVC(kernel='linear', random_state=42)
svm_smote.fit(X_train_smote, y_train_smote)
y_pred_smote = svm_smote.predict(X_test_count)
print("\nPerformance with SMOTE Oversampling:")
# Precision is undefined for a class the model never predicts;
# zero_division=0 reports it as 0 explicitly instead of emitting a warning.
print(classification_report(y_test, y_pred_smote, zero_division=0))
print(f"Accuracy: {accuracy_score(y_test, y_pred_smote):.2f}")



Performance with SMOTE Oversampling:
              precision    recall  f1-score   support

           0       0.56      0.64      0.60        92
           1       0.98      0.98      0.98      2266
           2       0.00      0.00      0.00         4

    accuracy                           0.97      2362
   macro avg       0.52      0.54      0.53      2362
weighted avg       0.97      0.97      0.97      2362

Accuracy: 0.97


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Calculate Evaluation Metrics (Precision, Recall, F1-Score) for the baseline
# SVM predictions on Count Vectorizer features (y_pred_count).
# average='weighted' weights each class by its support, so the result is
# dominated by the majority class. zero_division=0 scores a class with no
# predicted samples as 0 explicitly instead of emitting a warning.
metric_kwargs = {'average': 'weighted', 'zero_division': 0}
precision = precision_score(y_test, y_pred_count, **metric_kwargs)
recall = recall_score(y_test, y_pred_count, **metric_kwargs)
f1 = f1_score(y_test, y_pred_count, **metric_kwargs)

print("\n=== Evaluation Metrics for Baseline SVM ===")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")



=== Evaluation Metrics for Baseline SVM ===
Precision: 0.97
Recall: 0.97
F1-Score: 0.97


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
