In [44]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from imblearn.over_sampling import SMOTE
from scipy.sparse import load_npz

In [45]:
# Read the labelled reviews CSV into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='/content/labelled_data.csv',
)

In [46]:
# Fail fast if any column used by the later cells is absent:
#   'review_text'   - raw text fed to the vectorizers
#   'majority_vote' - used to filter rows to positive/negative
#   'ground_truth'  - encoded and used as the classification target
required_columns = ['review_text', 'majority_vote', 'ground_truth']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(
        "Dataset must contain 'review_text', 'majority_vote' and 'ground_truth' columns; "
        f"missing: {missing_columns}"
    )

In [47]:
# Keep only the rows whose majority vote is 'positive' or 'negative'.
# .copy() makes the filtered result an independent DataFrame, so the column
# assignments in later cells write to it directly rather than to what pandas
# considers a slice of the original (the cause of SettingWithCopyWarning).
df = df.loc[df['majority_vote'].isin(['positive', 'negative'])].copy()

In [52]:
# Replace each ground_truth label with its integer category code
df['ground_truth'] = df['ground_truth'].astype('category').cat.codes

In [53]:
# Read the previously saved sparse matrices (count first, then TF-IDF).
# NOTE(review): no later cell visible here references count_vectors or
# tfidf_vectors - features are rebuilt from freshly fitted vectorizers -
# so confirm whether these loads are still needed.
# NOTE(review): the TF-IDF file name contains a space before '.npz'; kept
# as-is - verify it matches the file on disk.
count_vectors, tfidf_vectors = (
    load_npz(npz_path)
    for npz_path in ('/content/count_matrix.npz', '/content/tfidf_matrix .npz')
)

In [54]:
# After filtering, renumber the rows 0..n-1
df = df.reset_index(
    drop=True,  # discard the old labels instead of keeping them as a column
)

In [55]:
# 80/20 train/test split of review text vs. encoded labels,
# stratified on the labels with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    df['review_text'],
    df['ground_truth'],
    test_size=0.2,
    stratify=df['ground_truth'],
    random_state=42,
)

In [56]:
# Create fresh (unfitted) vectorizers with default settings:
# one for raw term counts, one for TF-IDF weights
count_vectorizer, tfidf_vectorizer = CountVectorizer(), TfidfVectorizer()

In [57]:
# Learn each vocabulary from the training text only and vectorize it
X_train_count = count_vectorizer.fit_transform(raw_documents=X_train)
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)

In [58]:
# Vectorize the test text with the already-fitted vectorizers (no refit)
X_test_count = count_vectorizer.transform(raw_documents=X_test)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [59]:
# Run the SVM train/evaluate helper on the count features with the SMOTE option on.
# NOTE(review): train_evaluate_svm is not defined in any cell visible here -
# confirm its defining cell still exists so Restart & Run All succeeds.
train_evaluate_svm(
    X_train_count,
    X_test_count,
    y_train,
    y_test,
    'Count Vectorizer',
    use_smote=True,
)


Class distribution after SMOTE (Count Vectorizer):
ground_truth
1    8981
0    8981
Name: count, dtype: int64

Performance with Count Vectorizer (with SMOTE):
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.69      0.68       113
           1       0.98      0.98      0.98      2246

    accuracy                           0.97      2359
   macro avg       0.83      0.84      0.83      2359
weighted avg       0.97      0.97      0.97      2359

Accuracy: 0.9691
Precision: 0.9844, Recall: 0.9831, F1-Score: 0.9837


In [60]:
# Run the SVM train/evaluate helper on the TF-IDF features with the SMOTE option on.
# NOTE(review): train_evaluate_svm is not defined in any cell visible here -
# confirm its defining cell still exists so Restart & Run All succeeds.
train_evaluate_svm(
    X_train_tfidf,
    X_test_tfidf,
    y_train,
    y_test,
    'TF-IDF Vectorizer',
    use_smote=True,
)



Class distribution after SMOTE (TF-IDF Vectorizer):
ground_truth
1    8981
0    8981
Name: count, dtype: int64

Performance with TF-IDF Vectorizer (with SMOTE):
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.72      0.75       113
           1       0.99      0.99      0.99      2246

    accuracy                           0.98      2359
   macro avg       0.89      0.85      0.87      2359
weighted avg       0.98      0.98      0.98      2359

Accuracy: 0.9775
Precision: 0.9858, Recall: 0.9907, F1-Score: 0.9882
