In [1]:
# Import the libraries used for vectorization, splitting, SVM training, evaluation and oversampling
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE

In [2]:
# Read the labelled reviews from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/labelled_data.csv')

In [3]:
# Fail fast unless both the text column and the label column are present
if not {'review_text', 'majority_vote'}.issubset(df.columns):
    raise ValueError("Dataset must contain 'review_text' and 'majority_vote' columns.")

In [4]:
# Numeric encoding of the two sentiment labels kept for this binary task
label_mapping = {'positive': 1, 'negative': 0}

# Keep only the rows whose vote is one of the two mapped labels
is_binary_label = df['majority_vote'].isin(list(label_mapping))
df = df.loc[is_binary_label]

# Features (X) are the raw review texts; labels (y) are the encoded votes
X = df['review_text']
y = df['majority_vote'].map(label_mapping)

# Show how many rows fall into each class
print("Class Distribution:", y.value_counts(), sep="\n")

Class Distribution:
majority_vote
1    11227
0      565
Name: count, dtype: int64


In [6]:
# Vectorization: build a bag-of-words matrix and a TF-IDF matrix from the
# same texts, using the same vocabulary limits for both.
# NOTE(review): both vectorizers are fit on the full corpus here, before the
# train/test split in a later cell, so the test documents contribute to the
# vocabulary and document-frequency statistics. Consider fitting on the
# training texts only.
VECTORIZER_PARAMS = dict(max_features=3000, min_df=5, max_df=0.8)

count_vectorizer = CountVectorizer(**VECTORIZER_PARAMS)
tfidf_vectorizer = TfidfVectorizer(**VECTORIZER_PARAMS)

X_count = count_vectorizer.fit_transform(X)
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [7]:
# Function to train and evaluate SVM
def train_evaluate_svm_binary(X_train, X_test, y_train, y_test, vectorizer_name,
                              use_smote=False, pos_label=1):
    """Train a linear SVM on a binary task and print its test-set metrics.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Binary labels aligned with ``X_train`` and ``X_test``.
    vectorizer_name : str
        Name of the feature representation; used only in printed headers.
    use_smote : bool, default False
        When True, the training split is resampled with
        ``SMOTE(random_state=42)`` before fitting. The test split is never
        resampled.
    pos_label : default 1
        Class treated as "positive" in the final precision/recall/F1 summary
        line. The default matches the previous behavior; pass the minority
        class label to summarize that class instead.

    Returns
    -------
    SVC
        The fitted model. The metrics are printed, not returned.
    """
    # Optionally oversample the training data only
    if use_smote:
        smote = SMOTE(random_state=42)
        X_train, y_train = smote.fit_resample(X_train, y_train)
        print(f"\nClass distribution after SMOTE ({vectorizer_name}):")
        print(pd.Series(y_train).value_counts())

    # Linear SVM with class weights inversely proportional to class frequency
    svm_model = SVC(kernel='linear', class_weight='balanced', random_state=42)
    svm_model.fit(X_train, y_train)

    # Predict on the held-out split
    y_pred = svm_model.predict(X_test)

    # Build the header suffix separately so the non-SMOTE header has no
    # stray space before the colon.
    smote_suffix = ' (with SMOTE)' if use_smote else ''
    print(f"\nPerformance with {vectorizer_name}{smote_suffix}:")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    # One-line summary for the class selected by pos_label
    precision = precision_score(y_test, y_pred, average='binary', pos_label=pos_label)
    recall = recall_score(y_test, y_pred, average='binary', pos_label=pos_label)
    f1 = f1_score(y_test, y_pred, average='binary', pos_label=pos_label)
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

    return svm_model

In [9]:
# scikit-learn (including train_test_split) is already imported in the first
# cell, so no install step or re-import is needed here.

# Split both feature matrices and the labels in ONE call so the Count and
# TF-IDF splits are guaranteed to contain the same rows and share the same
# labels. With the same random_state and number of rows this yields the same
# partition as splitting each matrix separately.
# NOTE(review): the classes are heavily imbalanced; consider stratify=y so the
# test split keeps the same class ratio (this would change the reported numbers).
(
    X_train_count, X_test_count,
    X_train_tfidf, X_test_tfidf,
    y_train, y_test,
) = train_test_split(
    X_count, X_tfidf, y, test_size=0.2, random_state=42
)



In [11]:
# Train and evaluate the SMOTE-oversampled SVM once per feature representation
train_evaluate_svm_binary(
    X_train=X_train_count,
    X_test=X_test_count,
    y_train=y_train,
    y_test=y_test,
    vectorizer_name='Count Vectorizer',
    use_smote=True,
)
train_evaluate_svm_binary(
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y_train=y_train,
    y_test=y_test,
    vectorizer_name='TF-IDF Vectorizer',
    use_smote=True,
)


Class distribution after SMOTE (Count Vectorizer):
majority_vote
1    8987
0    8987
Name: count, dtype: int64

Performance with Count Vectorizer (with SMOTE):
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.70      0.68       119
           1       0.98      0.98      0.98      2240

    accuracy                           0.97      2359
   macro avg       0.83      0.84      0.83      2359
weighted avg       0.97      0.97      0.97      2359

Accuracy: 0.9674
Precision: 0.9839, Recall: 0.9817, F1-Score: 0.9828

Class distribution after SMOTE (TF-IDF Vectorizer):
majority_vote
1    8987
0    8987
Name: count, dtype: int64

Performance with TF-IDF Vectorizer (with SMOTE):
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.75      0.75       119
           1       0.99      0.99      0.99      2240

    accuracy                           0.97      2359
   macro avg    