# Import Libraries

In [2]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from tqdm import tqdm

# Load and preprocess data

In [4]:
data = pd.read_csv('Reviews.csv', encoding='ISO-8859-1', usecols=['Text', 'Score'])
data.dropna(subset=['Text'], inplace=True)
data['Label'] = data['Score'].apply(lambda x: 'negative' if x in [1, 2] else 'neutral' if x == 3 else 'positive')

# Sample for faster development

In [6]:
data = data.sample(10000, random_state=42).reset_index(drop=True)

# Lexicon-Based Sentiment Analysis with Progress Bar

In [8]:
def lexicon_sentiment(text):
    blob = TextBlob(text)
    tb_polarity = blob.sentiment.polarity
    tb_label = 'positive' if tb_polarity > 0 else 'negative' if tb_polarity < 0 else 'neutral'

    vader_score = SentimentIntensityAnalyzer().polarity_scores(text)['compound']
    vader_label = 'positive' if vader_score > 0.05 else 'negative' if vader_score < -0.05 else 'neutral'

    return pd.Series([tb_label, vader_label])

tqdm.pandas()
data[['TextBlob_Pred', 'VADER_Pred']] = data['Text'].progress_apply(lexicon_sentiment)

100%|██████████| 10000/10000 [01:24<00:00, 118.41it/s]


# Feature Extraction (TF-IDF)

In [10]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer.fit_transform(data['Text'])
y = data['Label']

# Train-Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

# Model Training - Naive Bayes & SVM

In [14]:
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Model Evaluation

In [16]:
nb_pred = nb_model.predict(X_test)
svm_pred = svm_model.predict(X_test)

print("Naive Bayes Classification Report:\n")
print(classification_report(y_test, nb_pred))

print("SVM Classification Report:\n")
print(classification_report(y_test, svm_pred))

Naive Bayes Classification Report:

              precision    recall  f1-score   support

    negative       0.79      0.04      0.07       280
     neutral       0.00      0.00      0.00       150
    positive       0.79      1.00      0.88      1570

    accuracy                           0.79      2000
   macro avg       0.53      0.35      0.32      2000
weighted avg       0.73      0.79      0.70      2000

SVM Classification Report:

              precision    recall  f1-score   support

    negative       0.73      0.46      0.57       280
     neutral       0.33      0.01      0.03       150
    positive       0.85      0.98      0.91      1570

    accuracy                           0.84      2000
   macro avg       0.64      0.49      0.50      2000
weighted avg       0.79      0.84      0.80      2000



# Export Results to CSV

In [18]:
results_df = data[['Text', 'Label', 'TextBlob_Pred', 'VADER_Pred']].copy()
results_df['NaiveBayes_Pred'] = nb_model.predict(X_tfidf)
results_df['SVM_Pred'] = svm_model.predict(X_tfidf)
results_df.to_csv('Sentiment_Analysis_Results.csv', index=False)
print("Exported results to Sentiment_Analysis_Results.csv")

Exported results to Sentiment_Analysis_Results.csv


### Discussion - Sentiment Model Comparison

**Name (ID Number):** Akmal Yazid Bin Abd Rahim (SN01082199), Muhammad Adlin Bin Yuzaimi (IS01082209)

**Naive Bayes:**
- Strengths: Simple, fast, performs decently on large text datasets.
- Weaknesses: Assumes word independence, may misclassify nuanced sentiments.

**SVM:**
- Strengths: Handles high-dimensional TF-IDF vectors well, good generalization.
- Weaknesses: Slower to train on large datasets, sensitive to parameter tuning.

**Lexicon-based models (TextBlob & VADER):**
- Strengths: No training required, interpretable.
- Weaknesses: Not context-aware, can't adapt to new data or domain.

Overall, SVM and Naive Bayes both outperform lexicon-based approaches in classification tasks, especially when trained on large, labeled datasets.
