In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv(r"C:\Users\night\Downloads\Reviews.csv", nrows=20000)

In [3]:
df.dropna(subset=["Text", "Score"], inplace=True)
df.drop_duplicates(subset=["UserId", "ProfileName", "Text"], inplace=True)

In [4]:
def get_sentiment(score):
    if score <= 2:
        return 'negative'
    elif score == 3:
        return 'neutral'
    else:
        return 'positive'

df['Sentiment'] = df['Score'].apply(get_sentiment)

In [5]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text.lower()

df['Clean_Text'] = df['Text'].apply(clean_text)

In [6]:
df = df.reset_index(drop=True)

In [7]:
X = df['Clean_Text']
y = df['Sentiment']

tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(X)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)


In [9]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [10]:
y_pred = model.predict(X_test)

In [11]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.71      0.50      0.59       568
     neutral       0.49      0.11      0.18       296
    positive       0.86      0.97      0.91      2998

    accuracy                           0.84      3862
   macro avg       0.69      0.53      0.56      3862
weighted avg       0.81      0.84      0.81      3862



In [12]:
results_df = pd.DataFrame({
    'Text': df.loc[y_test.index, 'Text'].values,
    'Clean_Text': df.loc[y_test.index, 'Clean_Text'].values,
    'Actual_Sentiment': y_test.values,
    'Predicted_Sentiment': y_pred
})

results_df.to_csv("sentiment_predictions_logistic.csv", index=False)
print("CSV saved as 'sentiment_predictions_logistic.csv'")

CSV saved as 'sentiment_predictions_logistic.csv'


In [30]:
"""
Name: Muaz
ID: XXXXXXXXX

Discussion:
The Naive Bayes model performs well on large and balanced datasets. It is fast and easy to implement.
However, it assumes feature independence, which is often not true in natural language.

On the other hand, VADER, being rule and lexicon-based, is good at handling short texts and social media-like content.
It does not require training but may miss out on contextual cues and complex expressions.

In general, ML-based models like Naive Bayes tend to outperform lexicon-based models when trained on large, domain-specific datasets.
"""

'\nName: Muaz\nID: XXXXXXXXXX\n\nDiscussion:\nThe Naive Bayes model performs well on large and balanced datasets. It is fast and easy to implement.\nHowever, it assumes feature independence, which is often not true in natural language.\n\nOn the other hand, VADER, being rule and lexicon-based, is good at handling short texts and social media-like content.\nIt does not require training but may miss out on contextual cues and complex expressions.\n\nIn general, ML-based models like Naive Bayes tend to outperform lexicon-based models when trained on large, domain-specific datasets.\n'