In [2]:

# SECTION 1: Setup
# ---------------------------------------------------------------


# Install required libraries (run once per environment)
# !pip install pandas scikit-learn nltk


import re
import random
import pandas as pd
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [16]:
# SECTION 2: Load and Prepare Dataset
# ---------------------------------------------------------------


# Download the NLTK movie reviews corpus 
import nltk
nltk.download('movie_reviews')


docs = [(movie_reviews.raw(fileid), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(docs)


df = pd.DataFrame(docs, columns=["text", "label"])
df = df.sample(2000, random_state=42)


X_train, X_test, y_train, y_test = train_test_split(
df["text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"])

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/taranesh/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [23]:

# SECTION 3: Rule-Based Sentiment Classifier
# ---------------------------------------------------------------


positive_words = {"good", "great", "amazing", "excellent", "happy", "wonderful", "fantastic", "love", "best", "awesome"}
negative_words = {"bad", "terrible", "awful", "horrible", "sad", "hate", "worst", "boring", "poor", "annoying"}


def rule_based_sentiment(text):
  text = text.lower()
  score = 0


  for word in positive_words:
    if re.search(rf"\b{word}\b", text):
      score += 1
  for word in negative_words:
    if re.search(rf"\b{word}\b", text):
      score -= 1

  #this is one example rule: if "not" then reverse sentiment
  if re.search(r"not\s+\b(good|great|amazing|wonderful|recommend)\b", text):
    score -= 2
  if re.search(r"not\s+\b(bad|terrible|awful|boring)\b", text):
    score += 2
  if re.search(r"(very|extremely|so)\s+\b(good|great|amazing|happy)\b", text):
    score += 1

  if "!" in text:
    if score > 0: score += 1
    if score < 0: score -= 1
  #implement more rules here!


  if score > 0:
    return "pos"
  elif score < 0:
    return "neg"
  else:
    return "neutral"


#These are example to test your implementation with. 
examples = [
"I loved this movie, it was amazing!",
    "This film was not good at all.",     
    "The acting was terrible and the plot was boring.", 
    "It was not a bad experience.",        
    "I am very happy with the result!",  
    "The movie was okay, nothing special.", 
    "I would not recommend this to anyone.",
    "The weather is cloudy today."
]
for e in examples:
  print(e, "->", rule_based_sentiment(e))


rule_preds = X_test.apply(rule_based_sentiment)
print("\nRule-based accuracy:", accuracy_score(y_test, rule_preds))

I loved this movie, it was amazing! -> pos
This film was not good at all. -> neg
The acting was terrible and the plot was boring. -> neg
It was not a bad experience. -> neg
I am very happy with the result! -> pos
The movie was okay, nothing special. -> neutral
I would not recommend this to anyone. -> neg
The weather is cloudy today. -> neutral

Rule-based accuracy: 0.535


In [24]:

#  SECTION 4: Machine Learning Classifier (Logistic Regression)
# ---------------------------------------------------------------


vectorizer = CountVectorizer(stop_words="english", max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)


y_pred = clf.predict(X_test_vec)
print("\nML classifier accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


feature_weights = sorted(zip(clf.coef_[0], vectorizer.get_feature_names_out()))
print("\nTop positive features:", feature_weights[-10:])
print("Top negative features:", feature_weights[:10])


ML classifier accuracy: 0.845
              precision    recall  f1-score   support

         neg       0.85      0.84      0.84       200
         pos       0.84      0.85      0.85       200

    accuracy                           0.84       400
   macro avg       0.85      0.84      0.84       400
weighted avg       0.85      0.84      0.84       400


Top positive features: [(0.41285935829562886, 'pulp'), (0.4196824138885708, 'pace'), (0.42110582436530747, 'american'), (0.4444855884498654, 'different'), (0.45137610746669304, 'perfectly'), (0.46100053999446794, 'terrific'), (0.4715821854529027, 'excellent'), (0.5623434558237801, 'overall'), (0.5977160670580506, 'great'), (0.7385084307400356, 'fun')]
Top negative features: [(-0.7954349688401184, 'supposed'), (-0.7846747406523165, 'bad'), (-0.6658456069693203, 'boring'), (-0.635418236197664, 'worst'), (-0.5965813753896935, 'poor'), (-0.5815403770708105, 'guess'), (-0.5782089730408099, 'looks'), (-0.568185534830745, 'maybe'), (-0.5589

In [26]:

#  SECTION 5: Comparison
# ---------------------------------------------------------------


comparison = pd.DataFrame({
"text": X_test.values[:10],
"rule_based": rule_preds.values[:10],
"ml_based": y_pred[:10],
"true_label": y_test.values[:10]
})
print("\nSample comparison:")
print(comparison)


print("\nRule-based accuracy:", accuracy_score(y_test, rule_preds))
print("ML accuracy:", accuracy_score(y_test, y_pred))


Sample comparison:
                                                text rule_based ml_based  \
0  from dusk till dawn ( director/editor : robert...    neutral      neg   
1  i have a great idea for a movie , one that can...    neutral      neg   
2  i think the first thing this reviewer should m...        pos      neg   
3  hey , i've got a great idea for a movie ! \nok...        pos      neg   
4  eddie murphy has a lot riding on harlem nights...    neutral      neg   
5  mike myers , you certainly did throw us a ? fr...        pos      pos   
6  all those who were offended by there's somethi...    neutral      pos   
7  in the james bond film " diamonds are forever ...        pos      neg   
8  disillusioned and trying to find the spice of ...        neg      neg   
9  this christmas , little ralphie parker ( peter...        pos      pos   

  true_label  
0        neg  
1        neg  
2        pos  
3        neg  
4        neg  
5        pos  
6        pos  
7        neg  
8       