In [2]:
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import pandas as pd


In [3]:
from transformers import pipeline

In [4]:
# load in hugginface imdb data
from datasets import load_dataset
dataset = load_dataset("imdb")

train_data = dataset['train']
test_data = dataset['test']


In [5]:
# print examples from train
print(train_data[0])  
print(train_data[1])

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [6]:
# print examples from test 
print(test_data[0])  
print(test_data[1])

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

### Baseline Model #1: Bag of Words (BOW)

In [6]:
train_text = dataset['train']['text']
train_label = dataset['train']['label']
test_text = dataset['train']['text']
test_label = dataset['train']['label']

In [7]:
# convert text to BoW vector
vectorizer = CountVectorizer(stop_words='english', max_features=2500)
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

In [8]:
# split data
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, train_label, test_size=0.2, random_state=42
)

# start with logistic regression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_split, y_train_split)


In [9]:
# predict on the val
val_predictions = model.predict(X_val_split)

# check performance 
val_accuracy = accuracy_score(y_val_split, val_predictions)
print("Validation Accuracy:", val_accuracy)
print(classification_report(y_val_split, val_predictions))


Validation Accuracy: 0.8592
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      2515
           1       0.86      0.86      0.86      2485

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000



Starting accuracy is at 85.88% which is good but can be improved if we are able to get more context. BoW ignores the order of words so "not bad" and "bad not" could be treated the same. It might not be able to handle synonyms and can't generalize well. 

In [10]:
# feature names 
feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_[0]  

important_features = sorted(zip(coefficients, feature_names), key=lambda x: x[0], reverse=True)

print("Top Positive Words:", important_features[:20])
print("Top Negative Words:", important_features[-20:])


Top Positive Words: [(1.8126817354520277, 'refreshing'), (1.734332590052363, 'wonderfully'), (1.6008805474702326, 'freedom'), (1.6001857913068684, 'funniest'), (1.5874899367504318, 'appreciated'), (1.5535899521847263, 'rare'), (1.5386611101257917, 'answers'), (1.5358963769266767, 'finest'), (1.4969918905337112, 'steals'), (1.4933707549826836, 'captures'), (1.4125456662090632, 'incredible'), (1.4001116269086566, 'impressed'), (1.2776312011392739, 'surprisingly'), (1.260728088579651, 'victoria'), (1.2314536566963015, 'delightful'), (1.2263214929892707, 'sensitive'), (1.1757558768837, 'legendary'), (1.1393467912788544, 'discovers'), (1.137077899674682, 'wonderful'), (1.1148123041040041, 'sympathetic')]
Top Negative Words: [(-1.3101022411132965, 'disappointing'), (-1.3270037965045736, 'paper'), (-1.3501438598040991, 'badly'), (-1.3561810573377455, 'wooden'), (-1.3650885313775127, 'laughable'), (-1.4594345890384546, 'lacks'), (-1.4778186394756445, 'inept'), (-1.486684393454093, 'alright'), 

In [11]:
# dataframe with top positive words
postive_words = pd.DataFrame(important_features[:20])
postive_words

Unnamed: 0,0,1
0,1.812682,refreshing
1,1.734333,wonderfully
2,1.600881,freedom
3,1.600186,funniest
4,1.58749,appreciated
5,1.55359,rare
6,1.538661,answers
7,1.535896,finest
8,1.496992,steals
9,1.493371,captures


In [12]:
# dataframe with top negative words
negative_words = pd.DataFrame(important_features[-20:])
negative_words

Unnamed: 0,0,1
0,-1.310102,disappointing
1,-1.327004,paper
2,-1.350144,badly
3,-1.356181,wooden
4,-1.365089,laughable
5,-1.459435,lacks
6,-1.477819,inept
7,-1.486684,alright
8,-1.513043,awful
9,-1.55532,lousy


### Baseline Model #2: TF-IDF

In [13]:
# TFIDF 
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)


X_train_tfidf = tfidf_vectorizer.fit_transform(train_text)
X_test_tfidf = tfidf_vectorizer.transform(test_text)


In [14]:
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_tfidf, train_label, test_size=0.2, random_state=42
)

tfidf_model = LogisticRegression(max_iter=1000, random_state=42)
tfidf_model.fit(X_train_split, y_train_split)


In [15]:
val_predictions_tfidf = tfidf_model.predict(X_val_split)

val_accuracy_tfidf = accuracy_score(y_val_split, val_predictions_tfidf)
print("Validation Accuracy (TF-IDF):", val_accuracy_tfidf)
print(classification_report(y_val_split, val_predictions_tfidf))


Validation Accuracy (TF-IDF): 0.8836
              precision    recall  f1-score   support

           0       0.90      0.87      0.88      2515
           1       0.87      0.90      0.88      2485

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



Accuracy improved a bit with 88.36%. TF-IDF still ignores word order but it is a bit better than BoW because it gives weight to words. TF-IDF makes sure that repetitive words do not get all of the attention. 

 We can find the most commonly occuring words and give them importance. We are able to find which words are more important. If we see the word "great" in a review but we see it multiple times it won't have as much weight as it would have in the BoW.

### Baseline Model #3: Prompt Engineering

In [24]:
from transformers import pipeline

sentiment_pipeline = pipeline("sentiment-analysis")

def prompt_engineered_sentiment(text):
    prompt = f"Classify the sentiment of the following review: '{text}'. Respond with Positive or Negative."
    result = sentiment_pipeline(prompt)
    return result

sample_review = "Alive"
result = prompt_engineered_sentiment(sample_review)
print(result)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'POSITIVE', 'score': 0.9976444840431213}]
