In [None]:
from nltk.corpus import movie_reviews
import random

# Download dataset
import nltk
nltk.download('movie_reviews')

# Load and label
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [None]:
import pandas as pd
df = pd.DataFrame(documents, columns=['review', 'label'])

# Show the shape
print("Shape of dataset:", df.shape)  # (rows, columns)

# Optional: show first few rows
df.head()

Shape of dataset: (2000, 2)


Unnamed: 0,review,label
0,"[i, am, continually, amazed, at, movies, like,...",neg
1,"[john, von, neumann, ,, progenitor, of, the, c...",pos
2,"["", varsity, blues, "", is, the, best, film, of...",neg
3,"[recently, one, night, a, young, director, nam...",neg
4,"[ingredients, :, pouring, rain, ,, small, floo...",pos


In [None]:
num_documents = len(movie_reviews.fileids())
print("Number of documents:", num_documents)

Number of documents: 2000


In [None]:
num_words = len(movie_reviews.words())
print("Total number of words:", num_words)

Total number of words: 1583820


VADER

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

analyzer = SentimentIntensityAnalyzer()
text = "This movie was amazing and heart-touching!"
score = analyzer.polarity_scores(text)
print(score)

print("=== VADER Sentiment ===")


{'neg': 0.0, 'neu': 0.55, 'pos': 0.45, 'compound': 0.6239}
=== VADER Sentiment ===


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


BERT

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer.tokenize("This movie is so boring and slow.")
print(tokens)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

['this', 'movie', 'is', 'so', 'boring', 'and', 'slow', '.']


In [None]:
from transformers import pipeline
classifier = pipeline('sentiment-analysis')
classifier("This movie is fantastic!")



No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998828172683716}]

LOGISTIC REGRESSION

In [None]:
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

docs = [(movie_reviews.raw(fileid), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]

texts, labels = zip(*docs)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)
y = [1 if label == 'pos' else 0 for label in labels]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.85


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.82      0.88      0.85       193
           1       0.88      0.82      0.85       207

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400



In [None]:
for i in range(5):
    print(texts[i])
    print("Actual:", y_test[i], "Predicted:", y_pred[i])

plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . 
what's the deal ? 
watch the movie and " sorta " find out . . . 
critique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . 
which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn't snag this one correctly . 
they seem to have taken this pretty neat concept , but executed it terribly . 
so what are the problems with the movie ? 
well , its main problem is that it's simply too jumbled . 
it starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience member , have no id