In [1]:
import pandas as pd

df = pd.read_csv("IMDB_cleaned.csv")

print(df.shape)
df.head()


(5000, 3)


Unnamed: 0,review,cleaned_review,sentiment
0,One of the other reviewers has mentioned that ...,one of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,a wonderful little production the filming tech...,positive
2,I thought this was a wonderful way to spend ti...,i thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,basically there is a family where a little boy...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter mattei s love in the time of money is a...,positive


In [2]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

# Take 3 sample reviews
sample_reviews = df['review'].head(3)

for review in sample_reviews:
    scores = sia.polarity_scores(review)
    print("Review:", review[:100], "...")
    print("VADER Scores:", scores)
    print("-" * 60)


Review: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. The ...
VADER Scores: {'neg': 0.203, 'neu': 0.748, 'pos': 0.048, 'compound': -0.9951}
------------------------------------------------------------
Review: A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-B ...
VADER Scores: {'neg': 0.053, 'neu': 0.776, 'pos': 0.172, 'compound': 0.9641}
------------------------------------------------------------
Review: I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air con ...
VADER Scores: {'neg': 0.094, 'neu': 0.714, 'pos': 0.192, 'compound': 0.9605}
------------------------------------------------------------


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

review_text = df['review'][0]

sentences = sent_tokenize(review_text)
tokens = word_tokenize(review_text)

print("Sample Review:")
print(review_text[:200])

print("\nNumber of Sentences:", len(sentences))
print("Number of Tokens:", len(tokens))


Sample Review:
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me abo

Number of Sentences: 10
Number of Tokens: 380


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')

text = df['cleaned_review'][0]

tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))

filtered_tokens = [w for w in tokens if w.lower() not in stop_words]

print("Before:", tokens[:30])
print("After:", filtered_tokens[:30])


Before: ['one', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', 'oz', 'episode', 'you', 'will', 'be', 'hooked', 'they', 'are', 'right', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'me', 'the', 'first']
After: ['one', 'reviewers', 'mentioned', 'watching', 'oz', 'episode', 'hooked', 'right', 'exactly', 'happened', 'first', 'thing', 'struck', 'oz', 'brutality', 'unflinching', 'scenes', 'violence', 'set', 'right', 'word', 'go', 'trust', 'show', 'faint', 'hearted', 'timid', 'show', 'pulls', 'punches']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

words = filtered_tokens[:10]

for word in words:
    print(word, "→", stemmer.stem(word), "|", lemmatizer.lemmatize(word))


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


one → one | one
reviewers → review | reviewer
mentioned → mention | mentioned
watching → watch | watching
oz → oz | oz
episode → episod | episode
hooked → hook | hooked
right → right | right
exactly → exactli | exactly
happened → happen | happened


In [6]:
import re

text = df['review'][0]

normalized_text = re.sub(r'\s+', ' ', text.strip().lower())

print(normalized_text[:200])


one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.<br /><br />the first thing that struck me abo


In [7]:
text = df['review'][0]

clean_text = re.sub(r'[^a-zA-Z\s]', '', text)

print(clean_text[:200])


One of the other reviewers has mentioned that after watching just  Oz episode youll be hooked They are right as this is exactly what happened with mebr br The first thing that struck me about Oz was i


In [8]:
sample_data = df['cleaned_review'].head(10)


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer(max_features=20)
bow_matrix = bow.fit_transform(sample_data)

print("BoW Vocabulary:")
print(bow.get_feature_names_out())

print("\nBoW Matrix:")
print(bow_matrix.toarray())


BoW Vocabulary:
['all' 'and' 'are' 'as' 'be' 'but' 'for' 'in' 'is' 'it' 'not' 'of' 'one'
 'that' 'the' 'this' 'to' 'with' 'would' 'you']

BoW Matrix:
[[ 1  6  2  4  2  2  5  3 10  6  6  7  1  5 16  3  6  5  2  3]
 [ 2  7  2  0  0  2  0  0  3  3  2  5  1  0 16  0  2  3  0  1]
 [ 0  4  1  0  2  3  0  4  4  2  2  4  1  1  8  5  4  2  1  0]
 [ 3  4  3  2  0  0  2  2  4  0  0  2  0  0  6  1  3  3  0  2]
 [ 2  5  1  1  1  1  1  6  7  0  1  6  6  1 20  2  7  1  0  0]
 [ 2  5  1  1  1  2  1  3  4  3  1  2  1  1  4  1  4  0  1  0]
 [ 0  5  0  0  3  0  3  3  1  2  0  4  0  0  4  3  6  2  6  3]
 [ 0  2  1  2  1  1  2  1  7  8  3  4  1  5 15  4  3  1  1  0]
 [ 0  3  0  0  1  0  1  2  6  1  0  4  2  1  7  5  3  0  0  0]
 [ 0  0  1  0  0  0  0  0  0  1  0  0  0  0  0  2  0  0  0  4]]


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=20)
tfidf_matrix = tfidf.fit_transform(sample_data)

print("TF-IDF Vocabulary:")
print(tfidf.get_feature_names_out())

print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())


TF-IDF Vocabulary:
['all' 'and' 'are' 'as' 'be' 'but' 'for' 'in' 'is' 'it' 'not' 'of' 'one'
 'that' 'the' 'this' 'to' 'with' 'would' 'you']

TF-IDF Matrix:
[[0.0517374  0.21169505 0.07735283 0.20694959 0.08494097 0.09354369
  0.21235242 0.11602925 0.35282508 0.2320585  0.28063106 0.24697755
  0.04247048 0.23385922 0.56452012 0.10584752 0.21169505 0.21235242
  0.1034748  0.15521219]
 [0.14702016 0.35091326 0.10990527 0.         0.         0.13290973
  0.         0.         0.1503914  0.16485791 0.13290973 0.25065233
  0.06034336 0.         0.80208746 0.         0.10026093 0.18103009
  0.         0.07351008]
 [0.         0.27089697 0.0742388  0.         0.16304291 0.26933355
  0.         0.2969552  0.27089697 0.1484776  0.1795557  0.27089697
  0.08152146 0.08977785 0.54179395 0.33862122 0.27089697 0.16304291
  0.09930916 0.        ]
 [0.3556342  0.32336805 0.2658552  0.23708946 0.         0.
  0.19462332 0.1772368  0.32336805 0.         0.         0.16168403
  0.         0.         0.485