In [3]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
# Read the IMDB reviews CSV (expected in the notebook's working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='IMDB Dataset.csv')
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [13]:
# Fetch the corpora used by the cleaning step. quiet=True keeps the
# per-machine download log (which includes the local nltk_data path) out of
# the saved notebook output.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# English stop words held in a set so membership checks during token
# filtering are constant-time.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sathwik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sathwik\AppData\Roaming\nltk_data...


In [14]:
def preprocess_text(text):
    """Normalize a raw review into a space-separated string of lemmatized tokens.

    Steps: lowercase, strip HTML tags, replace every non-letter character
    with a space, drop stop words, lemmatize each remaining token.

    Relies on the notebook-level ``stop_words`` collection and ``lemmatizer``
    object being defined before the function is called.

    Args:
        text: Raw review text. ``None`` and float NaN (how pandas represents
            a missing cell) are treated as an empty review; any other
            non-string value is converted with ``str()``.

    Returns:
        str: Cleaned tokens joined by single spaces ("" when nothing remains).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Replace tags with a space rather than '' so that words separated only
    # by markup ("great<br />movie") are not fused into a single token.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # NOTE(review): if stop_words is NLTK's English list it contains negations
    # such as "not"/"no" - consider keeping those for sentiment analysis.
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(lemmatizer.lemmatize(word) for word in tokens)

In [15]:
# Run the cleaning function over every raw review and store the result in a
# new column next to the original text, then preview the frame.
df['clean_review'] = df['review'].map(preprocess_text)
df.head(n=5)

Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode hoo...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...


In [16]:
from sklearn.model_selection import train_test_split

# Features are the cleaned review text; labels are the sentiment column.
X, y = df['clean_review'], df['sentiment']

# Hold out 20% of the rows for evaluation, stratified on the label, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features, limited to 5000 features.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)

# Fit on the training split only; the test split is transformed with the
# already-fitted vectorizer.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

In [19]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier on the TF-IDF features
# (fit() returns the estimator itself, so construction and training chain).
clf = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Predict on the held-out split and report accuracy.
y_pred = clf.predict(X_test_vec)
print(f"Accuracy :{accuracy_score(y_test,y_pred)}")

Accuracy :0.8912


In [20]:
# Score one hand-written review with the fitted vectorizer and classifier.
new_rev = "I love this movie, it was amazing!"

# The review goes through the same cleaning function as the training text
# before it is vectorized.
clean_text = preprocess_text(new_rev)
new_vec = tfidf.transform([clean_text])  # single document wrapped in a list

# predict() returns one label per input row; exactly one row was passed.
(prediction1,) = clf.predict(new_vec)
print("Predicted Sentiment: ", prediction1)

Predicted Sentiment:  positive


In [21]:
# Second spot check: clean, vectorize and classify another hand-written review.
new_rev1 = "I hate this movie, it was boring!"
clean_text1 = preprocess_text(new_rev1)

# Only transform (no fitting) so the review is encoded with the existing
# vectorizer state.
new_vec1 = tfidf.transform([clean_text1])

# One input row in, one predicted label out.
(prediction2,) = clf.predict(new_vec1)
print("Predicted Sentiment: ", prediction2)

Predicted Sentiment:  negative
