Dataset: IMDB Dataset of 50K Movie Reviews (Kaggle) — https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [31]:
import pandas as pd

In [32]:
# Load the reviews CSV; the relative path is resolved against the current working directory.
df = pd.read_csv("IMDB Dataset.csv")
# Show (rows, columns) as a quick sanity check on the load.
df.shape

(50000, 2)

In [33]:
# Count how many reviews carry each sentiment label to check the class balance.
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [34]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [35]:
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads later in this notebook.
nltk.download('stopwords') 

[nltk_data] Downloading package stopwords to C:\Users\Sina's
[nltk_data]     Pc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [37]:
# English stop words held in a set, which preprocess_text uses for membership tests.
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance shared by every preprocess_text call.
stemmer = PorterStemmer()

In [38]:
# Number of distinct stop words that will be filtered out.
len(stop_words)

198

In [39]:
def preprocess_text(text):
    """Normalize a raw review into a space-joined string of stemmed tokens.

    Steps, in order:
      1. replace anything that looks like an HTML tag (``<...>``) with a space;
      2. replace every character that is not an ASCII letter with a space;
      3. lowercase and split on whitespace;
      4. drop tokens present in the module-level ``stop_words`` set;
      5. stem the remaining tokens with the module-level ``stemmer``.

    Args:
        text: the raw review string.

    Returns:
        The surviving stems joined by single spaces (an empty string when
        no token survives).
    """
    without_tags = re.sub('<.*?>', ' ', text)
    letters_only = re.sub('[^a-zA-Z]', ' ', without_tags)
    tokens = letters_only.lower().split()
    # Filter before stemming so stop words are matched in their unstemmed form.
    stems = (stemmer.stem(token) for token in tokens if token not in stop_words)
    return ' '.join(stems)

In [40]:
# Run every raw review through preprocess_text, keeping the original row order.
cleaned = [preprocess_text(raw_review) for raw_review in df['review']]

In [41]:
# Attach the cleaned text as a new column next to the raw review.
df['clean_review'] = cleaned

In [42]:
from sklearn.model_selection import train_test_split

In [43]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn a bag-of-words vocabulary from the training reviews only, capped at 20000 features.
vectorizer_cv = CountVectorizer(max_features=20000).fit(X_train)

# Preview 50 entries of the learned vocabulary and report its total size.
words = list(vectorizer_cv.vocabulary_)
print(f"۵۰ کلمه اول: {words[:50]}")
print(len(vectorizer_cv.vocabulary_))

۵۰ کلمه اول: ['kept', 'ask', 'mani', 'fight', 'scream', 'match', 'swear', 'gener', 'mayhem', 'permeat', 'minut', 'comparison', 'also', 'stand', 'think', 'one', 'dimension', 'charact', 'littl', 'depth', 'virtual', 'imposs', 'care', 'happen', 'badli', 'written', 'cypher', 'director', 'hang', 'belief', 'topic', 'done', 'much', 'better', 'drama', 'tv', 'cinema', 'must', 'confess', 'realli', 'spot', 'bad', 'perform', 'film', 'said', 'nichola', 'heroin', 'slutti', 'best', 'friend']
20000


In [56]:
# Encode both splits as count matrices with the already-fitted count vectorizer.
X_train_cv = vectorizer_cv.transform(X_train)
X_test_cv = vectorizer_cv.transform(X_test)

In [57]:
# Display the matrix summary (dtype, stored elements, shape) of the count features.
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 3635036 stored elements and shape (40000, 20000)>

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary and weights from the training reviews only,
# capped at 20000 features.
vectorizer_tfidf = TfidfVectorizer(max_features=20000)

vectorizer_tfidf.fit(X_train)

# Read the vocabulary from vectorizer_tfidf (not vectorizer_cv) so this preview
# and the size check describe the vectorizer fitted in this cell.
words = list(vectorizer_tfidf.vocabulary_.keys())
print(f"۵۰ کلمه اول: {words[:50]}")
print(len(vectorizer_tfidf.vocabulary_))

۵۰ کلمه اول: ['kept', 'ask', 'mani', 'fight', 'scream', 'match', 'swear', 'gener', 'mayhem', 'permeat', 'minut', 'comparison', 'also', 'stand', 'think', 'one', 'dimension', 'charact', 'littl', 'depth', 'virtual', 'imposs', 'care', 'happen', 'badli', 'written', 'cypher', 'director', 'hang', 'belief', 'topic', 'done', 'much', 'better', 'drama', 'tv', 'cinema', 'must', 'confess', 'realli', 'spot', 'bad', 'perform', 'film', 'said', 'nichola', 'heroin', 'slutti', 'best', 'friend']
20000


In [59]:
# Encode both splits as TF-IDF matrices with the already-fitted TF-IDF vectorizer.
X_train_tfidf = vectorizer_tfidf.transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

In [60]:
# Display the matrix summary (dtype, stored elements, shape) of the TF-IDF features.
X_train_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3635036 stored elements and shape (40000, 20000)>

Multinomial Naive Bayes

In [89]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt

In [None]:
# Fit Multinomial Naive Bayes on the count features, then label the test split.
nb_cv = MultinomialNB().fit(X_train_cv, y_train)
y_pred_nb_cv = nb_cv.predict(X_test_cv)

In [78]:
# Score the Naive Bayes / count-feature predictions against the held-out labels.
cm_nb_cv = confusion_matrix(y_test, y_pred_nb_cv)
acc_nb_cv = accuracy_score(y_test, y_pred_nb_cv)

print("Accuracy:", acc_nb_cv)
print(cm_nb_cv)

Accuracy: 0.854
[[4312  649]
 [ 811 4228]]


In [68]:
# Fit Multinomial Naive Bayes on the TF-IDF features, then label the test split.
nb_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb_tfidf = nb_tfidf.predict(X_test_tfidf)

In [82]:
# Score the Naive Bayes / TF-IDF predictions against the held-out labels.
cm_nb_tf = confusion_matrix(y_test, y_pred_nb_tfidf)
acc_nb_tf = accuracy_score(y_test, y_pred_nb_tfidf)

print("Accuracy:", acc_nb_tf)
print(cm_nb_tf)


Accuracy: 0.8588
[[4279  682]
 [ 730 4309]]


 Logistic Regression

In [83]:
from sklearn.linear_model import LogisticRegression

In [84]:
# Fit logistic regression on the count features (iteration cap raised to 1000),
# then label the test split.
lr_cv = LogisticRegression(max_iter=1000).fit(X_train_cv, y_train)
y_pred_lr_cv = lr_cv.predict(X_test_cv)

In [85]:
# Score the logistic-regression / count-feature predictions against the held-out labels.
cm_lr_cv = confusion_matrix(y_test, y_pred_lr_cv)
acc_lr_cv = accuracy_score(y_test, y_pred_lr_cv)

print("Accuracy:", acc_lr_cv)
print("Confusion Matrix:\n", cm_lr_cv)


Accuracy: 0.8785
Confusion Matrix:
 [[4302  659]
 [ 556 4483]]


In [86]:
# Fit logistic regression on the TF-IDF features (iteration cap raised to 1000),
# then label the test split.
lr_tfidf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_lr_tfidf = lr_tfidf.predict(X_test_tfidf)

In [88]:
# Score the logistic-regression / TF-IDF predictions against the held-out labels.
cm_lr_tfidf = confusion_matrix(y_test, y_pred_lr_tfidf)
acc_lr_tfidf = accuracy_score(y_test, y_pred_lr_tfidf)

print(" Accuracy:", acc_lr_tfidf)
print("Confusion Matrix:\n", cm_lr_tfidf)



 Accuracy: 0.8916
Confusion Matrix:
 [[4346  615]
 [ 469 4570]]


Predict a New Review

In [120]:
# Raw text of the review to classify; change this string to try a different review.
new_review = "Notbad"


In [121]:
# Clean the raw review and wrap it in a one-element list, because the
# vectorizers' transform() expects an iterable of documents.
# The isinstance guard keeps this cell safe to re-run: after the first run
# new_review is already a list, and passing a list to preprocess_text would
# raise a TypeError inside re.sub.
if isinstance(new_review, str):
    new_review = [preprocess_text(new_review)]

In [122]:
# Encode the cleaned review with each fitted vectorizer so every model gets the
# feature space it was trained on.
new_review_vectorized_cv = vectorizer_cv.transform(new_review)
new_review_vectorized_tf = vectorizer_tfidf.transform(new_review)

In [123]:
# Ask each of the four fitted models for a label on the new review.
prediction_nbcv = nb_cv.predict(new_review_vectorized_cv)
prediction_lgcv = lr_cv.predict(new_review_vectorized_cv)
prediction_nbtf = nb_tfidf.predict(new_review_vectorized_tf)
prediction_lgtf = lr_tfidf.predict(new_review_vectorized_tf)

# Print one "<tag> <label>" line per model, in the same order as above.
# predict() returns an array, so [0] picks the label of the single review.
for model_tag, model_prediction in (
    ("nbcv:", prediction_nbcv),
    ("lgcv:", prediction_lgcv),
    ("nbtf:", prediction_nbtf),
    ("lgtf:", prediction_lgtf),
):
    print(model_tag, model_prediction[0])


nbcv: negative
lgcv: positive
nbtf: negative
lgtf: negative
