In [1]:
import pandas as pd
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the review/sentiment table from a CSV given by relative path
# (resolved against the kernel's current working directory).
csv_path = "IMDB Dataset sentiment.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows as a quick check of the loaded columns.
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Encode the text labels as integers (positive -> 1, negative -> 0) and shuffle
# the rows once with a fixed seed so the ordering is reproducible.
#
# The whole step is guarded so that this cell can be re-run safely: pushing an
# already-encoded column through the label dictionary would turn every value
# into NaN, and sampling again would reorder the rows a second time.
SENTIMENT_TO_INT = {'positive': 1, 'negative': 0}

if not pd.api.types.is_numeric_dtype(df['sentiment']):
    encoded = df['sentiment'].map(SENTIMENT_TO_INT)

    # Series.map leaves NaN for any value missing from the dictionary; fail
    # loudly here instead of handing NaN targets to the classifier later.
    unmapped = df.loc[encoded.isna(), 'sentiment'].unique().tolist()
    if unmapped:
        raise ValueError(f"Unexpected sentiment labels: {unmapped!r}")

    # Build a new frame rather than mutating the column in place, then shuffle
    # and renumber the index from 0.
    df = (
        df.assign(sentiment=encoded.astype('int64'))
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

print(df)

                                                  review  sentiment
0      I really liked this Summerslam due to the look...          1
1      Not many television shows appeal to quite as m...          1
2      The film quickly gets to a major chase scene w...          0
3      Jane Austen would definitely approve of this o...          1
4      Expectations were somewhat high for me when I ...          0
...                                                  ...        ...
49995  `Shadow Magic' recaptures the joy and amazemen...          1
49996  I found this movie to be quite enjoyable and f...          1
49997  Avoid this one! It is a terrible movie. So wha...          0
49998  This production was quite a surprise for me. I...          1
49999  This is a decent movie. Although little bit sh...          1

[50000 rows x 2 columns]


In [4]:
# Hold out 30% of the reviews for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.3,
    random_state=42,
)

# Bag-of-words counts with English stop words removed. The vocabulary is
# learned from the training reviews only and reused as-is for the test reviews.
vectorizer = CountVectorizer(stop_words='english')
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Fit multinomial Naive Bayes on the training counts (fit returns the estimator).
clf = MultinomialNB().fit(X_train_counts, y_train)

# Evaluate on the held-out reviews.
y_pred = clf.predict(X_test_counts)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))


Accuracy: 0.8546
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      7464
           1       0.87      0.83      0.85      7536

    accuracy                           0.85     15000
   macro avg       0.86      0.85      0.85     15000
weighted avg       0.86      0.85      0.85     15000



In [5]:
def classify_review(review):
    """Return the predicted sentiment of a single review as a string.

    The review is wrapped in a one-element list, vectorized with the
    module-level ``vectorizer`` and scored by the module-level ``clf``.

    Parameters
    ----------
    review
        The review text to classify.

    Returns
    -------
    str
        ``'positive'`` when the predicted class equals 1, otherwise
        ``'negative'``.
    """
    features = vectorizer.transform([review])
    predicted_class = clf.predict(features)[0]
    if predicted_class == 1:
        return 'positive'
    return 'negative'
# Two hand-written reviews to spot-check the trained classifier.
review = "the movie was phenomenal"
review2= "the movie was a torture "

# One loop instead of two copy-pasted print statements; the printed text is unchanged.
for heading, text in (("Review", review), ("Review2", review2)):
    print(f'{heading}: {text}\nSentiment: {classify_review(text)}')

Review: the movie was phenomenal
Sentiment: positive
Review2: the movie was a torture 
Sentiment: negative
