In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

### Problem
Binary sentiment classification of movie reviews.

### Approach
TF-IDF vectorization + Multinomial Naive Bayes.

### Key Limitation
Unigram TF-IDF features ignore word order, so negation (e.g. "not amazing") is not handled well; this can be improved with n-grams or contextual models.


In [4]:
# Kaggle input path of the IMDB 50k movie-review CSV.
DATASET_PATH = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"

# Read the reviews and their sentiment labels into a DataFrame.
data = pd.read_csv(DATASET_PATH)

In [6]:
# Preview the first 10 rows to check the columns and a few sample values.
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [9]:
# Dataset understanding: count the reviews per sentiment label to check class balance.

data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [11]:
# Now, as Machine can't understand text,,......we need to convert to numbers 

vectorizer = TfidfVectorizer (
    stop_words='english',
    max_features=5000
)

X= vectorizer.fit_transform(data['review'])
y= data['sentiment']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,   
    random_state= 42
)

In [15]:
# Train a Multinomial Naive Bayes classifier on the TF-IDF training features.
model = MultinomialNB() 
model.fit(X_train, y_train)

In [16]:
# Predict a sentiment label for every review in the test split.
y_pred = model.predict(X_test) 

In [17]:
# Accuracy of the predictions against the true labels of the test split.
accuracy_score(y_test, y_pred)

0.8511

In [22]:
# Rows are the true labels and columns the predicted labels; passing `labels`
# pins the order of both to negative first, then positive, so the matrix can
# be read without relying on the implicit default ordering.
confusion_matrix(y_test, y_pred, labels=['negative', 'positive'])

array([[4204,  757],
       [ 732, 4307]])

In [28]:
# Try the trained model on a hand-written review.

def predict_sentiment(text: str) -> str:
    """Predict the sentiment label of a single review.

    The text is converted with the already-fitted ``vectorizer`` and
    classified by the trained ``model`` (both defined in earlier cells).

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The predicted label as a plain Python string (for example
        ``'positive'`` or ``'negative'``).
    """
    # transform() expects an iterable of documents, so wrap the single review
    # in a list.
    vec = vectorizer.transform([text])
    # predict() returns a NumPy array; cast its only element to a built-in str
    # so callers get 'positive' rather than np.str_('positive').
    return str(model.predict(vec)[0])


In [34]:
# Sanity check with a clearly positive sentence.
predict_sentiment("the movie was absolutely amazing")

np.str_('positive')

In [33]:
# Negation check: the same sentence with "not" inserted. In the recorded run
# it is still labelled 'positive', which illustrates the negation limitation.
predict_sentiment("the movie was not absolutely amazing")

np.str_('positive')

In [35]:
# Sanity check with a clearly negative sentence.
predict_sentiment("the movie is bad")

np.str_('negative')