In [1]:
#Importing all the libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Loading the dataset for movie review
# Dataset consists of 20 reviews
data = pd.read_csv('Movie_Review.csv')

In [3]:
# Convert 'positive' sentiment labels to 1 and 'negative' sentiment labels to 0
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})


In [4]:
# Splitting the dataset into training and testing
train_data, test_data, train_labels, test_labels = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=42)


In [5]:
# Vectorize the text data

my_stop_words = ['the', 'a', 'an', 'and', 'of', 'to','this','that','too']
vectorizer = CountVectorizer(stop_words=my_stop_words)
train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)

In [6]:
# Training the model
clf = MultinomialNB()
clf.fit(train_vectors, train_labels)

In [8]:
# Looping through the reviews in the file and predicting the sentiment
#Printing the first 100 words of the review
#Printing the Top Words - according to the sentiment
for i in range(len(data)):
    review = data['review'][i]
    vector = vectorizer.transform([review])
    prediction = clf.predict(vector)
    probas = clf.predict_proba(vector)
    feature_names = np.array(vectorizer.get_feature_names_out())
    topn = 10  # Change this value to print more or fewer top features
    if prediction == 1:
        print("Positive review: ", review[:100])
        log_prob_positive = clf.feature_log_prob_[1]
        coefs_with_fns = sorted(list(zip(log_prob_positive, feature_names)), reverse=True)
        top_positive = [x for x in coefs_with_fns if x[1] not in my_stop_words and x[1].isalpha() and len(x[1]) >= 4]
        top_positive_review = sorted(list(zip(vector.toarray()[0], feature_names)), reverse=True)[:topn]
        top_positive_review_words = [x[1] for x in top_positive_review if x[1] not in my_stop_words and x[1].isalpha() and len(x[1]) > 4]
        print("Top positive words:", top_positive_review_words)
        
    else:
        print("Negative review: ", review[:100])
        log_prob_negative = clf.feature_log_prob_[0]
        coefs_with_fns = sorted(list(zip(log_prob_negative, feature_names)))
        top_negative_review = sorted(list(zip(vector.toarray()[0], feature_names)), reverse=True)[:topn]
        top_negative_review_words = [x[1] for x in top_negative_review if x[1].isalpha() and len(x[1]) > 3 and not any(c.isdigit() for c in x[1])]
        print("Top negative words:", top_negative_review_words)
    print("="*80)


Negative review:  The box office success of Rani Mukerji's Mrs. Chatterjee Vs Norway is a testimony to the fact that v
Top negative words: ['watch', 'want', 'these', 'still', 'quality', 'movies']
Positive review:  This is quite possibly one of the worst movies ever made. Everything about it--acting, directing, sc
Top positive words: ['would', 'worst', 'script', 'quite']
Positive review:  It's been a year since the release of @ssrajamouli's Action Epic RRR, the movie is still making the 
Top positive words: ['still', 'ssrajamouli', 'since', 'sensation', 'rrrmovie', 'release', 'movie', 'making']
Positive review:  I loved Pathaan movie. One of best movies of SRK after his romance era. Action and plotting everythi
Top positive words: ['romance', 'plotting', 'pathaan', 'movies', 'movie', 'loved']
Negative review:  I watch a lot of movies and I like to give them all a chance just in case there is something interes
Top negative words: ['just', 'would', 'with', 'wish', 'watch', 'warrant']
Nega