In [1]:
import nltk

In [2]:
import random

In [3]:
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

In [4]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


True

In [5]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [6]:
random.shuffle(documents)

In [7]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

In [8]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features[word] = (word in document_words)
    return features

In [9]:
featuresets = [(document_features(d), c) for (d, c) in documents]

In [10]:
# Split into training and testing
train_set = featuresets[:1600]
test_set = featuresets[1600:]

In [11]:
classifier = NaiveBayesClassifier.train(train_set)

In [12]:
# Evaluate
print("Accuracy:", accuracy(classifier, test_set))


Accuracy: 0.8475


In [14]:

# Test custom review
review = "This movie was boring  and I don't like that"
review_words = review.lower().split()
print("Prediction:", classifier.classify(document_features(review_words)))

Prediction: neg


In [18]:
import nltk
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Download dataset (only first time)
nltk.download('movie_reviews')

# Load dataset
texts = []
labels = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        texts.append(movie_reviews.raw(fileid))
        labels.append(category)

# Convert text into TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(texts)

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X, labels)

# ----------------------------------
# Function to analyze new review
# ----------------------------------

def analyze_review(review):
    review_vector = vectorizer.transform([review])
    prediction = model.predict(review_vector)[0]
    probability = model.predict_proba(review_vector).max()
    
    if prediction == "pos":
        print("Review Sentiment: POSITIVE ðŸ˜Š")
    else:
        print("Review Sentiment: NEGATIVE ðŸ˜ž")
        
    print("Confidence Score:", round(probability * 100, 2), "%")

# ----------------------------------
# Give your own review here
# ----------------------------------

my_review = input("Enter your movie review: ")

analyze_review(my_review)


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Enter your movie review:  it is nice moive


Review Sentiment: POSITIVE ðŸ˜Š
Confidence Score: 55.53 %
