In [1]:
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk import classify
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize

In [2]:
# List the category labels the corpus provides; these become the class labels.
movie_reviews.categories()

['neg', 'pos']

In [3]:
# Peek at the first ten file ids only (never dump the whole list).
# In the sample shown, each id is prefixed with its category directory.
movie_reviews.fileids()[:10]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt']

In [26]:
def extract_features(words):
    """Build a bag-of-words feature dict mapping each token to its count.

    Tokens are lowercased before counting so that text whose casing differs
    from the training text (e.g. a capitalised sentence-initial "This")
    yields the same feature names instead of unseen ones. Input that is
    already lowercase produces exactly the same dict as before.

    Args:
        words: Iterable of string tokens. May be empty.

    Returns:
        dict: lowercased token -> number of occurrences in ``words``
        (an empty dict for empty input).
    """
    # NOTE(review): the movie_reviews text appears to be all lowercase, which
    # is why un-normalised user input would miss the vocabulary - confirm.
    return dict(FreqDist(word.lower() for word in words))

In [27]:
# One (token list, label) pair per review file, grouped by category in the
# order the corpus reports them.
documents = [
    (list(movie_reviews.words(review_id)), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]

In [28]:
import random

# Seed the global RNG so the shuffle - and with it the train/test split and
# the reported accuracy - is identical on every Restart & Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# In-place shuffle. Re-running only this cell reorders the already-shuffled
# list; rebuild `documents` first if the original order is needed.
random.shuffle(documents)

In [29]:
# Despite its name, `split_ratio` is an index: the position of the first
# test document after keeping the leading 80% for training.
split_ratio = int(len(documents) * 0.8)
train_set = documents[:split_ratio]
test_set = documents[split_ratio:]

In [30]:
# Turn each (tokens, label) document into a (feature dict, label) pair.
training_features = [
    (extract_features(tokens), label)
    for tokens, label in train_set
]
testing_features = [
    (extract_features(tokens), label)
    for tokens, label in test_set
]

In [31]:
# Fit NLTK's Naive Bayes classifier on the (feature dict, label) training pairs.
classifier = NaiveBayesClassifier.train(training_features)

In [32]:
# Score the trained classifier on the held-out test features and report it.
accuracy = classify.accuracy(classifier, testing_features)
print("Accuracy:", accuracy)

Accuracy: 0.78


In [33]:
# Hand-written sentences used to spot-check the trained classifier.
new_sentences = [
    "This movie was fantastic!",
    "I didn't like the plot of this film.",
    "The acting was superb in this movie.",
    "The screenplay was terrible.",
]

In [34]:
# Classify each ad-hoc sentence with the trained model and print the label.
for sentence in new_sentences:
    # Tokenise the way the training data was tokenised. Per the NLTK docs,
    # plaintext corpus readers default to WordPunctTokenizer, so
    # wordpunct_tokenize yields tokens that line up with the training
    # vocabulary. word_tokenize splits contractions and punctuation
    # differently (e.g. "didn't" -> "did", "n't"), producing features the
    # classifier never saw.
    words = wordpunct_tokenize(sentence)
    features = extract_features(words)
    category = classifier.classify(features)
    print(f"Predicted category for '{sentence}': {category}")

Predicted category for 'This movie was fantastic!': pos
Predicted category for 'I didn't like the plot of this film.': neg
Predicted category for 'The acting was superb in this movie.': pos
Predicted category for 'The screenplay was terrible.': neg
