In [1]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [None]:
import nltk
nltk.download() # Dowload 'popular' packages

In [11]:
import random
import nltk
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents so we don't have all positive/negative reviews in a sequence
random.shuffle(documents)

print('Number of Documents: {}'.format(len(documents)))

# Print specific review
#print('First Review: {}'.format(documents[1]))

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

# Prints to analyse dataset
#print('Most common words: {}'.format(all_words.most_common(15)))
#print('The word happy: {}'.format(all_words["happy"]))

Number of Documents: 2000


In [4]:
# We'll use the 4000 most common words as features
print(len(all_words)) # of unique words
word_features = list(all_words.keys())[:4000]

39768


In [None]:
# Define a function to extract features from a document
# The function checks whether each of the 4000 feature words is present in the document
def find_features(document):
    
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words) # Boolean: True if word is present, False otherwise

    return features


# Test the feature extraction on a specific negative review
features = find_features(movie_reviews.words('neg/cv000_29416.txt'))
# Print only the feature words that are present in the document
for key, value in features.items():
    if value == True:
        print(key)

In [6]:
# Create a dataset of features for all documents (Feature Extraction)
# Each entry is a tuple: (feature dictionary, category)
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [7]:
# Sklearn import for train/test split
from sklearn import model_selection

# Define a seed for reproducibility
seed = 1

# Split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [9]:
# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# Train the model on the training data
model.train(training)

# Test on the testing dataset
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {} %".format(accuracy))

SVC Accuracy: 78.4 %


In [15]:
# Predict sentiment for a custom review (for testing)
def predict_sentiment(review):
    words = review.lower().split()
    features = find_features(words)
    return model.classify(features)

# Test with a custom review
custom_review = "The movie was fantastic, with a thrilling plot and excellent acting."
sentiment = predict_sentiment(custom_review)
print(f"Custom Review Sentiment: {sentiment}")

custom_review_2 = "The movie was a complete waste of time, very boring and poorly made."
sentiment_2 = predict_sentiment(custom_review_2)
print(f"Custom Review Sentiment: {sentiment_2}")

custom_review_3_mixed = "Wow I really liked the acting and the actors but the plot was horrible."
sentiment_3 = predict_sentiment(custom_review_3_mixed)
print(f"Custom Review Sentiment Mixed: {sentiment_3}")

Custom Review Sentiment: pos
Custom Review Sentiment: neg
Custom Review Sentiment Mixed: neg
