In [1]:
import nltk
from nltk.corpus import movie_reviews

# Total number of tokens in the whole movie_reviews corpus.
len(movie_reviews.words())

1583820

In [2]:
# Peek at the first five file ids of the corpus.
movie_reviews.fileids()[:5]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

In [3]:
import string

# Translation table that deletes every character listed in string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)

# Join all corpus tokens into one space-separated string, then strip punctuation.
text = " ".join(movie_reviews.words())
text_filtered = text.translate(punctuation_table)

In [4]:
from nltk import word_tokenize
from nltk.corpus import stopwords

# Remove English stopwords.
# NOTE: the next line rebinds `stopwords` from the imported corpus reader to a
# plain list of words; the name is kept because later cells may rely on it.
stopwords = stopwords.words('english')
# A set gives constant-time membership tests instead of scanning the list once
# per token.
stopword_set = set(stopwords)
tokens = word_tokenize(text_filtered)
# Lowercase each token *before* the stopword lookup so that capitalised
# stopwords (e.g. "The") are filtered out as well.
words_filtered = [w for w in (t.lower() for t in tokens) if w not in stopword_set]

In [5]:
# Build the frequency distribution: how often each filtered word occurs in the corpus.
counter_dict = nltk.FreqDist(words_filtered)

In [6]:
# Show the two most frequent words together with their counts.
print(counter_dict.most_common(2))

[('film', 9519), ('one', 5853)]


In [8]:
# Pair each review's token list with its category label, walking the corpus
# one category at a time.
docs = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        review_tokens = list(movie_reviews.words(fileid))
        docs.append((review_tokens, category))

print(docs[0])

(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'b

In [9]:
# Vocabulary: the 3000 most frequent words of the frequency distribution.
word_features = [word for word, _count in counter_dict.most_common(3000)]

# Show the first five.
word_features[:5]

['film', 'one', 'movie', 'like', 'even']

In [13]:
def search_features(doc, vocabulary=None):
    """Build a word-presence feature dict for a single review.

    Args:
        doc: Iterable of word tokens making up the review.
        vocabulary: Optional iterable of words to use as feature names.
            When omitted, the notebook-level ``word_features`` list is used
            (looked up at call time).

    Returns:
        dict mapping every vocabulary word to ``True`` if the word occurs in
        ``doc`` and to ``False`` otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set makes each per-word membership test constant time.
    present = set(doc)
    return {word: (word in present) for word in vocabulary}

# Try the function on the token list of the first review.
result = search_features(docs[0][0])


In [14]:
# Convert every (tokens, label) pair into a (feature dict, label) pair.
featureset = [(search_features(tokens), label) for tokens, label in docs]

In [15]:
# Train/test split: the first 1600 samples for training, the rest for testing.
training_set, testing_set = featureset[:1600], featureset[1600:]

In [19]:
# Inspect the first training sample (a (feature dict, label) pair).
print(f"Training set : {training_set[0]}")



In [20]:
# Class balance of the training set: count each label, then report the shares.
n_count = sum(1 for sample in training_set if sample[1] == "neg")
p_count = sum(1 for sample in training_set if sample[1] == "pos")

for caption, count in (("Negative Reviews:", n_count), ("Positive Reviews:", p_count)):
    print(caption, count)

# Total number of reviews in this split
total_feature_set = len(training_set)

# Shares of each class, as percentages rounded to two decimals
pos_share = p_count / total_feature_set
neg_share = n_count / total_feature_set
percentage_pos = round(pos_share * 100, 2)
percentage_neg = round(neg_share * 100, 2)

print("\nPercentage of Positive Reviews:", percentage_pos, "%")
print("Percentage of Negative Reviews:", percentage_neg, "%")

Negative Reviews: 1000
Positive Reviews: 600

Percentage of Positive Reviews: 37.5 %
Percentage of Negative Reviews: 62.5 %


In [21]:
# Class balance of the testing set: count each label, then report the shares.
n_count = sum(1 for sample in testing_set if sample[1] == "neg")
p_count = sum(1 for sample in testing_set if sample[1] == "pos")

for caption, count in (("Number of Negative Reviews:", n_count),
                       ("Number of Positive Reviews:", p_count)):
    print(caption, count)

# Total number of reviews in this split
total_feature_set = len(testing_set)

# Shares of each class, as percentages rounded to two decimals
pos_share = p_count / total_feature_set
neg_share = n_count / total_feature_set
percentage_pos = round(pos_share * 100, 2)
percentage_neg = round(neg_share * 100, 2)

print("\nPercentage of Positive Reviews:", percentage_pos, "%")
print("Percentage of Negative Reviews:", percentage_neg, "%")

Number of Negative Reviews: 0
Number of Positive Reviews: 400

Percentage of Positive Reviews: 100.0 %
Percentage of Negative Reviews: 0.0 %


In [22]:
# Shuffle the data before splitting so both splits contain a mix of labels.
import random

# Use a dedicated, seeded generator so the shuffle (and therefore the split and
# every number reported after it) is reproducible on Restart & Run All, without
# touching the global `random` state.
# NOTE: the shuffle is in place, so re-running only this cell permutes the
# already-shuffled list again.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(featureset)
training_set = featureset[:1600]
testing_set = featureset[1600:]

In [24]:
print("Training Set After Shuffle")
# Class balance of the training set: count each label, then report the shares.
n_count = sum(1 for sample in training_set if sample[1] == "neg")
p_count = sum(1 for sample in training_set if sample[1] == "pos")

for caption, count in (("Number of Negative Reviews:", n_count),
                       ("Number of Positive Reviews:", p_count)):
    print(caption, count)

# Total number of reviews in this split
total_feature_set = len(training_set)

# Shares of each class, as percentages rounded to two decimals
pos_share = p_count / total_feature_set
neg_share = n_count / total_feature_set
percentage_pos = round(pos_share * 100, 2)
percentage_neg = round(neg_share * 100, 2)

print("\nPercentage of Positive Reviews:", percentage_pos, "%")
print("Percentage of Negative Reviews:", percentage_neg, "%")

Training Set After Shuffle
Number of Negative Reviews: 796
Number of Positive Reviews: 804

Percentage of Positive Reviews: 50.25 %
Percentage of Negative Reviews: 49.75 %


In [25]:
print("testing set after shuffle")
# Class balance of the testing set: count each label, then report the shares.
n_count = sum(1 for sample in testing_set if sample[1] == "neg")
p_count = sum(1 for sample in testing_set if sample[1] == "pos")

for caption, count in (("Number of Negative Reviews:", n_count),
                       ("Number of Positive Reviews:", p_count)):
    print(caption, count)

# Total number of reviews in this split
total_feature_set = len(testing_set)

# Shares of each class, as percentages rounded to two decimals
pos_share = p_count / total_feature_set
neg_share = n_count / total_feature_set
percentage_pos = round(pos_share * 100, 2)
percentage_neg = round(neg_share * 100, 2)

print("\nPercentage of Positive Reviews:", percentage_pos, "%")
print("Percentage of Negative Reviews:", percentage_neg, "%")

testing set after shuffle
Number of Negative Reviews: 204
Number of Positive Reviews: 196

Percentage of Positive Reviews: 49.0 %
Percentage of Negative Reviews: 51.0 %


**Training the classifier**

In [27]:
# Fit a Naive Bayes classifier on the training split.
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Accuracy on both splits, expressed as a percentage.
train_accuracy = nltk.classify.accuracy(classifier, training_set) * 100
test_accuracy = nltk.classify.accuracy(classifier, testing_set) * 100

print(f"Classifier's training accuracy is: {train_accuracy}")
print(f"Classifier's testing accuracy is: {test_accuracy}")

Classifier's training accuracy is: 89.0
Classifier's testing accuracy is: 84.5


**What are my most informative words?**

In [28]:
# Print the five most informative features with their class likelihood ratios.
# NOTE(review): this call appears to only print its table; `report` is
# presumably None - confirm before using it downstream.
report = classifier.show_most_informative_features(5)

Most Informative Features
                   damon = True              pos : neg    =     16.2 : 1.0
               ludicrous = True              neg : pos    =     12.3 : 1.0
             outstanding = True              pos : neg    =     11.3 : 1.0
              schumacher = True              neg : pos    =     11.1 : 1.0
                  finest = True              pos : neg    =     10.9 : 1.0


In [30]:
# Fetch the five most informative features as (feature name, feature value) pairs.
important_features = classifier.most_informative_features(5)
print(important_features)

[('damon', True), ('ludicrous', True), ('outstanding', True), ('schumacher', True), ('finest', True)]


In [33]:
import math
import numpy as np

def get_top_ratios(classifier, n=300, top_n=10):
    """Rank informative features by how strongly they favour each class.

    For each of the classifier's ``n`` most informative features, the
    probability of the feature being ``True`` is read from
    ``classifier._feature_probdist`` for the 'pos' and the 'neg' label, and
    the pos:neg and neg:pos ratios are formed in log space.

    Args:
        classifier: Object exposing ``most_informative_features(n)`` and a
            ``_feature_probdist`` mapping keyed by ``(label, feature_name)``.
        n: How many informative features to consider.
        top_n: How many entries to keep in each ranking.

    Returns:
        Tuple ``(top_pos_neg, top_neg_pos)``; each is a list of
        ``(feature_name, ratio)`` tuples sorted by ratio in descending order
        and truncated to ``top_n`` entries.
    """
    def _log_or_neg_inf(prob):
        # log(0) is undefined, so a zero probability is represented as -inf.
        return np.log(prob) if prob != 0 else float('-inf')

    pos_over_neg = []
    neg_over_pos = []

    for fname, _ in classifier.most_informative_features(n):
        prob_given_pos = classifier._feature_probdist['pos', fname].prob(True)
        prob_given_neg = classifier._feature_probdist['neg', fname].prob(True)

        log_pos = _log_or_neg_inf(prob_given_pos)
        log_neg = _log_or_neg_inf(prob_given_neg)

        pos_over_neg.append((fname, math.exp(log_pos - log_neg)))
        neg_over_pos.append((fname, math.exp(log_neg - log_pos)))

    def _ratio_of(entry):
        return entry[1]

    ranked_pos = sorted(pos_over_neg, key=_ratio_of, reverse=True)
    ranked_neg = sorted(neg_over_pos, key=_ratio_of, reverse=True)

    return ranked_pos[:top_n], ranked_neg[:top_n]


top_pos_neg_features, top_neg_pos_features = get_top_ratios(classifier, n=300, top_n=10)

# Print both rankings with the same layout: a heading, then "feature: ratio" rows.
ratio_tables = (
    ("Top 10 features with highest pos:neg ratios:", top_pos_neg_features),
    ("\nTop 10 features with highest neg:pos ratios:", top_neg_pos_features),
)
for heading, ranked in ratio_tables:
    print(heading)
    for feature, ratio in ranked:
        print(f"{feature}: {ratio:.2f}")


Top 10 features with highest pos:neg ratios:
damon: 16.17
outstanding: 11.33
finest: 10.89
anger: 9.48
mature: 6.49
allows: 5.80
extraordinary: 5.67
wonderfully: 5.58
breathtaking: 5.49
designer: 5.39

Top 10 features with highest neg:pos ratios:
ludicrous: 12.32
schumacher: 11.11
snake: 8.69
idiotic: 7.94
lifeless: 7.94
jolie: 7.07
welles: 6.40
lame: 6.40
laughable: 6.40
awful: 6.34


In [34]:
from nltk.classify import NaiveBayesClassifier

# `training_set` is a list of (feature dict, label) pairs,
# e.g. [({'feature1': True, 'feature2': False}, 'pos'), ...]

# Train the Naive Bayes classifier on the training set.
# NOTE(review): an earlier cell already trains `classifier` on `training_set`;
# this retraining looks redundant - confirm before removing.
classifier = NaiveBayesClassifier.train(training_set)

def extract_features_by_ratio(classifier, n=300, top_n=25, ratio='pos:neg'):
    """Return the names of the features with the highest class ratio.

    For each of the classifier's ``n`` most informative features, the
    probability of the feature being ``True`` under one label is divided by
    the same probability under the other label; the ``top_n`` feature names
    with the largest quotient are returned.

    Args:
        classifier: Object exposing ``most_informative_features(n)`` and a
            ``_feature_probdist`` mapping keyed by ``(label, feature_name)``.
        n: How many informative features to consider.
        top_n: How many feature names to return.
        ratio: ``'pos:neg'`` to rank by P(True|pos) / P(True|neg), or
            ``'neg:pos'`` for the inverse.

    Returns:
        List of feature names, highest ratio first, at most ``top_n`` long.

    Raises:
        ValueError: If ``ratio`` is neither ``'pos:neg'`` nor ``'neg:pos'``.
    """
    label_order = {'pos:neg': ('pos', 'neg'), 'neg:pos': ('neg', 'pos')}
    if ratio not in label_order:
        # Previously an unknown value left the ratio variable unbound (or
        # silently reused the value from the previous iteration).
        raise ValueError(
            "ratio must be 'pos:neg' or 'neg:pos', got {!r}".format(ratio)
        )
    numerator_label, denominator_label = label_order[ratio]

    scored = []
    # The second element of each pair is the feature's value, which is not
    # needed for the ranking.
    for fname, _value in classifier.most_informative_features(n):
        numerator = classifier._feature_probdist[numerator_label, fname].prob(True)
        denominator = classifier._feature_probdist[denominator_label, fname].prob(True)
        if denominator == 0:
            # Avoid ZeroDivisionError: a feature never seen under the
            # denominator label ranks first, unless it is never seen at all.
            feat_ratio = float('inf') if numerator > 0 else 0.0
        else:
            feat_ratio = numerator / denominator
        scored.append((fname, feat_ratio))

    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [fname for fname, _ratio in scored[:top_n]]

def get_top_ratios(classifier, n=300, top_n=25):
    """Collect the feature names with the highest pos:neg and neg:pos ratios.

    NOTE: this redefines (and therefore shadows) the ``get_top_ratios``
    defined in an earlier cell of this notebook, which returned two lists of
    ``(feature, ratio)`` tuples instead of a single list of names.

    Args:
        classifier: Trained classifier passed through to
            ``extract_features_by_ratio``.
        n: How many informative features to consider.
        top_n: How many feature names to take from each ranking.

    Returns:
        Alphabetically sorted list of the feature names from both rankings,
        without duplicates.
    """
    top_pos_neg = extract_features_by_ratio(classifier, n=n, top_n=top_n, ratio='pos:neg')
    top_neg_pos = extract_features_by_ratio(classifier, n=n, top_n=top_n, ratio='neg:pos')

    # Both rankings are drawn from the same candidate features, so they overlap
    # whenever 2 * top_n exceeds the number of candidates; the set union keeps
    # each feature name once.
    top_features = sorted(set(top_pos_neg) | set(top_neg_pos))

    return top_features

# Get the top features: top_n=50 asks for 50 names with the highest pos:neg
# ratio and 50 with the highest neg:pos ratio (100 names in total).
top_features = get_top_ratios(classifier, n=300, top_n=50)

print("Top 100 features for training:")
print(top_features)


Top 100 features for training:
['alicia', 'allows', 'anger', 'anna', 'anywhere', 'awful', 'badly', 'beautifully', 'bland', 'bore', 'boring', 'bother', 'breasts', 'breathtaking', 'culture', 'damme', 'damon', 'decades', 'delightful', 'designer', 'develops', 'dull', 'embarrassing', 'era', 'excellent', 'extraordinary', 'failure', 'fantastic', 'finest', 'fits', 'frankly', 'freedom', 'friendship', 'garbage', 'german', 'gon', 'hanks', 'idiotic', 'inane', 'inept', 'initially', 'innocence', 'italian', 'jedi', 'jolie', 'joy', 'lame', 'laughable', 'lethal', 'lifeless', 'ludicrous', 'mature', 'memorable', 'mess', 'nevertheless', 'nomination', 'obi', 'ordinary', 'outstanding', 'painful', 'painfully', 'patch', 'pointless', 'poorly', 'portrayal', 'portrayed', 'promising', 'random', 'refreshing', 'religion', 'ridiculous', 'ripley', 'sat', 'satisfying', 'schumacher', 'schwarzenegger', 'skip', 'snake', 'snow', 'spice', 'stiller', 'stupid', 'subtle', 'superb', 'tedious', 'terrible', 'terrific', 'themes',

**Saving the model.**

In [38]:
import pickle

# Persist the trained classifier to disk. The `with` block closes the file
# even if pickling raises.
with open("naive_bayes_model.pkl", "wb") as save_classifier:
    pickle.dump(classifier, save_classifier)

In [39]:
# Reload the classifier from disk. The `with` block closes the file even if
# unpickling raises.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load model files that you created yourself or otherwise trust.
with open("naive_bayes_model.pkl", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

In [41]:
custom_review = "I hated the restaurant. It was a disaster eating there. Poor service, arrogant waiters."
# The feature vocabulary was built from lowercased words, so lowercase the
# review's tokens as well; otherwise capitalised words such as "Poor" can never
# match a feature.
custom_review_tokens = [token.lower() for token in word_tokenize(custom_review)]
custom_review_set = search_features(custom_review_tokens)
print(classifier.classify(custom_review_set))

neg
