In [39]:
# Import libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist, NaiveBayesClassifier, classify
import re
import string
import random
import pandas as pd

# Download NLTK data files (if not already installed)
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/takaya_shirai@pip.waseda.ac.jp/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/takaya_shirai@pip.waseda.ac.jp/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/takaya_shirai@pip.waseda.ac.jp/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/takaya_shirai@pip.waseda.ac.jp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
# Define a function to clean and normalize tokens
def normalize_and_remove_noise(pos_tokens, stop_words = ()):
    cleaned_tokens = []

    for token, tag in pos_tokens:
        # Remove URLs starting with http:// or https://
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)

        # Determine part of speech for lemmatization
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        # Lemmatize token
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        # Remove punctuation, special characters, and stop words
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [41]:
# Define a function to prepare tokens for the Naive Bayes model
def prepare_tokens_for_model(cleaned_tokens):
    for tokens in cleaned_tokens:
        yield dict([token, True] for token in tokens)

In [42]:
# Load the dataset
data = pd.read_csv('./data/IMDB Dataset.csv')

# Separate positive and negative reviews
positive_reviews = data[data['sentiment'] == 'positive']
negative_reviews = data[data['sentiment'] == 'negative']

# Display the number of positive and negative samples
print("Number of positive samples:", len(positive_reviews))
print("Number of negative samples:", len(negative_reviews))

# Tokenize and POS-tag the data
positive_tokens = positive_reviews['review'].apply(word_tokenize)
negative_tokens = negative_reviews['review'].apply(word_tokenize)

pos_tagged_positive_tokens = positive_tokens.apply(pos_tag)
pos_tagged_negative_tokens = negative_tokens.apply(pos_tag)

Number of positive samples: 25000
Number of negative samples: 25000


In [43]:
# Define stop words
stop_words = stopwords.words('english')

# Clean the tokens
cleaned_positive_tokens = [normalize_and_remove_noise(tokens, stop_words) for tokens in pos_tagged_positive_tokens]
cleaned_negative_tokens = [normalize_and_remove_noise(tokens, stop_words) for tokens in pos_tagged_negative_tokens]

In [46]:
# Prepare data for the model
positive_features = prepare_tokens_for_model(positive_cleaned_tokens)
negative_features = prepare_tokens_for_model(negative_cleaned_tokens)

# Label datasets as positive or negative
positive_dataset = [(feature_set, "Positive") for feature_set in positive_features]
negative_dataset = [(feature_set, "Negative") for feature_set in negative_features]

# Combine and shuffle the dataset
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)

# Split data into training (90%) and testing (10%) sets
train_data_size = int(len(dataset) * 0.9)
train_data = dataset[:train_data_size]
test_data = dataset[train_data_size:]

In [47]:
# Train Naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_data)

# Evaluate and display model accuracy
print("Accuracy is:", classify.accuracy(classifier, test_data))

# Display the most informative features
print(classifier.show_most_informative_features(10))

Accuracy is: 0.8606
Most Informative Features
                    2/10 = True           Negati : Positi =     83.1 : 1.0
                    boll = True           Negati : Positi =     58.4 : 1.0
                     uwe = True           Negati : Positi =     57.7 : 1.0
                    3/10 = True           Negati : Positi =     57.5 : 1.0
                  awful. = True           Negati : Positi =     47.0 : 1.0
                    4/10 = True           Negati : Positi =     36.3 : 1.0
               terrible. = True           Negati : Positi =     33.7 : 1.0
                    8/10 = True           Positi : Negati =     30.6 : 1.0
                    1/10 = True           Negati : Positi =     29.7 : 1.0
                      d- = True           Negati : Positi =     29.7 : 1.0
None
