Problem Definition: The goal of this project is to create a sentiment analysis model that can classify movie reviews as either positive or negative based on their content. 

In [None]:
!pip install pandas scikit-learn matplotlib seaborn nltk wordcloud textblob

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from textblob import TextBlob


The read_reviews function loops through the `pos` and `neg` subdirectories of an IMDb data directory and collects the review texts and their sentiment labels into two parallel lists.
We call it once for the training data and once for the testing data, then load each pair of lists into a pandas DataFrame.

In [None]:
def read_reviews(data_dir):
    """Load labelled reviews from a directory with ``pos`` and ``neg`` subfolders.

    Every regular file directly inside ``data_dir/pos`` and ``data_dir/neg``
    is read as UTF-8 text. Files are visited in sorted filename order so the
    returned lists are identical from run to run.

    Args:
        data_dir: Path of a directory containing ``pos`` and ``neg``
            subdirectories.

    Returns:
        A ``(reviews, sentiments)`` tuple of two equal-length lists, where
        ``sentiments[i]`` is ``'pos'`` or ``'neg'``, the name of the
        subdirectory that ``reviews[i]`` was read from. All ``pos`` entries
        come before all ``neg`` entries.

    Raises:
        OSError: If a ``pos`` or ``neg`` subdirectory is missing or a file
            cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    reviews = []
    sentiments = []

    for sentiment in ('pos', 'neg'):
        sentiment_dir = os.path.join(data_dir, sentiment)
        # os.listdir returns entries in arbitrary order; sort them so the
        # row order of the resulting data is reproducible.
        for filename in sorted(os.listdir(sentiment_dir)):
            path = os.path.join(sentiment_dir, filename)
            # Skip nested directories and other non-file entries, which
            # would otherwise make open() fail.
            if not os.path.isfile(path):
                continue
            with open(path, 'r', encoding='utf-8') as f:
                reviews.append(f.read())
            sentiments.append(sentiment)
    return reviews, sentiments

# Directories holding the labelled training and testing splits.
train_data_dir = 'data/aclImdb/train'
test_data_dir = 'data/aclImdb/test'

# Each call returns two parallel lists: review texts and their labels.
train_reviews, train_sentiments = read_reviews(train_data_dir)
test_reviews, test_sentiments = read_reviews(test_data_dir)

# One row per review, with the text and its 'pos'/'neg' label as columns.
train_df = pd.DataFrame(dict(review=train_reviews, sentiment=train_sentiments))
test_df = pd.DataFrame(dict(review=test_reviews, sentiment=test_sentiments))

# Preview the first rows of each frame.
print(train_df.head())
print(test_df.head())

Now that we have our dataframes, it's time to explore the data. Unlike the continuous numerical features in our wine and flower classifiers, here we instead have text-based data.
We will analyze the top-n most frequent words in both positive and negative reviews.
To do so, we must first preprocess the text by tokenizing, removing stop words, and applying lemmatization.
Then we will create frequency distributions for both positive and negative reviews.
Finally, we will visualize the top-n most frequent words for both positive and negative reviews.

In [None]:
# Fetch the NLTK resources this notebook relies on:
#   punkt / punkt_tab - tokenizer models used by nltk.word_tokenize
#   stopwords         - the English stop-word list
#   wordnet           - the database behind WordNetLemmatizer
# 'punkt_tab' is the resource newer NLTK releases look up in place of the
# pickled 'punkt' models; both are requested so the notebook works with
# either generation of the library.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


# Bounded so the cache cannot grow without limit on unusual input.
@lru_cache(maxsize=2 ** 18)
def _polar_lemma(word):
    """Return the lemma of *word* if its TextBlob polarity is strong, else None.

    The result depends only on the word itself, so it is cached: the same
    words recur across many reviews and building a TextBlob for each
    occurrence is needlessly slow.
    """
    if abs(TextBlob(word).sentiment.polarity) > 0.4:
        return WordNetLemmatizer().lemmatize(word)
    return None


#Tokenize, remove stopwords, keep strongly polar words, lemmatize input text
def preprocess_text(text):
    """Turn a raw review into a list of lemmatized, sentiment-bearing words.

    Steps, in order:
      1. Lower-case the text and split it with ``nltk.word_tokenize``.
      2. Drop tokens that are not purely alphanumeric or that are English
         stop words.
      3. Keep only tokens whose TextBlob polarity has an absolute value
         greater than 0.4.
      4. Lemmatize the remaining tokens with ``WordNetLemmatizer``.

    Args:
        text: The review text as a string.

    Returns:
        A list of lemmatized words in their original order. Duplicates are
        kept. The list may be empty.
    """
    stop_words = _english_stop_words()

    lemmatized_words = []
    for token in nltk.word_tokenize(text.lower()):
        if not token.isalnum() or token in stop_words:
            continue
        lemma = _polar_lemma(token)
        if lemma is not None:
            lemmatized_words.append(lemma)

    return lemmatized_words

# Split the training reviews by label.
positive_reviews = train_df.loc[train_df['sentiment'] == 'pos', 'review']
negative_reviews = train_df.loc[train_df['sentiment'] == 'neg', 'review']

# Flatten the preprocessed words of every review into one list per label.
positive_words = [
    word
    for review in positive_reviews
    for word in preprocess_text(review)
]
negative_words = [
    word
    for review in negative_reviews
    for word in preprocess_text(review)
]

# Count how often each word occurs within each label.
positive_word_freq = Counter(positive_words)
negative_word_freq = Counter(negative_words)

#Visualize top-n most frequent words for positive and negative reviews with matplotlib
def plot_word_freq(word_freq, n, title):
    """Draw a bar chart of the ``n`` most common entries of ``word_freq``.

    Args:
        word_freq: A ``collections.Counter`` (or any object with a
            compatible ``most_common`` method) mapping words to counts.
        n: How many of the most frequent words to show.
        title: Title displayed above the chart.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    top_n_words = word_freq.most_common(n)
    # zip(*[]) produces nothing to unpack, so an empty counter (or n == 0)
    # would raise ValueError; draw an empty chart in that case instead.
    if top_n_words:
        words, frequencies = zip(*top_n_words)
    else:
        words, frequencies = (), ()
    plt.figure(figsize=(15, 5))
    plt.bar(words, frequencies)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title(title)
    plt.show()


# Number of top-ranked words shown in each chart.
n = 20
plot_word_freq(
    positive_word_freq,
    n=n,
    title='Most Frequent Words in Positive Reviews',
)
plot_word_freq(
    negative_word_freq,
    n=n,
    title='Most Frequent Words in Negative Reviews',
)

With the preprocessing function defined, we can now vectorize the reviews into a numerical format: TF-IDF weights over the unigrams and bigrams of each review's preprocessed words.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams and bigrams of the tokens that
# preprocess_text returns for each review.
# token_pattern=None: the default regex pattern is unused once a custom
# tokenizer is supplied, and leaving it set makes scikit-learn warn about it.
vectorizer = TfidfVectorizer(
    tokenizer=preprocess_text,
    token_pattern=None,
    ngram_range=(1, 2),
)
# Learn the vocabulary and IDF weights from the training reviews only, then
# apply that same fitted mapping to the test reviews.
X_train = vectorizer.fit_transform(train_df['review'])
X_test = vectorizer.transform(test_df['review'])