### Loading the Data

In [1]:
import os
import urllib.request
import tarfile

os.makedirs("data", exist_ok=True)

if not os.path.exists("data/aclImdb_v1.tar.gz"):
        #download database
        print("downloading database...")
        urllib.request.urlretrieve("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", "data/aclImdb_v1.tar.gz")
        print("download complete")

if not os.path.exists("data/aclImdb/"):
        #extract database
        print("extracting database...")
        with tarfile.open("data/aclImdb_v1.tar.gz", "r:gz") as tar:
                tar.extractall(path="data")
        print("database extracted")


In [None]:
import os
import glob

def read_imdb_data(data_dir='data/aclImdb'):
    data = {}
    labels = {}

     # Loop over the two splits: training and testing
    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}

        # Loop over both sentiment categories: positive and negative
        for sentiment in ['pos', 'neg']:
            data[data_type][sentiment] = []
            labels[data_type][sentiment] = []

            # Construct path to all text files of the current split and sentiment
            path = os.path.join(data_dir, data_type, sentiment, '*.txt')
            files = glob.glob(path)

            # Read each review text file
            for f in files:
                with open(f, encoding="utf-8") as review:
                    data[data_type][sentiment].append(review.read())
                    # Assign label 1 for 'pos' and 0 for 'neg'
                    labels[data_type][sentiment].append(1 if sentiment == 'pos' else 0)

            # Sanity check: ensure that every text has a matching label
            assert len(data[data_type][sentiment]) == len(labels[data_type][sentiment]), \
                    "{}/{} data size does not match labels size".format(data_type, sentiment)

    return data, labels

In [3]:
data, labels = read_imdb_data()
print("IMDb reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(
            len(data['train']['pos']), len(data['train']['neg']),
            len(data['test']['pos']), len(data['test']['neg'])))

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 803: character maps to <undefined>

In [None]:
import json
if not os.path.exists("data/data.json"):
        # Save the loaded review texts into a JSON file
        json.dump(data, open('data/data.json', 'w'))
        # Save the sentiment labels into another JSON file
        json.dump(labels, open('data/labels.json', 'w'))

In [None]:
# Open and load the movie review data from 'data.json'
f=open('data/data.json')
data = json.load(f)
f.close()

# Open and load the sentiment labels from 'labels.json'
f=open('data/labels.json')
labels = json.load(f)
f.close()

In [None]:
print("IMDb reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(
            len(data['train']['pos']), len(data['train']['neg']),
            len(data['test']['pos']), len(data['test']['neg'])))

IMDb reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


### 1. Understanding the Data
function that gets the average, maximum, and minimum word length of a dataset.

In [None]:
def understanding_data(data, set):
        maxlength = 0
        minlength = 99999999999999999999999
        total = 0

        for sentiment in data[set]:
                for entry in data[set][sentiment]:
                        word_count = len(entry.split())
                        total += word_count
                        if word_count > maxlength:
                                maxlength = word_count
                        if word_count < minlength:
                                minlength = word_count
        set_size = len(data[set]['pos'])+len(data[set]['neg'])
        print("{} set avg word count: {}, max word count: {}, min word count {}".format(set, total/set_size, maxlength, minlength))


In [None]:
understanding_data(data, "train")
understanding_data(data, "test")

train set avg word count: 233.7872, max word count: 2470, min word count 10
test set avg word count: 228.52668, max word count: 2278, min word count 4


#### Word Length Results:
| | avg word count| max word count | min word count|
|---|---|---|---|
|train set| 233.7872 | 2470 | 10|
|test set | 229.52688 | 2278 | 4|

The word count of the reviews seem to vary quite drastically, with the longest review being almost 2500 words,\
and the shortest being only 4, and an average of about 230 words. Since the LSTM model reads one word at at time,\
reviews that are very long (~2500 words) might make training take significantly more time/memory, while on the\
other hand reviews that are very short (<10 words) probably won't give the model enough context to train on. 

### Creating a Balanced Validation Split

In [None]:
from sklearn.utils import shuffle

def split_data(data, val_size = 3000):
        shuffle(data['train']['pos'], labels['train']['pos'], random_state=420) #keeps the shuffling consistent
        shuffle(data['train']['neg'], labels['train']['neg'], random_state=420)

        val_size_pos = int(val_size/2)
        val_size_neg = val_size - val_size_pos

        train_set = data['train']['pos'][val_size_pos:] + data['train']['neg'][val_size_neg:]

        val_set = data['train']['pos'][:val_size_pos] + data['train']['neg'][:val_size_neg]

        test_set = data['test']['pos'] + data['test']['neg']

        print("Train: {} | Val: {} | Test: {}".format(
                len(train_set), len(val_set), len(test_set)
        ))
        print("IMDB data: train = pos {} / neg {} , val = pos {} / neg {}, test = pos {} / neg {}".format(
                len(data['train']['pos']) - val_size_pos, len(data['train']['neg']) - val_size_neg, 
                val_size_pos, val_size_neg, 
                len(data['test']['pos']), len(data['test']['neg'])
        ))

        return train_set, val_set, test_set

In [None]:
train_set, val_set, test_set = split_data(data)

Train: 22000 | Val: 3000 | Test: 25000
IMDB data: train = pos 11000 / neg 11000 , val = pos 1500 / neg 1500, test = pos 12500 / neg 12500


### Data Preprocessing
- Convert text to lowercase
- Remove punctuation
- Tokenize text
- Remove stop words
- Stemming

In [None]:
# This line imports the Natural Language Toolkit (NLTK) library, which provides various tools and resources for working with human language data.
import nltk

# This imports the stopwords module from NLTK, which contains a list of common English words (like "the", "a", "is")
# that often don't carry significant meaning in text analysis and are usually removed.
from nltk.corpus import stopwords

# This imports the PorterStemmer class from NLTK.
# Stemming is the process of reducing words to their root or base form (e.g., "running" becomes "run").
# The Porter stemmer is a widely used algorithm for this purpose.
from nltk.stem.porter import *

# This imports the re module, which provides support for regular expressions. Regular expressions are powerful tools for pattern matching and text manipulation.
import re

#  This imports the BeautifulSoup library, which is used for parsing HTML and XML documents. It's helpful for extracting text content from web pages or documents that might contain HTML tags.
from bs4 import BeautifulSoup

# This defines a function named review_to_words that takes a single argument review, which is expected to be a string containing the text of a movie review.
def review_to_words(review):
    # This line downloads the list of stopwords from NLTK if it hasn't been downloaded already. The quiet=True argument suppresses the download output.
    nltk.download("stopwords", quiet=True)

    # This creates an instance of the PorterStemmer class, which we'll use later for stemming words.
    stemmer = PorterStemmer()

    # This line uses BeautifulSoup to parse the input review as an HTML document ("html.parser" specifies the parser to use).
    # Then, .get_text() extracts the visible text content, effectively removing any HTML tags that might be present in the review.
    text = BeautifulSoup(review, "html.parser").get_text()

    # text.lower(): It converts the entire text to lowercase. This ensures that words like "The" and "the" are treated as the same.
    # re.sub(r"[^a-zA-Z0-9]", " ", ...): It replaces any character that is not an uppercase letter (a-z), a lowercase letter (A-Z), or a digit (0-9) with a space.
    # This helps in removing punctuation marks and other special characters.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # This line splits the processed text into a list of individual words using whitespace as the delimiter.
    words = text.split()

    # This line uses a list comprehension to filter out stopwords.
    # It iterates through the words list and keeps only those words that are not present in the English stopwords list provided by NLTK.
    words = [w for w in words if w not in stopwords.words("english")]

    #  This line uses another list comprehension to perform stemming.
    # It iterates through the filtered words and applies the stem() method of the PorterStemmer to each word, reducing it to its root form.
    words = [PorterStemmer().stem(w) for w in words] # stem

    return words

In [None]:
import json

#if data has not been processed
if not os.path.exists("data/processed/"):
        os.makedirs("data/processed/")

        train_set_processed = []
        for entry in train_set:
                train_set_processed.append(review_to_words(entry))

        val_set_processed = []
        for entry in val_set:
                val_set_processed.append(review_to_words(entry))

        test_set_processed = []
        for entry in test_set:
                test_set_processed.append(review_to_words(entry))

        # Save into a JSON file
        with open('data/processed/train_set_processed.json', 'w') as f:
                json.dump(train_set_processed, open('data/processed/train_set_processed.json', 'w'))
        with open('data/processed/test_set_processed.json', 'w') as f:
                json.dump(test_set_processed, open('data/processed/test_set_processed.json', 'w'))
        with open('data/processed/val_set_processed.json', 'w') as f:
                json.dump(val_set_processed, open('data/processed/val_set_processed.json', 'w'))


In [None]:
# Open and load preprocessed data
f=open('data/processed/test_set_processed.json')
test_set_processed = json.load(f)
f.close()

f=open('data/processed/train_set_processed.json')
train_set_processed = json.load(f)
f.close()

f=open('data/processed/val_set_processed.json')
val_set_processed = json.load(f)
f.close()

#display preprocessed data sample
print(train_set_processed[0])

['kenneth', 'branagh', 'show', 'excel', 'skill', 'act', 'write', 'deep', 'thought', 'provok', 'interpret', 'shakespear', 'classic', 'well', 'written', 'tragedi', 'kenneth', 'play', 'role', 'hamlet', 'distinct', 'emot', 'provok', 'tear', 'kate', 'winslet', 'perform', 'also', 'great', 'note']
