# IMDB sentiment analysis

In [2]:
import os
import re
import shutil
import sys
import tarfile
import urllib.request
from functools import partial

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.sentiment.util import mark_negation
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    make_scorer,
    mean_absolute_error,
    mean_squared_error,
    plot_confusion_matrix,
    r2_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

RANDOM_SEED = 42

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data loading

In [3]:
# Set the dataset name and URL
dataset_name = "imdb"
dataset_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Create the dataset folder
dataset_folder = os.path.join(os.getcwd(), "datasets", dataset_name)
original_dataset_folder = os.path.join(dataset_folder, "original")
if not os.path.exists(original_dataset_folder):
    os.makedirs(original_dataset_folder)

# Create the dataframe folder
df_folder = os.path.join(dataset_folder, "dataframe")
if not os.path.exists(df_folder):
    os.makedirs(df_folder)
dataframe_path = os.path.join(df_folder, dataset_name + ".pkl")

In [4]:
# Download and extract the dataset
original_dataset_path = os.path.join(original_dataset_folder, "movies.tar.gz")
if not os.path.exists(original_dataset_path):
    urllib.request.urlretrieve(dataset_url, original_dataset_path)
    print("Successful download")
    tar = tarfile.open(original_dataset_path)
    tar.extractall(original_dataset_folder)
    extracted_folder_name = tar.getnames()[0]
    tar.close()
    print("Successful extraction")
else:
    print("Dataset already downloaded and extracted")

Successful download
Successful extraction


In [5]:
if not os.path.exists(dataframe_path):
    dataframe_rows = []
    for split in ["train", "test"]:
        for sentiment in ["pos", "neg"]:
            folder = os.path.join(
                original_dataset_folder, extracted_folder_name, split, sentiment
            )
            for filename in os.listdir(folder):
                file_path = os.path.join(folder, filename)
                try:
                    if os.path.isfile(file_path):
                        with open(file_path, mode="r", encoding="utf-8") as text_file:
                            # Extract info
                            text = text_file.read()
                            score = filename.split("_")[1].split(".")[0]
                            file_id = filename.split("_")[0]

                            # Compute sentiment
                            num_sentiment = (
                                1 if sentiment == "pos"
                                else 0 if sentiment == "neg"
                                else -1
                            )

                            # Create single dataframe row
                            dataframe_row = {
                                "file_id": file_id,
                                "score": score,
                                "sentiment": num_sentiment,
                                "split": split,
                                "text": text,
                            }
                            dataframe_rows.append(dataframe_row)
                except Exception as e:
                    print("Failed to process %s. Reason: %s" % (file_path, e))
                    sys.exit(0)

    # Transform the list of rows in a proper dataframe
    dataframe = pd.DataFrame(dataframe_rows)
    dataframe_cols = ["file_id", "score", "sentiment", "split", "text"]
    dataframe = dataframe[dataframe_cols]
    dataframe.to_pickle(dataframe_path)
else:
    print("Dataframe already saved as a pickle file")

In [4]:
df = pd.read_pickle(dataframe_path)
df['score'] = pd.to_numeric(df['score'])
df.head()

Unnamed: 0,file_id,score,sentiment,split,text
0,2257,7,1,train,"Sarafina was a fun movie, and some of the song..."
1,4778,9,1,train,"Like his early masterpiece ""The Elephant Man"" ..."
2,7284,8,1,train,When I was young I had seen very few movies. M...
3,4845,9,1,train,Hello Playmates.I recently watched this film f...
4,6822,7,1,train,"""Opening Night"" released in 1977, tries to be ..."


## Data exploration

In [5]:
print("Distribution of scores: ")
df['score'].value_counts()

Distribution of scores: 


1     10122
10     9731
8      5859
4      5331
3      4961
7      4803
9      4607
2      4586
Name: score, dtype: int64

In [6]:
score_labels = sorted(df['score'].unique())
print(f"Score labels: {score_labels}")

Score labels: [1, 2, 3, 4, 7, 8, 9, 10]


In [7]:
print(f"Number of duplicated texts: {df['text'].duplicated().value_counts()[True]}")

Number of duplicated texts: 418


In [7]:
def compute_freqs(words, top, thresh):
    """
    Return the `top` most frequent words of the dataset,
    after filtering them to be after `thresh` frequency 
    """
    fdist = nltk.FreqDist(words)
    common = {
        k: fdist.freq(k)
        for k, _ in sorted(fdist.items(), key=lambda i: i[1], reverse=True)
    }
    words = list(common.keys())
    freqs = list(common.values())
    common_cumulative = {k: sum(freqs[:i]) for i, k in enumerate(words)}
    return [k for k, v in common_cumulative.items() if v >= thresh][:top]


def print_df_freqs(dataframe, top=10, thresh=0.7):
    """
    Print the `top` most frequent words of the dataset,
    for each class
    """
    for c in score_labels:
        words = []
        text = df.query(f"score == {c}")["text"].to_numpy()
        for t in text:
            words.extend(word_tokenize(t.lower()))
        print(f"Number of different words in class {c}: {len(set(words))}")
        common = compute_freqs(words, top=top, thresh=thresh)
        print(f"{top} most frequent (after {thresh} frequency) words in class {c}:")
        print(common)
        print()


print_df_freqs(df)

Number of different words in class 1: 61170
10 most frequent (after 0.7 frequency) words in class 1:
['book', 'maybe', 'gets', 'almost', 'may', '2', 'sure', 'since', 'however', '..']

Number of different words in class 2: 44293
10 most frequent (after 0.7 frequency) words in class 2:
['rather', 'own', 'budget', 'sense', 'actor', 'both', 'feel', 'yet', 'having', 'half']

Number of different words in class 3: 49303
10 most frequent (after 0.7 frequency) words in class 3:
['screen', 'audience', 'stupid', 'family', 'actor', 'house', 'rest', 'sex', 'once', 'during']

Number of different words in class 4: 52665
10 most frequent (after 0.7 frequency) words in class 4:
['actor', 'death', 'different', 'help', 'fan', 'together', 'takes', 'each', 'less', 'house']

Number of different words in class 7: 50749
10 most frequent (after 0.7 frequency) words in class 7:
['\x96', 'until', 'second', 'believe', 'keep', 'kids', 'become', 'small', 'hollywood', 'production']

Number of different words in clas

In [15]:
def re_match(regex, text):
    """
    Return True if the given regex matches inside the given text,
    otherwise return False
    """
    return re.match(regex, text) is not None


# Find dates
DATE_RE = r"\d{1,2}[-\/\.]\d{1,2}[-\/\.]\d{2,4}"
num_dates = df["text"].map(partial(re_match, DATE_RE)).value_counts()
print("Number of texts with dates:")
print(num_dates)
print()

# Find floats
FLOAT_RE = r"(\d*\,)?\d+.\d*"
num_floats = df["text"].map(partial(re_match, FLOAT_RE)).value_counts()
print("Number of texts with decimal numbers:")
print(num_floats)
print()

# Find ints
INT_RE = r"(?<=\s)\d+(?=\s)"
num_ints = df["text"].map(partial(re_match, INT_RE)).value_counts()
print("Number of texts with integers:")
print(num_ints)
print()

# Find brackets
BRACKETS_RE = r"\[[^]]*\]"
num_brackets = df["text"].map(partial(re_match, BRACKETS_RE)).value_counts()
print("Number of texts with elements in square brackets:")
print(num_brackets)
print()

# Find HTML tags
HTML_RE = r"<.*?>"
num_html = df["text"].map(partial(re_match, HTML_RE)).value_counts()
print("Number of texts with HTML tags:")
print(num_html)
print()

# Find punctuation
PUNCTUATION_RE = r"[^\w{w}\s\{<>}]+"

Number of texts with dates:
False    49997
True         3
Name: text, dtype: int64

Number of texts with decimal numbers:
False    49814
True       186
Name: text, dtype: int64

Number of texts with integers:
False    50000
Name: text, dtype: int64

Number of texts with elements in square brackets:
False    49987
True        13
Name: text, dtype: int64

Number of texts with HTML tags:
False    49846
True       154
Name: text, dtype: int64



## Train/test utils

In [8]:
train_df = df.query('split == "train"')
test_df = df.query('split == "test"')

In [9]:
x_train, x_test, y_train, y_test = (
    train_df["text"],
    train_df["score"],
    test_df["text"],
    test_df["score"],
)
print(f"Shape of the train data: {x_train.shape}")
print(f"Shape of the test data: {x_test.shape}")

Shape of the train data: (25000,)
Shape of the test data: (25000,)


In [10]:
def predict(classifier, x_train, x_test):
    return (
        classifier.predict(x_train),
        classifier.predict(x_test),
        np.around(y_pred_test),
    )

In [11]:
def print_evaluation(y_test, y_pred_test, y_pred_test_class):
    # Evaluation as a regression task
    print("R2 score %f" % (r2_score(y_test, y_pred_test)))
    print("MAE %f" % (mean_absolute_error(y_test, y_pred_test)))
    print("MSE %f" % (mean_squared_error(y_test, y_pred_test)))
    print()

    # Evaluation as a multi-class classification task
    report = classification_report(y_test, y_pred_test_class, labels=score_labels)
    print(report)

    # Fancy confusion matrix
    plot_confusion_matrix(
        classifier,
        x_test,
        y_test,
        normalize="true",
        cmap=plt.cm.Blues,
        values_format=".2f",
    )

## Preprocessing

In [12]:
EN_STOPWORDS = set(stopwords.words("english"))
print(f"Stopwords ({len(EN_STOPWORDS)}): {EN_STOPWORDS}")

Stopwords (179): {'are', 'so', 'aren', 'each', 'has', 'have', 'nor', 'how', "should've", 'through', "needn't", 'its', 'mustn', 'just', "weren't", 'does', 'which', 'shan', 'very', 'didn', 'were', 'haven', 'and', "it's", 's', 'who', 'if', 'did', 'ourselves', 'having', 'more', 'o', 'below', 'because', 'is', 'same', 'by', 'shouldn', 'over', 'in', 'they', 'wasn', 'there', 'them', 'not', 'yours', 'once', 'wouldn', 'theirs', 'with', 'between', "shouldn't", 'the', 'again', 'about', 'myself', 'or', "you're", 'been', 'it', 'was', 'doing', 'here', 'herself', "haven't", "didn't", "won't", 'himself', 'needn', 'his', 'than', 'my', 'into', 'these', 'him', 'had', 'm', 'ain', 'won', 'this', "couldn't", 'during', 'while', 'being', 'to', 'will', "doesn't", 'such', 'of', 'under', 'our', "don't", 'am', 'when', 'itself', 'yourself', 'you', 'their', 'as', 'own', 'i', 'what', 'a', "you've", 'weren', 'hers', 'most', "wouldn't", 'we', 'she', 'above', 're', 'hadn', 'on', 'other', 'ours', 'where', 'against', 'any

In [16]:
STEMMER = nltk.porter.PorterStemmer()
LEMMATIZER = nltk.wordnet.WordNetLemmatizer()


def stem_text(text):
    return [STEMMER.stem(word) for word in text]


def lemmatize_text(text):
    return [LEMMATIZER.lemmatize(word) for word in text]


def preprocess_text(
    text,
    mark_negation=False,
    remove_punct=False,
    remove_stopwords=False,
    min_chars=None,
    root=None,
):
    # Strip trailing spaces and remove newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(HTML_RE, "", text)
    # Remove text in square brackets
    text = re.sub(BRACKETS_RE, "", text)
    # Remove dates
    text = re.sub(DATE_RE, "", text)
    # Remove floating numbers
    text = re.sub(FLOAT_RE, "", text)
    # Mark negation
    if mark_negation:
        text = " ".join(mark_negation(text.split(), double_neg_flip=False))
        EN_STOPWORDS.add("_NEG")
    # Remove punctuation
    if remove_punct:
        text = re.sub(PUNCTUATION_RE, "", text)
    # Leave single whitespace
    text = text.split()
    # Remove words with less than `n` chars
    if min_chars is not None and isinstance(min_chars, int):
        text = [word for word in text if len(word) >= min_chars]
    # Remove stopwords
    if remove_stopwords:
        text = [word for word in text if word not in EN_STOPWORDS]
    # Perform stemming/lemmatization
    if root == "stem":
        text = stem_text(text)
    elif root == "lemmatize":
        text = lemmatize_text(text)
    # Return the text as a string
    return " ".join(text)

In [17]:
rnd_text = np.random.choice(df.index, 1)[0]
print(f"Random text with score {df['score'].iloc[rnd_text]}")
print()
print(f"Before pre-processing:")
print(df['text'].iloc[rnd_text])
print()
print("After pre-processing:")
print(preprocess_text(df['text'].iloc[rnd_text]))

Random text with score 1

Before pre-processing:
"Catchfire" or "Backtrack" as it is sometimes called, is not very good. That is, it's bad. Jodie Foster had already won an Oscar at this point. Why did she agree to do this? I don't know.<br /><br />The hostage/kidnapper relationship is not believable, even if it is a common psychological phenomenon in real life.<br /><br />Worst of all, this film features a scene where Hopper and Foster ride a boat under the Fremont Bridge (a bridge in Seattle) which means that traffic had to stop so that the bridge could open. I've had to wait for that bridge to go down many times, almost all of them on the bus. It's not a pleasant wait. This film caused unnecessary bridge-waiting and the world is a worse place for it.

After pre-processing:
"catchfire" or "backtrack" as it is sometimes called, is not very good. that is, it's bad. jodie foster had already won an oscar at this point. why did she agree to do this? i don't know.the hostage/kidnapper relat

## Baseline

In [19]:
text_clf = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression(random_state=RANDOM_SEED, max_iter=1000)),
    ]
)
text_clf = text_clf.fit(x_train, y_train)

KeyboardInterrupt: 

In [None]:
y_pred_train, y_pred_test, y_pred_test_class = predict(classifier, x_train, x_test)
print_evaluation(y_test, y_pred_test, y_pred_test_class)

## Classification with logistic regression

In [18]:
text_clf = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", LogisticRegression(random_state=RANDOM_SEED, max_iter=1000)),
    ]
)

parameters = {
    "vect__ngram_range": [(1, 1), (1, 2)],
    "tfidf__use_idf": (True, False),
}
classifier = GridSearchCV(
    text_clf, parameters, scoring=make_scorer(r2_score), cv=5, n_jobs=-1
)
classifier = classifier.fit(x_train, y_train)
print(f"Best parameters: {classifier.best_params_}")



KeyboardInterrupt: 

In [None]:
y_pred_train, y_pred_test, y_pred_test_class = predict(classifier, x_train, x_test)
print_evaluation(y_test, y_pred_test, y_pred_test_class)

## Classification with another classifier

In [None]:
text_clf = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        (
            "clf",
            SGDClassifier(
                random_state=RANDOM_SEED,
                loss="hinge",
                penalty="l2",
                alpha=1e-3,
                max_iter=500,
                tol=None,
            ),
        ),
    ]
)

parameters = {
    "vect__ngram_range": [(1, 1), (1, 2)],
    "tfidf__use_idf": (True, False),
    "clf__alpha": (1e-2, 1e-3),
}
classifier = GridSearchCV(
    text_clf, parameters, scoring=make_scorer(r2_score), cv=5, n_jobs=-1
)
classifier = classifier.fit(x_train, y_train)
print(f"Best parameters: {classifier.best_params_}")

In [None]:
y_pred_train, y_pred_test, y_pred_test_class = predict(classifier, x_train, x_test)
print_evaluation(y_test, y_pred_test, y_pred_test_class)