# IMDB sentiment analysis

In [2]:
import os
import re
import shutil
import sys
import tarfile
import urllib.request
from functools import partial

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.sentiment.util import mark_negation
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    make_scorer,
    mean_absolute_error,
    mean_squared_error,
    plot_confusion_matrix,
    r2_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

RANDOM_SEED = 42

[nltk_data] Downloading package stopwords to /Users/jobs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jobs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jobs/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data loading

In [3]:
# Set the dataset name and URL
dataset_name = "imdb"
dataset_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Create the dataset folder
dataset_folder = os.path.join(os.getcwd(), "datasets", dataset_name)
original_dataset_folder = os.path.join(dataset_folder, "original")
if not os.path.exists(original_dataset_folder):
    os.makedirs(original_dataset_folder)

# Create the dataframe folder
df_folder = os.path.join(dataset_folder, "dataframe")
if not os.path.exists(df_folder):
    os.makedirs(df_folder)
dataframe_path = os.path.join(df_folder, dataset_name + ".pkl")

In [7]:
# Download and extract the dataset
original_dataset_path = os.path.join(original_dataset_folder, "movies.tar.gz")
if not os.path.exists(original_dataset_path):
    urllib.request.urlretrieve(dataset_url, original_dataset_path)
    print("Successful download")
    tar = tarfile.open(original_dataset_path)
    tar.extractall(original_dataset_folder)
    extracted_folder_name = tar.getnames()[0]
    tar.close()
    print("Successful extraction")
else:
    print("Dataset already downloaded and extracted")

Dataset already downloaded and extracted


In [8]:
if not os.path.exists(dataframe_path):
    dataframe_rows = []
    for split in ["train", "test"]:
        for sentiment in ["pos", "neg"]:
            folder = os.path.join(
                original_dataset_folder, "aclImdb", split, sentiment
            )
            for filename in os.listdir(folder):
                file_path = os.path.join(folder, filename)
                try:
                    if os.path.isfile(file_path):
                        with open(file_path, mode="r", encoding="utf-8") as text_file:
                            # Extract info
                            text = text_file.read()
                            score = filename.split("_")[1].split(".")[0]
                            file_id = filename.split("_")[0]

                            # Compute sentiment
                            num_sentiment = (
                                1 if sentiment == "pos"
                                else 0 if sentiment == "neg"
                                else -1
                            )

                            # Create single dataframe row
                            dataframe_row = {
                                "file_id": file_id,
                                "score": score,
                                "sentiment": num_sentiment,
                                "split": split,
                                "text": text,
                            }
                            dataframe_rows.append(dataframe_row)
                except Exception as e:
                    print("Failed to process %s. Reason: %s" % (file_path, e))
                    sys.exit(0)

    # Transform the list of rows in a proper dataframe
    dataframe = pd.DataFrame(dataframe_rows)
    dataframe_cols = ["file_id", "score", "sentiment", "split", "text"]
    dataframe = dataframe[dataframe_cols]
    dataframe.to_pickle(dataframe_path)
else:
    print("Dataframe already saved as a pickle file")

Dataframe already saved as a pickle file


In [4]:
df = pd.read_pickle(dataframe_path)
df['score'] = pd.to_numeric(df['score'])
df.head()

Unnamed: 0,file_id,score,sentiment,split,text
0,4715,9,1,train,For a movie that gets no respect there sure ar...
1,12390,8,1,train,Bizarre horror movie filled with famous faces ...
2,8329,7,1,train,"A solid, if unremarkable film. Matthau, as Ein..."
3,9063,8,1,train,It's a strange feeling to sit alone in a theat...
4,3092,10,1,train,"You probably all already know this by now, but..."


## Data exploration

In [10]:
print("Distribution of scores: ")
df['score'].value_counts()

Distribution of scores: 


1     10122
10     9731
8      5859
4      5331
3      4961
7      4803
9      4607
2      4586
Name: score, dtype: int64

In [11]:
score_labels = sorted(df['score'].unique())
print(f"Score labels: {score_labels}")

Score labels: [1, 2, 3, 4, 7, 8, 9, 10]


In [12]:
print(f"Number of duplicated texts: {df['text'].duplicated().value_counts()[True]}")

Number of duplicated texts: 418


In [7]:
def compute_freqs(words, top, thresh):
    """
    Return the `top` most frequent words of the dataset,
    after filtering them to be after `thresh` frequency 
    """
    fdist = nltk.FreqDist(words)
    common = {
        k: fdist.freq(k)
        for k, _ in sorted(fdist.items(), key=lambda i: i[1], reverse=True)
    }
    words = list(common.keys())
    freqs = list(common.values())
    common_cumulative = {k: sum(freqs[:i]) for i, k in enumerate(words)}
    return [k for k, v in common_cumulative.items() if v >= thresh][:top]


def print_df_freqs(dataframe, top=10, thresh=0.7):
    """
    Print the `top` most frequent words of the dataset,
    for each class
    """
    for c in score_labels:
        words = []
        text = df.query(f"score == {c}")["text"].to_numpy()
        for t in text:
            words.extend(word_tokenize(t.lower()))
        print(f"Number of different words in class {c}: {len(set(words))}")
        common = compute_freqs(words, top=top, thresh=thresh)
        print(f"{top} most frequent (after {thresh} frequency) words in class {c}:")
        print(common)
        print()


print_df_freqs(df)

Number of different words in class 1: 61170
10 most frequent (after 0.7 frequency) words in class 1:
['book', 'maybe', 'gets', 'almost', 'may', '2', 'sure', 'since', 'however', '..']

Number of different words in class 2: 44293
10 most frequent (after 0.7 frequency) words in class 2:
['rather', 'own', 'budget', 'sense', 'actor', 'both', 'feel', 'yet', 'having', 'half']

Number of different words in class 3: 49303
10 most frequent (after 0.7 frequency) words in class 3:
['screen', 'audience', 'stupid', 'family', 'actor', 'house', 'rest', 'sex', 'once', 'during']

Number of different words in class 4: 52665
10 most frequent (after 0.7 frequency) words in class 4:
['actor', 'death', 'different', 'help', 'fan', 'together', 'takes', 'each', 'less', 'house']

Number of different words in class 7: 50749
10 most frequent (after 0.7 frequency) words in class 7:
['\x96', 'until', 'second', 'believe', 'keep', 'kids', 'become', 'small', 'hollywood', 'production']

Number of different words in clas

In [5]:
def re_match(regex, text):
    """
    Return True if the given regex matches inside the given text,
    otherwise return False
    """
    return re.match(regex, text) is not None


# Find dates
DATE_RE = r"\d{1,2}[-\/\.]\d{1,2}[-\/\.]\d{2,4}"
num_dates = df["text"].map(partial(re_match, DATE_RE)).value_counts()
print("Number of texts with dates:")
print(num_dates)
print()

# Find floats
FLOAT_RE = r"(\d*\,)?\d+.\d*"
num_floats = df["text"].map(partial(re_match, FLOAT_RE)).value_counts()
print("Number of texts with decimal numbers:")
print(num_floats)
print()

# Find ints
INT_RE = r"(?<=\s)\d+(?=\s)"
num_ints = df["text"].map(partial(re_match, INT_RE)).value_counts()
print("Number of texts with integers:")
print(num_ints)
print()

# Find brackets
BRACKETS_RE = r"\[[^]]*\]"
num_brackets = df["text"].map(partial(re_match, BRACKETS_RE)).value_counts()
print("Number of texts with elements in square brackets:")
print(num_brackets)
print()

# Find HTML tags
HTML_RE = r"<.*?>"
num_html = df["text"].map(partial(re_match, HTML_RE)).value_counts()
print("Number of texts with HTML tags:")
print(num_html)
print()

# Find punctuation
PUNCTUATION_RE = r"[^\w{w}\s\{<>}]+"

Number of texts with dates:
False    49997
True         3
Name: text, dtype: int64

Number of texts with decimal numbers:
False    49814
True       186
Name: text, dtype: int64

Number of texts with integers:
False    50000
Name: text, dtype: int64

Number of texts with elements in square brackets:
False    49987
True        13
Name: text, dtype: int64

Number of texts with HTML tags:
False    49846
True       154
Name: text, dtype: int64



## Train/test utils

In [17]:
train_df = df.loc[df['split'] == "train"]
test_df = df.loc[df['split'] == "test"]

In [21]:
train_corpus, train_scores, test_corpus, test_scores = (
    train_df["text"].tolist(),
    np.array(train_df["score"].tolist()),
    test_df["text"].tolist(),
    np.array(test_df["score"].tolist()),
)
print(f"Number of sentences in the training set: {len(x_train)}")
print(f"Number of sentences in the test set: {len(x_test)}")

Number of sentences in the training set: 25000
Number of sentences in the test set: 25000


In [22]:
def predict(classifier, x_train, x_test):
    return (
        classifier.predict(x_train),
        classifier.predict(x_test),
        np.around(y_pred_test),
    )

In [23]:
def print_evaluation(y_test, y_pred_test, y_pred_test_class):
    # Evaluation as a regression task
    print("R2 score %f" % (r2_score(y_test, y_pred_test)))
    print("MAE %f" % (mean_absolute_error(y_test, y_pred_test)))
    print("MSE %f" % (mean_squared_error(y_test, y_pred_test)))
    print()

    # Evaluation as a multi-class classification task
    report = classification_report(y_test, y_pred_test_class, labels=score_labels)
    print(report)

    # Fancy confusion matrix
    plot_confusion_matrix(
        classifier,
        x_test,
        y_test,
        normalize="true",
        cmap=plt.cm.Blues,
        values_format=".2f",
    )

## Preprocessing

In [24]:
EN_STOPWORDS = set(stopwords.words("english"))
print(f"Stopwords ({len(EN_STOPWORDS)}): {EN_STOPWORDS}")

Stopwords (179): {'after', 'your', 'd', "mustn't", 'which', "needn't", 'it', 'doesn', 'couldn', 'our', 'some', "weren't", "haven't", 'm', 'ours', "you're", 'their', 'him', 'as', 'few', 'hasn', 'these', 'more', 'are', 'not', 'you', 'were', 'until', 'against', 'won', 'those', 'on', "you've", 'own', "shouldn't", "she's", "aren't", 'before', 'doing', "it's", 'the', 'from', "don't", 'for', 'such', 'theirs', 'didn', 'up', 'o', "wouldn't", 'his', 'about', 'over', 'here', 'if', 'most', 'be', 'into', 'above', 's', "should've", 'did', 'between', 'aren', 'itself', 'with', "couldn't", 'an', 'while', 've', 'of', 'off', 'further', 'why', 'both', 'same', 'this', 'than', 'just', 'no', 'she', 't', 'them', "hadn't", 'and', 'through', 'yours', "isn't", 'yourselves', 'himself', 'needn', "hasn't", 'is', 'where', 'don', "shan't", 'myself', 'been', 'in', 'll', 'haven', 'has', 'what', 'was', 'had', 'do', 'how', 'but', 'a', 'its', 'by', 'or', 'being', 'ma', 'shouldn', 'am', 'i', 'weren', 'during', 'they', 'the

In [25]:
STEMMER = nltk.porter.PorterStemmer()
LEMMATIZER = nltk.wordnet.WordNetLemmatizer()


def stem_text(text):
    return [STEMMER.stem(word) for word in text]


def lemmatize_text(text):
    return [LEMMATIZER.lemmatize(word) for word in text]


def preprocess_text(
    text,
    mark_negation=False,
    remove_punct=False,
    remove_stopwords=False,
    min_chars=None,
    root=None,
):
    # Strip trailing spaces and remove newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(HTML_RE, "", text)
    # Remove text in square brackets
    text = re.sub(BRACKETS_RE, "", text)
    # Remove dates
    text = re.sub(DATE_RE, "", text)
    # Remove floating numbers
    text = re.sub(FLOAT_RE, "", text)
    # Mark negation
    if mark_negation:
        text = " ".join(mark_negation(text.split(), double_neg_flip=False))
        EN_STOPWORDS.add("_NEG")
    # Remove punctuation
    if remove_punct:
        text = re.sub(PUNCTUATION_RE, "", text)
    # Leave single whitespace
    text = text.split()
    # Remove words with less than `n` chars
    if min_chars is not None and isinstance(min_chars, int):
        text = [word for word in text if len(word) >= min_chars]
    # Remove stopwords
    if remove_stopwords:
        text = [word for word in text if word not in EN_STOPWORDS]
    # Perform stemming/lemmatization
    if root == "stem":
        text = stem_text(text)
    elif root == "lemmatize":
        text = lemmatize_text(text)
    # Return the text as a string
    return " ".join(text)

In [26]:
rnd_text = np.random.choice(df.index, 1)[0]
print(f"Random text with score {df['score'].iloc[rnd_text]}")
print()
print(f"Before pre-processing:")
print(df['text'].iloc[rnd_text])
print()
print("After pre-processing:")
print(preprocess_text(df['text'].iloc[rnd_text]))

Random text with score 3

Before pre-processing:
This picture reminds me of a Keneth More picture from 1957 called The Admirable Crighton" whilst on the boat he was a servant and on the island he became the master and upon being saved reverted back to servant. Madonna did OK in some movies however this one doesn't fire. If there is any picture that show Madonna can't act it is this one.<br /><br />I am not sure whether this was a subtle copy of "The Admirable Crighton" but it sure looks like it and if thats the case then Hollywood must be running out of ideas and that is sad. to provide a platform for actors to improve their career profile and just on that this fails in every corner and detail.<br /><br />The plot is loose and the acting is mediocre. The script should have been put thru the shredder before taking it on location. While many here have canned Madoona for all her acting I think that in films like "A League of Their own" was quite good and enjoyable and "Who's That Girl" sh

## Baseline

In [28]:
text_clf = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression(random_state=RANDOM_SEED, max_iter=1000)),
    ]
)
text_clf = text_clf.fit(train_corpus, train_scores)

In [None]:
y_pred_train, y_pred_test, y_pred_test_class = predict(classifier, x_train, x_test)
print_evaluation(y_test, y_pred_test, y_pred_test_class)

## Classification with logistic regression

In [18]:
text_clf = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", LogisticRegression(random_state=RANDOM_SEED, max_iter=1000)),
    ]
)

parameters = {
    "vect__ngram_range": [(1, 1), (1, 2)],
    "tfidf__use_idf": (True, False),
}
classifier = GridSearchCV(
    text_clf, parameters, scoring=make_scorer(r2_score), cv=5, n_jobs=-1
)
classifier = classifier.fit(x_train, y_train)
print(f"Best parameters: {classifier.best_params_}")



KeyboardInterrupt: 

In [None]:
y_pred_train, y_pred_test, y_pred_test_class = predict(classifier, x_train, x_test)
print_evaluation(y_test, y_pred_test, y_pred_test_class)

## Classification with another classifier

In [None]:
text_clf = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        (
            "clf",
            SGDClassifier(
                random_state=RANDOM_SEED,
                loss="hinge",
                penalty="l2",
                alpha=1e-3,
                max_iter=500,
                tol=None,
            ),
        ),
    ]
)

parameters = {
    "vect__ngram_range": [(1, 1), (1, 2)],
    "tfidf__use_idf": (True, False),
    "clf__alpha": (1e-2, 1e-3),
}
classifier = GridSearchCV(
    text_clf, parameters, scoring=make_scorer(r2_score), cv=5, n_jobs=-1
)
classifier = classifier.fit(x_train, y_train)
print(f"Best parameters: {classifier.best_params_}")

In [None]:
y_pred_train, y_pred_test, y_pred_test_class = predict(classifier, x_train, x_test)
print_evaluation(y_test, y_pred_test, y_pred_test_class)