In [1]:
import nltk
import pandas as pd
import re
import torch
from math import log
from nltk import WordNetLemmatizer
from torch import optim
from torch import nn
from torchtext import datasets
from typing import Dict, Iterable, List, Tuple

# 1) Load Dataset

In [2]:
def sentiment_to_int(sentiment: str) -> int:
    return int(sentiment.lower() == 'pos')

def int_to_sentiment(value: int) -> str:
    return 'pos' if value > 0 else 'neg'

def convert_dataset_tuple(dataset_tuple: Tuple[str, str]) -> Tuple[int, str]:
    (sentiment, review) = dataset_tuple
    return (sentiment_to_int(sentiment), review.lower())

def convert_dataset_to_list(dataset: Iterable[Tuple[str, str]]) -> List[Tuple[int, str]]:
    return list(map(convert_dataset_tuple, dataset))

def convert_dataset_to_dataframe(dataset: Iterable[Tuple[str, str]]) -> pd.DataFrame:
    return pd.DataFrame(data=convert_dataset_to_list(dataset),
                        columns=['sentiment', 'review'])

In [3]:
def load_imdb_dataframes() -> Tuple[pd.DataFrame, pd.DataFrame]:
    (train_set_iter, test_set_iter) = datasets.IMDB()
    return (convert_dataset_to_dataframe(train_set_iter),
            convert_dataset_to_dataframe(test_set_iter))

In [4]:
(df_train, df_test) = load_imdb_dataframes()

# 2) Analyse Dataset

In [5]:
len(df_train), len(df_test)

(25000, 25000)

In [6]:
df_train['sentiment'].value_counts(normalize=True)

0    0.5
1    0.5
Name: sentiment, dtype: float64

In [7]:
df_test['sentiment'].value_counts(normalize=True)

0    0.5
1    0.5
Name: sentiment, dtype: float64

In [8]:
def trim_str(value: str) -> str:
    return value.strip() if value else value

def is_str_null_or_empty(value: str) -> bool:
    return not trim_str(value)

def is_str_series_complete(series: Iterable[str]) -> bool:
    return not any(map(is_str_null_or_empty, series))

In [9]:
is_str_series_complete(df_train['review'])

True

In [10]:
is_str_series_complete(df_test['review'])

True

In [11]:
def minmax_length_in_str_series(series: pd.Series) -> Tuple[int, int]:
    minmax_length = series.apply(len).agg(['min', 'max'])
    return (minmax_length['min'], minmax_length['max'])

In [12]:
minmax_length_in_str_series(df_train['review'])

(52, 13704)

In [13]:
minmax_length_in_str_series(df_test['review'])

(32, 12988)

In [14]:
df_train.head()

Unnamed: 0,sentiment,review
0,0,i rented i am curious-yellow from my video sto...
1,0,"""i am curious: yellow"" is a risible and preten..."
2,0,if only to avoid making this type of film in t...
3,0,this film was probably inspired by godard's ma...
4,0,"oh, brother...after hearing about this ridicul..."


In [15]:
df_test.head()

Unnamed: 0,sentiment,review
0,0,i love sci-fi and am willing to put up with a ...
1,0,"worth the entertainment value of a rental, esp..."
2,0,its a totally average film with a few semi-alr...
3,0,star rating: ***** saturday night **** friday ...
4,0,"first off let me say, if you haven't enjoyed a..."


In [16]:
def print_first_n_reviews(dataframe: pd.DataFrame, n: int = 5) -> None:
    for review in dataframe['review'][:n]:
        print(f'\t{review}\n')

def print_first_n_reviews_from_dataframes(n: int = 5) -> None:
    for df in [df_train, df_test]:
        print_first_n_reviews(df, n)

In [17]:
print_first_n_reviews_from_dataframes()

	i rented i am curious-yellow from my video store because of all the controversy that surrounded it when it was first released in 1967. i also heard that at first it was seized by u.s. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" i really had to see this for myself.<br /><br />the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life. in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />what kills me about i am curious-yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, ev

# 3) Process Dataset

In [18]:
def remove_html_line_breaks(review: str) -> str:
    html_line_break_pattern = r'<\s*(?:/\s*br|br\s*/)\s*>'
    return ' '.join(re.split(html_line_break_pattern, review))

In [19]:
def gen_sentences_from_review(review: str) -> str:
    yield from filter(bool, nltk.sent_tokenize(review))

def gen_words_from_sentence(sentence: str) -> str:
    yield from filter(bool, nltk.word_tokenize(sentence))

In [20]:
def is_alphanumerical_token(token: str) -> bool:
    return any(filter(str.isalnum, token))

def filter_alphanumerical_tokens(token_iter: Iterable[str]) -> str:
    yield from filter(is_alphanumerical_token, token_iter)

In [21]:
en_stopwords = set(nltk.corpus.stopwords.words('english'))
en_stopwords |= {"n't", "'s", "'t", "'ve"}

def is_not_stopword(word: str) -> bool:
    return word not in en_stopwords

def filter_relevant_tokens(word_iter: Iterable[str]) -> str:
    yield from filter(is_not_stopword, word_iter)

In [22]:
lemmer = WordNetLemmatizer()

def gen_lemmatized_tokens(word_iter: Iterable[str]) -> str:
    yield from map(lemmer.lemmatize, word_iter)

In [23]:
def normalize_sentence(sentence: str) -> List[str]:
    tokens = gen_words_from_sentence(sentence)
    alphanum_tokens = filter_alphanumerical_tokens(tokens)
    relevant_tokens = filter_relevant_tokens(alphanum_tokens)
    lemmas = gen_lemmatized_tokens(relevant_tokens)
    return list(lemmas)

In [24]:
def gen_normalized_sentences_from_review(review: str) -> List[str]:
    sentences = gen_sentences_from_review(review)
    normalized_sentences = map(normalize_sentence, sentences)
    yield from filter(bool, normalized_sentences)

In [25]:
def normalize_review(review: str) -> List[List[str]]:
    review = remove_html_line_breaks(review)
    normalized_sentences = gen_normalized_sentences_from_review(review)
    return list(normalized_sentences)

In [26]:
def transform_reviews(dataframe: pd.DataFrame, transformation) -> None:
    dataframe['review'] = dataframe['review'].apply(transformation)

def transform_reviews_in_dataframes(transformation) -> None:
    for dataframe in [df_train, df_test]:
        transform_reviews(dataframe, transformation)

In [27]:
transform_reviews_in_dataframes(normalize_review)

In [28]:
print_first_n_reviews_from_dataframes()

	[['rented', 'curious-yellow', 'video', 'store', 'controversy', 'surrounded', 'first', 'released', '1967.', 'also', 'heard', 'first', 'seized', 'u.s.', 'custom', 'ever', 'tried', 'enter', 'country', 'therefore', 'fan', 'film', 'considered', 'controversial', 'really', 'see'], ['plot', 'centered', 'around', 'young', 'swedish', 'drama', 'student', 'named', 'lena', 'want', 'learn', 'everything', 'life'], ['particular', 'want', 'focus', 'attention', 'making', 'sort', 'documentary', 'average', 'swede', 'thought', 'certain', 'political', 'issue', 'vietnam', 'war', 'race', 'issue', 'united', 'state'], ['asking', 'politician', 'ordinary', 'denizen', 'stockholm', 'opinion', 'politics', 'sex', 'drama', 'teacher', 'classmate', 'married', 'men'], ['kill', 'curious-yellow', '40', 'year', 'ago', 'considered', 'pornographic'], ['really', 'sex', 'nudity', 'scene', 'far', 'even', 'shot', 'like', 'cheaply', 'made', 'porno'], ['countryman', 'mind', 'find', 'shocking', 'reality', 'sex', 'nudity', 'major', 

# 4) Modeling

## 4.1) Vocabulary

In [29]:
class Vocabulary:
    def __init__(self, word_iter: Iterable[str]):
        self.index_table = Vocabulary.make_index_table_(word_iter)
        self.padding_index = self.index_table[None]


    def make_index_table_(word_iter: Iterable[str]) -> Dict[str, int]:
        table = { None: 0 }
        for word in word_iter:
            if word not in table:
                table[word] = len(table)
        return table


    def translate_sentence(self, sentence: Iterable[str]) -> List[int]:
        return torch.tensor(list(map(self.translate_word_, sentence)), dtype=torch.long)


    def translate_word_(self, word: str) -> int:
        return self.index_table[word]


    def __len__(self):
        return len(self.index_table)

In [30]:
imdb_vocab = Vocabulary(word
                        for df in [df_train, df_test]
                            for review in df['review']
                                for sentence in review
                                    for word in sentence)

## 4.2) Architecture

In [31]:
class ReviewClassifier(nn.Module):
    def __init__(self, vocab: Vocabulary, n_of_sentiments: int):
        super(ReviewClassifier, self).__init__()

        vocab_len = len(vocab)

        self.embedding_len = int(vocab_len ** (1 / (1 + log(vocab_len, 10))))
        self.hidden_len = n_of_sentiments ** 2
        self.n_of_sentiments = n_of_sentiments

        self.word_embedder = nn.Embedding(vocab_len, self.embedding_len,
                                          padding_idx=vocab.padding_index)

        self.rnn_layer = nn.LSTM(self.embedding_len, self.hidden_len, batch_first=True)
        self.to_sentiment_layer = nn.Linear(self.hidden_len, self.n_of_sentiments)


    def forward(self, sentences: Iterable[torch.Tensor]) -> torch.Tensor:
        X = nn.utils.rnn.pad_sequence(sentences, batch_first=True)
        X = self.word_embedder(X)

        X = nn.utils.rnn.pack_padded_sequence(X, list(map(len, sentences)),
                                              batch_first=True, enforce_sorted=False)
        X, _ = self.rnn_layer(X)
        X, X_lengths = nn.utils.rnn.pad_packed_sequence(X, batch_first=True)

        flattened_indices = torch.arange(len(X_lengths)) * max(X_lengths) + (X_lengths - 1)
        X = X.view(-1, self.hidden_len)[flattened_indices, :]

        X = self.to_sentiment_layer(X)
        X = torch.tanh(X)

        return nn.functional.log_softmax(X, dim=1)

In [32]:
def sentences_to_model_input(sentences: Iterable[Iterable[str]], vocab: Vocabulary) -> List[torch.Tensor]:
    return list(map(vocab.translate_sentence, sentences))

# 5) Training

In [33]:
imdb_classif = ReviewClassifier(imdb_vocab, len(['neg', 'pos']))

In [35]:
def train_model(model: ReviewClassifier, train_data: pd.DataFrame, *,
                n_epochs: int = 100, batch_size: int = 64) -> None:
    loss_criterion = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.02)

    for _ in range(n_epochs):
        for (_, row) in train_data.iterrows():
            (review, sentiment) = (row['review'], row['sentiment'])

            for i_start in range(0, len(review), batch_size):
                i_end = i_start+batch_size

                model.zero_grad()

                X = sentences_to_model_input(review[i_start:i_end], imdb_vocab)
                Y = torch.full([len(X)], sentiment)

                Y_pred = model(X)

                batch_loss = loss_criterion(Y_pred, Y)
                batch_loss.backward()

                optimizer.step()

In [36]:
train_model(imdb_classif, df_train, n_epochs=5, batch_size=32)

In [None]:
# batch_size = 64
# for (_, row) in df_train.iterrows():
#     (review, sentiment) = (row['review'], row['sentiment'])
#     for i in range(0, len(review), batch_size):
#         X = review[i:i+batch_size]
#         X = sentences_to_model_input(X, imdb_vocab)

#         Y_pred = imdb_classif(X)

# 6) Classifying

In [46]:
def predict_with_model(model: ReviewClassifier, test_data: pd.DataFrame) -> None:
    Y_true = torch.tensor([])
    Y_pred = torch.tensor([])
    with torch.no_grad():
        for (_, row) in test_data.iterrows():
            (review, sentiment) = (row['review'], row['sentiment'])

            X = sentences_to_model_input(review, imdb_vocab)
            Y = torch.full([len(X)], sentiment)

            pred = model(X).argmax(dim=1)

            Y_true = torch.hstack([Y_true, Y])
            Y_pred = torch.hstack([Y_pred, pred])

    return (Y_pred, Y_true)

In [47]:
(Y_pred, Y_true) = predict_with_model(imdb_classif, df_test)

In [48]:
(len(Y_pred), len(Y_true))

(298463, 298463)

In [49]:
(Y_pred == Y_true).sum() / len(Y_true)

tensor(0.4873)