In [1]:
import pandas as pd
import re
import torch
from torchtext import datasets
from typing import Iterable, List, Tuple

In [2]:
# Load the IMDB dataset through torchtext; the call's result is unpacked
# into two objects, named here as the train and test iterators.
(train_set_it, test_set_it) = datasets.IMDB()

In [3]:
def translate_sentiment(sentiment: str) -> int:
    """Convert a sentiment label into a binary class id.

    Args:
        sentiment: Label text; compared case-insensitively against 'pos'.

    Returns:
        1 when the label is 'pos' (in any letter case), otherwise 0.
    """
    is_positive = sentiment.lower() == 'pos'
    return int(is_positive)

def translate_set_tuple_to_df_tuple(set_tuple: Tuple[str, str]) -> Tuple[int, str]:
    """Turn one (label, review) pair into a (class id, lower-cased review) pair.

    Args:
        set_tuple: Two-element tuple holding the sentiment label followed by
            the review text.

    Returns:
        The label mapped through ``translate_sentiment`` and the review text
        converted to lower case.
    """
    label, text = set_tuple
    class_id = translate_sentiment(label)
    return (class_id, text.lower())

def translate_set_to_list(set_iterator: Iterable[Tuple[str, str]]) -> List[Tuple[int, str]]:
    """Consume an iterable of (label, review) pairs into a list of converted pairs.

    Each element is passed through ``translate_set_tuple_to_df_tuple``; the
    order of the input is preserved.
    """
    return list(map(translate_set_tuple_to_df_tuple, set_iterator))

def translate_to_df(set_iterator: Iterable[Tuple[str, str]]) -> pd.DataFrame:
    """Build a DataFrame with 'sentiment' and 'review' columns from a dataset iterable.

    Args:
        set_iterator: Iterable of (label, review) pairs; it is fully consumed.

    Returns:
        A DataFrame with one row per pair, as produced by
        ``translate_set_to_list``.
    """
    rows = translate_set_to_list(set_iterator)
    column_names = ['sentiment', 'review']
    return pd.DataFrame(data=rows, columns=column_names)

In [4]:
# Materialize the training iterator into a DataFrame, then remove the
# iterator's name from the namespace.
# NOTE: because of the `del`, re-running this cell on its own raises
# NameError until the dataset-loading cell has been run again.
df_train = translate_to_df(train_set_it)
del train_set_it

In [5]:
# Materialize the test iterator into a DataFrame, then remove the
# iterator's name from the namespace.
# NOTE: because of the `del`, re-running this cell on its own raises
# NameError until the dataset-loading cell has been run again.
df_test = translate_to_df(test_set_it)
del test_set_it

In [6]:
# Number of rows in the training frame and in the test frame.
len(df_train), len(df_test)

(25000, 25000)

In [7]:
# Share of each sentiment label in the training frame
# (normalize=True reports fractions instead of counts).
df_train['sentiment'].value_counts(normalize=True)

0    0.5
1    0.5
Name: sentiment, dtype: float64

In [8]:
# Share of each sentiment label in the test frame
# (normalize=True reports fractions instead of counts).
df_test['sentiment'].value_counts(normalize=True)

0    0.5
1    0.5
Name: sentiment, dtype: float64

In [9]:
def trim_str(value: str) -> str:
    """Strip surrounding whitespace from a string, passing falsy values through.

    Args:
        value: Text to trim; may be None or empty.

    Returns:
        ``value.strip()`` for a non-empty string; a falsy ``value``
        (None or '') is returned unchanged.
    """
    if not value:
        return value
    return value.strip()

def is_str_null_or_empty(value: str) -> bool:
    """Tell whether a value is falsy (None or '') or contains only whitespace.

    Args:
        value: Text to inspect; may be None.

    Returns:
        True when nothing remains after trimming, False otherwise.
    """
    # A falsy value is already "empty"; only real text needs trimming.
    if not value:
        return True
    return not value.strip()

def is_str_series_complete(series: Iterable[str]) -> bool:
    """Check that every string in an iterable has non-whitespace content.

    Args:
        series: Iterable of strings (None entries are allowed).

    Returns:
        True when no element is None, empty, or whitespace-only. Evaluation
        stops at the first offending element.
    """
    return all(not is_str_null_or_empty(item) for item in series)

In [10]:
# True when no training review is None, empty, or whitespace-only.
is_str_series_complete(df_train['review'])

True

In [11]:
# True when no test review is None, empty, or whitespace-only.
is_str_series_complete(df_test['review'])

True

In [12]:
def minmax_length_in_str_series(series: pd.Series) -> Tuple[int, int]:
    """Return the smallest and largest element length found in a Series.

    Args:
        series: Series whose elements support ``len()`` (strings here).

    Returns:
        A ``(shortest, longest)`` pair of lengths.
    """
    lengths = series.apply(len)
    shortest = lengths.min()
    longest = lengths.max()
    return (shortest, longest)

In [13]:
# (shortest, longest) review length in the training frame, measured with len().
minmax_length_in_str_series(df_train['review'])

(52, 13704)

In [14]:
# (shortest, longest) review length in the test frame, measured with len().
minmax_length_in_str_series(df_test['review'])

(32, 12988)

In [15]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,sentiment,review
0,0,i rented i am curious-yellow from my video sto...
1,0,"""i am curious: yellow"" is a risible and preten..."
2,0,if only to avoid making this type of film in t...
3,0,this film was probably inspired by godard's ma...
4,0,"oh, brother...after hearing about this ridicul..."


In [16]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,sentiment,review
0,0,i love sci-fi and am willing to put up with a ...
1,0,"worth the entertainment value of a rental, esp..."
2,0,its a totally average film with a few semi-alr...
3,0,star rating: ***** saturday night **** friday ...
4,0,"first off let me say, if you haven't enjoyed a..."


In [17]:
def print_first_five_reviews_from_each_df(dfs_to_iterate) -> None:
    """Print the first five entries of the 'review' column of every frame.

    Each review is written on its own tab-indented line and followed by a
    blank line. Frames are processed in the order given.

    Args:
        dfs_to_iterate: Iterable of objects indexable by 'review' whose
            result can be sliced with ``[:5]`` (DataFrames here).
    """
    for frame in dfs_to_iterate:
        leading_reviews = frame['review'][:5]
        for review in leading_reviews:
            print(f'\t{review}\n')

In [18]:
# Print the first five reviews of each frame in full.
print_first_five_reviews_from_each_df([df_train, df_test])

	i rented i am curious-yellow from my video store because of all the controversy that surrounded it when it was first released in 1967. i also heard that at first it was seized by u.s. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" i really had to see this for myself.<br /><br />the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life. in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />what kills me about i am curious-yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, ev

In [19]:
def remove_html_line_breaks_from_str(value: str) -> str:
    """Replace every HTML line-break tag in a string with a single space.

    All common spellings of the tag are recognized, in any letter case and
    with optional inner whitespace: ``<br>``, ``<br/>``, ``<br />`` and
    ``</br>``. Other tags (for example ``<b>``) are left untouched.

    Args:
        value: Text that may contain HTML line-break tags.

    Returns:
        The text with each line-break tag replaced by one space. Adjacent
        tags therefore produce adjacent spaces.
    """
    # Both slashes are optional so that the plain `<br>` form is matched too.
    html_line_break_pattern = r'<\s*/?\s*br\s*/?\s*>'
    return re.sub(html_line_break_pattern, ' ', value, flags=re.IGNORECASE)

In [20]:
# Replace HTML line-break tags with spaces in the 'review' column of both frames.
for df in (df_train, df_test):
    df['review'] = df['review'].map(remove_html_line_breaks_from_str)

In [21]:
# Print the first five reviews of each frame again to inspect the result
# of the HTML line-break removal.
print_first_five_reviews_from_each_df([df_train, df_test])

	i rented i am curious-yellow from my video store because of all the controversy that surrounded it when it was first released in 1967. i also heard that at first it was seized by u.s. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" i really had to see this for myself.  the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life. in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.  what kills me about i am curious-yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not sho

In [24]:
def split_sentences(value: str) -> List[str]:
    """Split text into trimmed, non-empty sentence fragments.

    The characters ``.!?:;`` end the fragment being built. An opening
    bracket (``(``, ``[`` or ``{``) suspends the fragment being built and
    starts a fresh one for the bracketed text; a closing bracket emits the
    bracketed fragment and resumes the most recently suspended fragment
    (or starts from empty when nothing is suspended). The delimiter
    characters themselves never appear in the output.

    Fragments are returned in the order in which they are completed, so
    bracketed text comes before the sentence that contains it. Fragments
    still suspended when the text ends (unclosed brackets) are emitted
    last, outermost first. Every fragment is stripped of surrounding
    whitespace and dropped if nothing remains.

    Args:
        value: Text to split.

    Returns:
        The list of sentence fragments.
    """
    openers = '([{'
    closers = ')]}'
    # One character class covering every delimiter: terminators, openers, closers.
    delimiter_pattern = r'[.!?:;(\[{)\]}]'

    completed: List[str] = []
    suspended: List[str] = []  # stack of fragments interrupted by an opening bracket
    buffer = ''
    cursor = 0  # index of the first character not yet copied into `buffer`

    for hit in re.finditer(delimiter_pattern, value):
        # Copy the plain text between the previous delimiter and this one.
        buffer += value[cursor:hit.start()]
        cursor = hit.end()
        mark = hit.group()

        if mark in openers:
            suspended.append(buffer)
            buffer = ''
            continue

        # Closing bracket or sentence terminator: emit what has been built.
        text = buffer.strip()
        if text:
            completed.append(text)

        if mark in closers and suspended:
            buffer = suspended.pop()
        else:
            buffer = ''

    # Trailing text first, then whatever is still waiting for a closing bracket.
    leftovers = [buffer + value[cursor:], *suspended]
    completed.extend(text for text in map(str.strip, leftovers) if text)
    return completed

In [25]:
# Turn every review string into a list of sentences, in both frames.
# NOTE: this expects the 'review' column to still hold strings, so the
# cell is not meant to be run twice on the same frames.
for df in (df_train, df_test):
    df['review'] = df['review'].map(split_sentences)

In [26]:
# Print the first five reviews of each frame again; after the sentence
# split each review is a list of sentence strings.
print_first_five_reviews_from_each_df([df_train, df_test])

	['i rented i am curious-yellow from my video store because of all the controversy that surrounded it when it was first released in 1967', 'i also heard that at first it was seized by u', 's', 'customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" i really had to see this for myself', 'the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life', 'in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states', 'in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men', 'what kills me about i am curious-yellow is that 40 years ago, this was considered pornographic', "really, the sex and nudity scenes are few and far between, even 