In [1]:
import pandas as pd
import torch
from torchtext import datasets
from typing import Iterable, List, Tuple

In [2]:
# Load the IMDB dataset; the returned pair is unpacked into the train and
# test parts, in that order.
train_set_it, test_set_it = datasets.IMDB()

In [3]:
def translate_sentiment(sentiment: str) -> int:
    """Convert a textual sentiment label into a binary class id.

    Args:
        sentiment: Label text; compared case-insensitively against 'pos'.

    Returns:
        1 if the label is 'pos' (in any letter case), otherwise 0.
    """
    if sentiment.lower() == 'pos':
        return 1
    return 0

def translate_set_tuple_to_df_tuple(set_tuple: Tuple[str, str]) -> Tuple[int, str]:
    """Turn one (label, review) record into a (class id, review) pair.

    The label is converted with ``translate_sentiment``; the review text is
    passed through unchanged.

    Args:
        set_tuple: Two-element record of (sentiment label, review text).

    Returns:
        A tuple of (integer class id, review text).
    """
    label, text = set_tuple
    return translate_sentiment(label), text

def translate_set_to_list(set_iterator: Iterable[Tuple[str, str]]) -> List[Tuple[int, str]]:
    """Consume a dataset iterator and collect the converted records in a list.

    Args:
        set_iterator: Iterable of (sentiment label, review text) records.

    Returns:
        A list with one (class id, review text) tuple per input record, in
        iteration order.
    """
    rows = []
    for record in set_iterator:
        rows.append(translate_set_tuple_to_df_tuple(record))
    return rows

def translate_to_df(set_iterator: Iterable[Tuple[str, str]]) -> pd.DataFrame:
    """Build a DataFrame from a dataset iterator.

    Args:
        set_iterator: Iterable of (sentiment label, review text) records.

    Returns:
        A DataFrame with the columns 'sentiment' (integer class id) and
        'review' (review text), one row per record.
    """
    rows = translate_set_to_list(set_iterator)
    return pd.DataFrame(data=rows, columns=['sentiment', 'review'])

In [4]:
# Materialise the training iterator into a DataFrame. The iterator name is
# deleted afterwards (presumably because it is no longer needed), so
# re-running this cell on its own raises NameError until the load cell is
# run again.
df_train = translate_to_df(train_set_it)
del train_set_it

In [5]:
# Materialise the test iterator into a DataFrame. The iterator name is
# deleted afterwards (presumably because it is no longer needed), so
# re-running this cell on its own raises NameError until the load cell is
# run again.
df_test = translate_to_df(test_set_it)
del test_set_it

In [6]:
# Number of rows in the training and test frames, in that order.
len(df_train), len(df_test)

(25000, 25000)

In [9]:
# Relative frequency of each sentiment class in the training frame.
df_train['sentiment'].value_counts(normalize=True)

0    0.5
1    0.5
Name: sentiment, dtype: float64

In [10]:
# Relative frequency of each sentiment class in the test frame.
df_test['sentiment'].value_counts(normalize=True)

0    0.5
1    0.5
Name: sentiment, dtype: float64

In [11]:
def trim_str(value: str) -> str:
    """Strip leading and trailing whitespace from ``value``.

    Falsy inputs (for example None or the empty string) are returned
    unchanged instead of being stripped.

    Args:
        value: The text to trim.

    Returns:
        The stripped text, or ``value`` itself when it is falsy.
    """
    if not value:
        return value
    return value.strip()

def is_str_null_or_empty(value: str) -> bool:
    """Report whether ``value`` is missing, empty, or whitespace-only.

    Args:
        value: A string, or a missing-value marker (None, float NaN, pd.NA)
            such as those found in pandas object columns.

    Returns:
        True if ``value`` is a missing-value marker or consists solely of
        whitespace (including the empty string); False otherwise.
    """
    # Missing entries in a pandas column are usually NaN or pd.NA rather than
    # None. NaN is truthy and has no .strip(), and pd.NA cannot be coerced to
    # bool, so they must be caught before the string-based check runs.
    if pd.isna(value):
        return True
    return not trim_str(value)

def is_str_series_complete(series: Iterable[str]) -> bool:
    """Check that no element of ``series`` is null, empty or whitespace-only.

    Iteration stops at the first offending element.

    Args:
        series: Iterable of strings to inspect.

    Returns:
        True when every element passes the check (also for an empty
        iterable); False as soon as one element is null or blank.
    """
    for item in series:
        if is_str_null_or_empty(item):
            return False
    return True

In [12]:
# True when no review in the training frame is None, empty or whitespace-only.
is_str_series_complete(df_train['review'])

True

In [13]:
# True when no review in the test frame is None, empty or whitespace-only.
is_str_series_complete(df_test['review'])

True

In [14]:
def minmax_length_in_str_series(series: pd.Series) -> Tuple[int, int]:
    """Return the shortest and longest element length found in ``series``.

    Lengths are measured with ``len`` applied to every element.

    Args:
        series: Series whose elements support ``len`` (e.g. strings).

    Returns:
        A ``(min_length, max_length)`` tuple.
    """
    lengths = series.apply(len)
    extremes = lengths.agg(['min', 'max'])
    shortest, longest = extremes['min'], extremes['max']
    return (shortest, longest)

In [15]:
# Shortest and longest review in the training frame, measured with len().
minmax_length_in_str_series(df_train['review'])

(52, 13704)

In [16]:
# Shortest and longest review in the test frame, measured with len().
minmax_length_in_str_series(df_test['review'])

(32, 12988)

In [17]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,sentiment,review
0,0,I rented I AM CURIOUS-YELLOW from my video sto...
1,0,"""I Am Curious: Yellow"" is a risible and preten..."
2,0,If only to avoid making this type of film in t...
3,0,This film was probably inspired by Godard's Ma...
4,0,"Oh, brother...after hearing about this ridicul..."


In [18]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,sentiment,review
0,0,I love sci-fi and am willing to put up with a ...
1,0,"Worth the entertainment value of a rental, esp..."
2,0,its a totally average film with a few semi-alr...
3,0,STAR RATING: ***** Saturday Night **** Friday ...
4,0,"First off let me say, If you haven't enjoyed a..."


In [21]:
# Print the leading reviews of the training frame followed by those of the
# test frame, numbered consecutively starting at 01.
# NOTE: .loc[:10] is a label-based slice that includes label 10, so with the
# default integer index each frame contributes 11 reviews, not 10.
first_reviews = (
    text
    for frame in (df_train, df_test)
    for text in frame.loc[:10, 'review']
)
for (i, review) in enumerate(first_reviews):
    print(f'{i+1:02}:\n\t{review}\n')

01:
	I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between