In [5]:
%run -i "../util/util_simple_classifier.ipynb"

In [6]:
from langdetect import detect
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from string import punctuation

In [7]:
(train_df, test_df) = load_train_test_dataset_pd("train", "test")
print(train_df)
print(test_df)

                                                   text  label
0     the rock is destined to be the 21st century's ...      1
1     the gorgeously elaborate continuation of " the...      1
2                        effective but too-tepid biopic      1
3     if you sometimes like to go to the movies to h...      1
4     emerges as something rare , an issue movie tha...      1
...                                                 ...    ...
8525  any enjoyment will be hinge from a personal th...      0
8526  if legendary shlockmeister ed wood had ever ma...      0
8527  hardly a nuanced portrait of a young woman's b...      0
8528    interminably bleak , to say nothing of boring .      0
8529  things really get weird , though not particula...      0

[8530 rows x 2 columns]
                                                   text  label
0     lovingly photographed in the manner of a golde...      1
1                 consistently clever and suspenseful .      1
2     it's like a " big chill 

In [8]:
# Filter out non-English text
train_df["lang"] = train_df["text"].apply(detect)
train_df = train_df[train_df['lang'] == 'en']
print(train_df)

                                                   text  label lang
0     the rock is destined to be the 21st century's ...      1   en
1     the gorgeously elaborate continuation of " the...      1   en
2                        effective but too-tepid biopic      1   en
3     if you sometimes like to go to the movies to h...      1   en
4     emerges as something rare , an issue movie tha...      1   en
...                                                 ...    ...  ...
8525  any enjoyment will be hinge from a personal th...      0   en
8526  if legendary shlockmeister ed wood had ever ma...      0   en
8527  hardly a nuanced portrait of a young woman's b...      0   en
8528    interminably bleak , to say nothing of boring .      0   en
8529  things really get weird , though not particula...      0   en

[8355 rows x 3 columns]


In [9]:
test_df["lang"] = test_df["text"].apply(detect)
test_df = test_df[test_df['lang'] == 'en']
print(test_df)

                                                   text  label lang
0     lovingly photographed in the manner of a golde...      1   en
1                 consistently clever and suspenseful .      1   en
2     it's like a " big chill " reunion of the baade...      1   en
3     the story gives ample opportunity for large-sc...      1   en
4                     red dragon " never cuts corners .      1   en
...                                                 ...    ...  ...
1061  a terrible movie that some people will neverth...      0   en
1062  there are many definitions of 'time waster' bu...      0   en
1063  as it stands , crocodile hunter has the hurrie...      0   en
1064  the thing looks like a made-for-home-video qui...      0   en
1065  enigma is well-made , but it's just too dry an...      0   en

[1049 rows x 3 columns]


In [10]:
# Split into words
train_df["text"] = train_df["text"].apply(word_tokenize)
print(train_df)
test_df["text"] = test_df["text"].apply(word_tokenize)
print(test_df)

                                                   text  label lang
0     [the, rock, is, destined, to, be, the, 21st, c...      1   en
1     [the, gorgeously, elaborate, continuation, of,...      1   en
2                   [effective, but, too-tepid, biopic]      1   en
3     [if, you, sometimes, like, to, go, to, the, mo...      1   en
4     [emerges, as, something, rare, ,, an, issue, m...      1   en
...                                                 ...    ...  ...
8525  [any, enjoyment, will, be, hinge, from, a, per...      0   en
8526  [if, legendary, shlockmeister, ed, wood, had, ...      0   en
8527  [hardly, a, nuanced, portrait, of, a, young, w...      0   en
8528  [interminably, bleak, ,, to, say, nothing, of,...      0   en
8529  [things, really, get, weird, ,, though, not, p...      0   en

[8355 rows x 3 columns]
                                                   text  label lang
0     [lovingly, photographed, in, the, manner, of, ...      1   en
1           [consistent

In [11]:
# Remove stopwords and punctuation
stop_words = list(stopwords.words('english'))
stop_words.append("``")
stop_words.append("'s")
def remove_stopwords_and_punct(x):
    new_list = [w for w in x if w not in stop_words and w not in punctuation]
    return new_list
train_df["text"] = train_df["text"].apply(remove_stopwords_and_punct)
print(train_df)
test_df["text"] = test_df["text"].apply(remove_stopwords_and_punct)
print(test_df)

                                                   text  label lang
0     [rock, destined, 21st, century, new, conan, go...      1   en
1     [gorgeously, elaborate, continuation, lord, ri...      1   en
2                        [effective, too-tepid, biopic]      1   en
3     [sometimes, like, go, movies, fun, wasabi, goo...      1   en
4     [emerges, something, rare, issue, movie, hones...      1   en
...                                                 ...    ...  ...
8525  [enjoyment, hinge, personal, threshold, watchi...      0   en
8526  [legendary, shlockmeister, ed, wood, ever, mad...      0   en
8527  [hardly, nuanced, portrait, young, woman, brea...      0   en
8528        [interminably, bleak, say, nothing, boring]      0   en
8529  [things, really, get, weird, though, particula...      0   en

[8355 rows x 3 columns]
                                                   text  label lang
0     [lovingly, photographed, manner, golden, book,...      1   en
1                   [co

In [69]:
# Count number of items per class
print(train_df.groupby('label').count())
print(test_df.groupby('label').count())

       text  lang
label            
0      4194  4194
1      4170  4170
       text  lang
label            
0       523   523
1       524   524


In [70]:
# Save to disk
train_df.to_json("../data/rotten_tomatoes_train.json")
test_df.to_json("../data/rotten_tomatoes_test.json")

In [78]:
def get_stats(word_list, num_words=200):
    freq_dist = FreqDist(word_list)
    print(freq_dist.most_common(num_words))
    return freq_dist

In [79]:
# Show most common words
positive_train_words = train_df[train_df["label"] == 1].text.sum()
negative_train_words = train_df[train_df["label"] == 0].text.sum()
positive_fd = get_stats(positive_train_words)
negative_fd = get_stats(negative_train_words)

[('film', 683), ('movie', 429), ("n't", 286), ('one', 280), ('--', 271), ('like', 209), ('story', 194), ('comedy', 160), ('good', 150), ('even', 144), ('funny', 137), ('way', 135), ('time', 127), ('best', 126), ('characters', 125), ('make', 124), ('life', 124), ('much', 122), ('us', 122), ('love', 118), ('performances', 117), ('makes', 116), ('may', 113), ('work', 111), ('director', 110), ('enough', 105), ('look', 103), ('still', 96), ('little', 94), ('well', 93), ('new', 92), ('films', 92), ('movies', 89), ('fun', 89), ('great', 88), ('drama', 87), ('two', 85), ('performance', 82), ('never', 81), ('could', 81), ('world', 77), ('people', 76), ('see', 76), ('cast', 75), ('many', 74), ('also', 73), ('though', 73), ('tale', 71), ('first', 70), ('documentary', 69), ('without', 69), ('entertaining', 68), ('big', 68), ('made', 67), ('heart', 66), ('ever', 65), ('family', 65), ('often', 64), ('would', 64), ('humor', 64), ("'re", 63), ('sense', 63), ('human', 61), ('romantic', 60), ('audience'