In [1]:
import numpy as np
import pandas as pd
import en_vectors_web_lg
from IPython.display import display_markdown

from shared.data import load_imdb
from machine_learning.plot_helpers import describe_data

In [2]:
# Load the `en_vectors_web_lg` package once; the resulting `nlp` callable is
# reused by later cells to split each review into tokens and read their lemmas.
nlp = en_vectors_web_lg.load()

In [3]:
print("Loading training data...")
X_train, y_train = load_imdb('../data/aclImdb/train')

print("Loading test data...")
X_test, y_test = load_imdb('../data/aclImdb/test')

# Stack the two splits end to end so that row i of `text` lines up with row i
# of `score`.
# The labels are indexed with `[:, 0]`, i.e. they are 2-D array-likes, and for
# arrays `+` means element-wise addition, not appending. Using `+` here would
# yield train+test sums for only half of the rows, so concatenate explicitly.
texts = list(X_train) + list(X_test)
scores = np.concatenate([y_train[:, 0], y_test[:, 0]])
assert len(texts) == len(scores), "every review needs exactly one score"

data = pd.DataFrame({
    'text': pd.Series(texts),
    'score': pd.Series(scores)
})

# The combined frame is the only copy needed from here on; drop the per-split
# objects and the temporaries so they do not linger in the kernel namespace.
del X_train, y_train, X_test, y_test
del texts, scores

Loading training data...
Loading test data...


In [4]:
def generate_words(text_series):
    """Lazily yield the lower-cased lemma of every token in every text.

    Each element of ``text_series`` is passed through the module-level ``nlp``
    callable. Tokens that are falsy, or whose ``lemma_`` is empty, are skipped.

    Args:
        text_series: iterable of raw text strings.

    Yields:
        str: one lower-cased lemma per retained token, in document order.
    """
    for review in text_series:
        parsed = nlp(review)
        for token in parsed:
            if not token:
                continue
            lemma = token.lemma_
            if lemma:
                yield lemma.lower()

# Materialise the lemma stream for the whole corpus as one flat Series so it
# can be counted and filtered in the following cells.
words = pd.Series(generate_words(data['text']))

In [5]:
# Frequency table: one row per distinct lemma.
counts = words.value_counts()

# Report corpus size first, then vocabulary size, each under its own heading.
for heading, figure in (
    ("**Total words**", len(words)),
    ("**Unique words**", len(counts)),
):
    display_markdown(heading, raw=True)
    display(figure)

**Total words**

13377116

**Unique words**

172778

In [6]:
# `counts` comes from value_counts(), so its first 50 rows are the 50 most
# frequent lemmas.
display_markdown("**Let's print the most popular words**", raw=True)
display(counts.head(50))

**Let's print the most popular words**

the      648550
,        542712
be       483885
.        449595
a        445083
and      321829
of       288816
to       267857
have     240375
in       183748
it       151203
that     147998
this     147662
i        136770
"        136634
-        103536
/><br    100972
movie     99224
not       92185
film      91615
with      86640
for       86247
but       81610
on        67151
(         64951
)         63982
much      62011
you       59333
his       57406
do        53902
one       53028
he        52252
see       46666
good      46660
at        46189
make      45321
all       45294
by        44099
like      43806
can       42427
'         41811
they      41558
who       40960
from      40048
!         39889
so        39410
get       35214
or        35183
just      34700
her       34440
dtype: int64

In [22]:
from spacy.lang.en.stop_words import STOP_WORDS

display_markdown("**Ok, we want to remove all stopwords and punctuations**", raw=True)


def _is_content_word(word):
    """Return True for purely alphabetic words that are not in STOP_WORDS."""
    return word.isalpha() and word.lower() not in STOP_WORDS


# Keep only the content words, preserving their original order.
cleaned_words = pd.Series(list(filter(_is_content_word, words)))

display_markdown("**Total cleaned words**", raw=True)
kept_total = len(cleaned_words)
display('{} - {:.2f}% of all'.format(kept_total, kept_total * 1.0 / len(words) * 100))

# Frequency table of the surviving words.
cleaned_counts = cleaned_words.value_counts()
display_markdown("**Unique cleaned words**", raw=True)
display(len(cleaned_counts))

display_markdown("**Let's print the most popular words**", raw=True)
display(cleaned_counts.head(50))

**Ok, we want to remove all stopwords and punctuations**

**Total cleaned words**

'4831723 - 36.12% of all'

**Unique cleaned words**

82333

**Let's print the most popular words**

movie          99232
film           91610
good           46661
like           43807
time           30344
character      27235
watch          26668
story          24432
think          24131
little         22007
scene          20840
great          19599
look           19485
know           18834
end            18035
bad            17950
people         17767
play           16937
love           16826
act            16790
way            16682
come           16170
thing          15825
find           15784
br             15747
conjurer       15209
man            14358
work           13426
plot           13422
actor          13042
want           13026
life           12377
try            12303
feel           12265
year           12129
doe            11114
wrong          11078
old            10121
use             9860
funny           9447
lot             9347
real            9311
interest        9270
director        9235
guy             8858
performance     8789
cast            8425
big          

In [35]:
display_markdown("**Time to remove all words that we saw only once, as they won't help us to teach models**", raw=True)
# Boolean mask over the vocabulary: True where a word occurs at least twice.
seen_repeatedly = cleaned_counts.gt(1)
final_words = cleaned_counts.loc[seen_repeatedly].index.values

display_markdown("**Out final words statistics looks like this**", raw=True)
# Size of the retained vocabulary and its share of all distinct cleaned words.
vocab_size = len(final_words)
display('{} - {:.2f}% of words'.format(vocab_size, vocab_size * 1.0 / len(cleaned_counts) * 100))

**Time to remove all words that we saw only once, as they won't help us to teach models**

**Out final words statistics looks like this**

'48758 - 59.22% of words'

In [39]:
display_markdown("**Let's clean datasets**", raw=True)

# Put the retained vocabulary into a set so membership can be tested per token.
allowed_words = {word for word in final_words}

def clean(sentence):
    """Reduce a text to its allowed lemmas, joined by single spaces.

    The text is run through the module-level ``nlp`` callable. A token is kept
    only if it is truthy, has a non-empty ``lemma_``, and its lower-cased lemma
    is a member of the module-level ``allowed_words`` collection.

    Args:
        sentence: raw text to clean.

    Returns:
        str: the retained lower-cased lemmas in original order, separated by
        one space; an empty string when nothing is retained.
    """
    kept = []
    for token in nlp(sentence):
        if not (token and token.lemma_):
            continue
        lemma = token.lemma_.lower()
        if lemma in allowed_words:
            kept.append(lemma)
    return ' '.join(kept)

# Store the cleaned version of every review next to the raw text.
data['cleaned'] = data['text'].map(clean)

**Let's clean datasets**

In [40]:
# Preview the cleaned reviews as the cell's output.
data['cleaned']

0        katzir produce wonderful film roller coaster r...
1        want scream like big studio horror product for...
2        premise rate low plausibility unfortunately co...
3        face rent stdvd sequel forget gem expect afore...
4        tobe hooper exercise regard unfortunate young ...
5        maybe pc version game impressive maybe finish ...
6        kroko like leave cinema time life recommend wa...
7        start comment huge nightmare elm street fan th...
8        film time extremely long sequence dialogue bad...
9        invisible ray science fiction horror conjurer ...
10       touch story courage adversity woman find jewis...
11       shipman pay hefty sum money promote maxwell ro...
12       sundance movie certainly miss real art house d...
13       bother try watch terrible mini series hour bea...
14       parent male able identify character heartache ...
15       story grow relationship jeff stewart ronda rom...
16       weekend watch funny film like kid cute remind .