In [1]:
import os
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import notebook

In [2]:
TRAIN_PATH = '../dataset/train.tsv'

In [3]:
df = pd.read_csv(TRAIN_PATH, sep = '\t')

In [4]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
X = df.Phrase.values

In [6]:
Y = df.Sentiment.values

In [7]:
X

array(['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .',
       'A series of escapades demonstrating the adage that what is good for the goose',
       'A series', ..., 'avuncular chortles', 'avuncular', 'chortles'],
      dtype=object)

In [8]:
Y

array([1, 2, 2, ..., 3, 2, 2], dtype=int64)

In [9]:
nlp = spacy.load("en")

In [10]:
[word.lemma_ for word in nlp(X[0].lower()) if word.lemma_ not in STOP_WORDS]

['series',
 'escapade',
 'demonstrate',
 'adage',
 'good',
 'goose',
 'good',
 'gander',
 ',',
 'occasionally',
 'amuse',
 'story',
 '.']

In [11]:
maxlen = 0
word_freq_dict = {}
runner = notebook.tqdm(X, total = len(X))
for sentence in runner:
    tokenized = nlp(sentence.lower())
    word_list = [str(word.lemma_) for word in tokenized if word.lemma_ not in STOP_WORDS]
    maxlen = max(maxlen, len(word_list))
    for word in word_list:
        if word not in word_freq_dict:
            word_freq_dict[word] = 0
        word_freq_dict[word] += 1    

HBox(children=(FloatProgress(value=0.0, max=156060.0), HTML(value='')))




In [12]:
word_freq_dict

{'series': 363,
 'escapade': 16,
 'demonstrate': 102,
 'adage': 17,
 'good': 3281,
 'goose': 43,
 'gander': 9,
 ',': 42006,
 'occasionally': 184,
 'amuse': 100,
 'story': 2844,
 '.': 18514,
 'quiet': 153,
 'introspective': 23,
 'entertaining': 450,
 'independent': 42,
 'worth': 503,
 'seek': 176,
 'entertain': 271,
 'fan': 523,
 'ismail': 6,
 'merchant': 35,
 'work': 1997,
 'suspect': 70,
 'hard': 765,
 'time': 2480,
 'sit': 320,
 'positively': 38,
 'thrill': 186,
 'combination': 77,
 'ethnography': 7,
 'intrigue': 155,
 'betrayal': 29,
 'deceit': 14,
 'murder': 147,
 'shakespearean': 28,
 'tragedy': 315,
 'juicy': 30,
 'soap': 222,
 'opera': 300,
 'aggressive': 24,
 'self': 1038,
 '-': 23075,
 'glorification': 16,
 'manipulative': 127,
 'whitewash': 6,
 'comedy': 2011,
 'drama': 1000,
 'nearly': 384,
 'epic': 171,
 'proportion': 37,
 'root': 175,
 'sincere': 134,
 'performance': 1474,
 'title': 417,
 'character': 2891,
 'undergo': 16,
 'midlife': 16,
 'crisis': 70,
 'undergoing': 1,
 

In [13]:
maxlen

35

In [14]:
word_freq_dict['.']

18514

In [15]:
word_freq_dict['!']

248

In [16]:
word_freq_dict[',']

42006

In [17]:
len(word_freq_dict)

13553

In [18]:
import json

In [19]:
f = open("../dataset/word_freq_dict.json", "w")
json.dump(word_freq_dict, f)
f.close()

In [20]:
word_to_index = {}
for dex, word in enumerate(word_freq_dict):
    word_to_index[word] = dex

In [21]:
word_to_index

{'series': 0,
 'escapade': 1,
 'demonstrate': 2,
 'adage': 3,
 'good': 4,
 'goose': 5,
 'gander': 6,
 ',': 7,
 'occasionally': 8,
 'amuse': 9,
 'story': 10,
 '.': 11,
 'quiet': 12,
 'introspective': 13,
 'entertaining': 14,
 'independent': 15,
 'worth': 16,
 'seek': 17,
 'entertain': 18,
 'fan': 19,
 'ismail': 20,
 'merchant': 21,
 'work': 22,
 'suspect': 23,
 'hard': 24,
 'time': 25,
 'sit': 26,
 'positively': 27,
 'thrill': 28,
 'combination': 29,
 'ethnography': 30,
 'intrigue': 31,
 'betrayal': 32,
 'deceit': 33,
 'murder': 34,
 'shakespearean': 35,
 'tragedy': 36,
 'juicy': 37,
 'soap': 38,
 'opera': 39,
 'aggressive': 40,
 'self': 41,
 '-': 42,
 'glorification': 43,
 'manipulative': 44,
 'whitewash': 45,
 'comedy': 46,
 'drama': 47,
 'nearly': 48,
 'epic': 49,
 'proportion': 50,
 'root': 51,
 'sincere': 52,
 'performance': 53,
 'title': 54,
 'character': 55,
 'undergo': 56,
 'midlife': 57,
 'crisis': 58,
 'undergoing': 59,
 'narratively': 60,
 'trouble': 61,
 'day': 62,
 'plod': 

In [22]:
f = open("../dataset/word_to_index.json", "w")
json.dump(word_to_index, f)
f.close()