In [1]:
import os
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm
import json

In [2]:
# Locations of the tab-separated train/test files (relative paths).
TRAIN_PATH = "../dataset/train.tsv"
TEST_PATH = "../dataset/test.tsv"

In [3]:
# Read the training split; the file is tab-delimited.
df_train = pd.read_csv(TRAIN_PATH, sep="\t")

In [4]:
# Preview the first rows of the training frame to confirm the columns loaded as expected.
df_train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
# Read the test split; the file is tab-delimited.
df_test = pd.read_csv(TEST_PATH, sep="\t")

In [6]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [7]:
# Raw training phrases as a NumPy array.
X_train = df_train["Phrase"].values

In [8]:
# Sentiment labels aligned row-for-row with X_train.
Y_train = df_train["Sentiment"].values

In [9]:
# Display the training phrases (NumPy abbreviates long arrays in the repr).
X_train

array(['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .',
       'A series of escapades demonstrating the adage that what is good for the goose',
       'A series', ..., 'avuncular chortles', 'avuncular', 'chortles'],
      dtype=object)

In [10]:
# Display the training labels.
Y_train

array([1, 2, 2, ..., 3, 2, 2], dtype=int64)

In [11]:
# Raw test phrases as a NumPy array.
X_test = df_test["Phrase"].values

In [12]:
# Display the test phrases.
X_test

array(['An intermittently pleasing but mostly routine effort .',
       'An intermittently pleasing but mostly routine effort', 'An', ...,
       'A long-winded ,', 'A long-winded', 'predictable scenario'],
      dtype=object)

In [13]:
# Stack train phrases followed by test phrases into one array, so the
# vocabulary built from X covers both splits.
X = np.concatenate([X_train, X_test], axis=0)

In [14]:
# Display the combined train + test phrases.
X

array(['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .',
       'A series of escapades demonstrating the adage that what is good for the goose',
       'A series', ..., 'A long-winded ,', 'A long-winded',
       'predictable scenario'], dtype=object)

In [15]:
# Sanity check: X was built by concatenating X_train and X_test, so its
# length should equal the sum of the other two.
X.shape, X_train.shape, X_test.shape

((222352,), (156060,), (66292,))

In [16]:
# Load the English spaCy pipeline used below for tokenisation and lemmatisation.
#
# The bare "en" name is a spaCy 2.x shortcut link; spaCy 3 removed shortcut
# links and fails to load it. Try the full package name first and fall back to
# the legacy shortcut so the cell runs on both old and new installs.
# NOTE(review): assumes the original "en" link pointed at en_core_web_sm — confirm,
# since a different model could produce slightly different lemmas.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    nlp = spacy.load("en")

In [17]:
# Preview the preprocessing on the first phrase: lowercase it, run it through
# spaCy, and keep each token's lemma unless that lemma is in STOP_WORDS.
# Punctuation tokens are not filtered out by this check.
[word.lemma_ for word in nlp(X[0].lower()) if word.lemma_ not in STOP_WORDS]

['series',
 'escapade',
 'demonstrate',
 'adage',
 'good',
 'goose',
 'good',
 'gander',
 ',',
 'occasionally',
 'amuse',
 'story',
 '.']

In [19]:
# Build the lemma vocabulary over every phrase in X (train + test).
#
# For each phrase: lowercase it, run it through spaCy, and keep the lemma of
# every token whose lemma is not in STOP_WORDS. Two results are produced:
#   maxlen         - length of the longest filtered lemma list seen
#   word_freq_dict - lemma -> number of occurrences, in first-seen order
#                    (plain dict, so insertion order is preserved)
maxlen = 0
word_freq_dict = {}

# nlp.pipe streams the texts through the pipeline in batches and yields the
# Doc objects in input order, which avoids the per-call overhead of invoking
# nlp() once for each phrase.
# NOTE(review): only lemmas are used here, so passing
# disable=["parser", "ner"] to nlp.pipe would likely cut the runtime much
# further — confirm lemmas are unchanged for the installed model before enabling.
lowercased = (sentence.lower() for sentence in X)

# The input is a generator, so tqdm needs an explicit total to show progress.
runner = tqdm(nlp.pipe(lowercased), total = len(X))
for tokenized in runner:
    word_list = [word.lemma_ for word in tokenized if word.lemma_ not in STOP_WORDS]
    maxlen = max(maxlen, len(word_list))
    for word in word_list:
        word_freq_dict[word] = word_freq_dict.get(word, 0) + 1

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 222352/222352 [21:41<00:00, 170.78it/s]


In [20]:
# Display the lemma -> count table.
# NOTE(review): this prints the entire dict; consider showing only a slice.
word_freq_dict

{'series': 517,
 'escapade': 16,
 'demonstrate': 138,
 'adage': 17,
 'good': 4676,
 'goose': 43,
 'gander': 9,
 ',': 58124,
 'occasionally': 282,
 'amuse': 159,
 'story': 3882,
 '.': 25972,
 'quiet': 198,
 'introspective': 23,
 'entertaining': 563,
 'independent': 74,
 'worth': 658,
 'seek': 216,
 'entertain': 402,
 'fan': 723,
 'ismail': 6,
 'merchant': 37,
 'work': 2615,
 'suspect': 89,
 'hard': 1017,
 'time': 3623,
 'sit': 433,
 'positively': 62,
 'thrill': 295,
 'combination': 108,
 'ethnography': 7,
 'intrigue': 209,
 'betrayal': 51,
 'deceit': 14,
 'murder': 197,
 'shakespearean': 28,
 'tragedy': 393,
 'juicy': 46,
 'soap': 279,
 'opera': 389,
 'aggressive': 30,
 'self': 1428,
 '-': 32027,
 'glorification': 16,
 'manipulative': 152,
 'whitewash': 6,
 'comedy': 2877,
 'drama': 1276,
 'nearly': 554,
 'epic': 303,
 'proportion': 46,
 'root': 246,
 'sincere': 182,
 'performance': 2184,
 'title': 599,
 'character': 4090,
 'undergo': 16,
 'midlife': 23,
 'crisis': 99,
 'undergoing': 1,

In [21]:
# Longest phrase length, counted in lemmas kept after stop-word filtering.
maxlen

35

In [22]:
# Punctuation is not removed by the stop-word filter; check how often '.' occurs.
word_freq_dict['.']

25972

In [23]:
# Occurrence count of '!' in the vocabulary.
word_freq_dict['!']

312

In [24]:
# Occurrence count of ',' in the vocabulary.
word_freq_dict[',']

58124

In [25]:
# Vocabulary size: number of distinct lemmas collected.
len(word_freq_dict)

15741

In [26]:
# Write the lemma -> count table to disk as JSON.
# The context manager closes the file even if json.dump raises, which the
# explicit open()/close() pair did not guarantee.
with open("../dataset/word_freq_dict.json", "w") as f:
    json.dump(word_freq_dict, f)

In [27]:
# Assign each lemma a 0-based integer id, in the order the lemmas appear in
# word_freq_dict.
word_to_index = {
    lemma: position for position, lemma in enumerate(word_freq_dict)
}

In [28]:
# Display the lemma -> index mapping.
# NOTE(review): this prints the entire dict; consider showing only a slice.
word_to_index

{'series': 0,
 'escapade': 1,
 'demonstrate': 2,
 'adage': 3,
 'good': 4,
 'goose': 5,
 'gander': 6,
 ',': 7,
 'occasionally': 8,
 'amuse': 9,
 'story': 10,
 '.': 11,
 'quiet': 12,
 'introspective': 13,
 'entertaining': 14,
 'independent': 15,
 'worth': 16,
 'seek': 17,
 'entertain': 18,
 'fan': 19,
 'ismail': 20,
 'merchant': 21,
 'work': 22,
 'suspect': 23,
 'hard': 24,
 'time': 25,
 'sit': 26,
 'positively': 27,
 'thrill': 28,
 'combination': 29,
 'ethnography': 30,
 'intrigue': 31,
 'betrayal': 32,
 'deceit': 33,
 'murder': 34,
 'shakespearean': 35,
 'tragedy': 36,
 'juicy': 37,
 'soap': 38,
 'opera': 39,
 'aggressive': 40,
 'self': 41,
 '-': 42,
 'glorification': 43,
 'manipulative': 44,
 'whitewash': 45,
 'comedy': 46,
 'drama': 47,
 'nearly': 48,
 'epic': 49,
 'proportion': 50,
 'root': 51,
 'sincere': 52,
 'performance': 53,
 'title': 54,
 'character': 55,
 'undergo': 56,
 'midlife': 57,
 'crisis': 58,
 'undergoing': 59,
 'narratively': 60,
 'trouble': 61,
 'day': 62,
 'plod': 

In [29]:
# Write the lemma -> index mapping (built from train and test phrases) to disk as JSON.
# The context manager closes the file even if json.dump raises, which the
# explicit open()/close() pair did not guarantee.
with open("../dataset/word_to_index_train_and_test.json", "w") as f:
    json.dump(word_to_index, f)