In [1]:
import json
import os
import re
from collections import Counter
import pickle

In [2]:
# Load the training split. Each file in `path` is read as JSON and the list of
# utterances is taken from the top-level key named after the intent.
# The three lists are parallel: one entry per utterance.
train_sents, train_tags, train_intents = [], [], []
path = '../00_data/snips/train'
for filename in os.listdir(path):
    # No explicit encoding is passed, so the platform default is used.
    with open(os.path.join(path, filename)) as json_file:
        # The intent name is the second '_'-separated field of the file name.
        intent = filename.split('_')[1]
        try:
            data = json.load(json_file)[intent]
        except UnicodeDecodeError:
            # Skip the file instead of falling through: otherwise `data` would
            # still hold the previous file's utterances (or be undefined on
            # the first iteration) and they would be re-added under this intent.
            print('Skipping undecodable file: {}'.format(filename))
            continue
    for sent in data:
        chunks, chunk_tags = [], []
        for dct in sent['data']:
            chunks.append(dct['text'])
            # Chunks without an 'entity' key get the placeholder tag "NONE".
            chunk_tags.append(dct.get('entity', "NONE"))
        train_sents.append(chunks)
        train_tags.append(chunk_tags)
        train_intents.append(intent)

In [3]:
# Sanity check: sentences, tags and intents are parallel lists, so the three
# lengths should be equal.
len(train_sents), len(train_tags), len(train_intents)

(13784, 13784, 13784)

In [4]:
# Number of training utterances per intent, most frequent first.
Counter(train_intents).most_common()

[('GetWeather', 2000),
 ('PlayMusic', 2000),
 ('BookRestaurant', 1973),
 ('SearchScreeningEvent', 1959),
 ('RateBook', 1956),
 ('SearchCreativeWork', 1954),
 ('AddToPlaylist', 1942)]

In [5]:
# Load the validation split. Only directory entries whose name contains 'json'
# are read; the list of utterances is taken from the top-level key named after
# the intent. The three lists are parallel: one entry per utterance.
val_sents, val_tags, val_intents = [], [], []
path = '../00_data/snips/validate'
for filename in os.listdir(path):
    if 'json' not in filename:
        continue
    # No explicit encoding is passed, so the platform default is used.
    with open(os.path.join(path, filename)) as json_file:
        # The intent is the second '_'-separated field of the file name,
        # with everything from the first '.' onwards removed.
        intent = filename.split('_')[1].split('.')[0]
        print(intent)
        try:
            data = json.load(json_file)[intent]
        except UnicodeDecodeError:
            # Skip the file instead of falling through: otherwise `data` would
            # still hold the previous file's utterances (or be undefined on
            # the first iteration) and they would be re-added under this intent.
            print('Skipping undecodable file: {}'.format(filename))
            continue
    for sent in data:
        chunks, chunk_tags = [], []
        for dct in sent['data']:
            chunks.append(dct['text'])
            # Chunks without an 'entity' key get the placeholder tag "NONE".
            chunk_tags.append(dct.get('entity', "NONE"))
        val_sents.append(chunks)
        val_tags.append(chunk_tags)
        val_intents.append(intent)

SearchCreativeWork
GetWeather
PlayMusic
RateBook
SearchScreeningEvent
BookRestaurant
AddToPlaylist


In [6]:
# Sanity check: sentences, tags and intents are parallel lists, so the three
# lengths should be equal.
len(val_sents), len(val_tags), len(val_intents)

(700, 700, 700)

In [7]:
# Number of validation utterances per intent, most frequent first.
Counter(val_intents).most_common()

[('SearchCreativeWork', 100),
 ('GetWeather', 100),
 ('PlayMusic', 100),
 ('RateBook', 100),
 ('SearchScreeningEvent', 100),
 ('BookRestaurant', 100),
 ('AddToPlaylist', 100)]

In [8]:
# Show the first raw training example: its text chunks, the tag of each chunk,
# and the intent label.
print(train_sents[0])
print(train_tags[0])
print(train_intents[0])

['Add another ', 'song', ' to the ', 'Cita Romántica', ' playlist. ']
['NONE', 'music_item', 'NONE', 'playlist', 'NONE']
AddToPlaylist


In [9]:
# preprocess sentences
def cleanup(sentlist, taglist):
    """Turn chunked sentences into flat, lowercased token lists with aligned tags.

    Each chunk is lowercased, has every '.', ',', '!' and '?' removed, and is
    split on whitespace. Every resulting token is paired with the tag found at
    the chunk's position in `taglist`.

    Args:
        sentlist: list of sentences, each a list of text chunks (str).
        taglist: list parallel to `sentlist`, holding one tag per chunk.

    Returns:
        A `(newsents, newtags)` tuple of lists of lists: the tokens of each
        sentence and, at the same positions, the tag of each token.
    """
    drop_punct = str.maketrans('', '', '.,!?')
    newsents, newtags = [], []
    for sent_no, chunks in enumerate(sentlist):
        tokens, token_tags = [], []
        for chunk_no, chunk in enumerate(chunks):
            words = chunk.lower().translate(drop_punct).split()
            if not words:
                # Nothing left after cleaning; the tag is never looked up.
                continue
            tokens.extend(words)
            token_tags.extend([taglist[sent_no][chunk_no]] * len(words))
        newsents.append(tokens)
        newtags.append(token_tags)
    return newsents, newtags

In [10]:
# Tokenise the training split; each token takes the tag of the chunk it came from.
train_sents_c, train_tags_c = cleanup(train_sents, train_tags)

In [11]:
# Tokenise the validation split; each token takes the tag of the chunk it came from.
val_sents_c, val_tags_c = cleanup(val_sents, val_tags)

In [12]:
# Preview the first ten cleaned training examples: intent, tokens, tags,
# followed by a blank separator line.
for row in range(10):
    for column in (train_intents, train_sents_c, train_tags_c):
        print(column[row])
    print()

AddToPlaylist
['add', 'another', 'song', 'to', 'the', 'cita', 'romántica', 'playlist']
['NONE', 'NONE', 'music_item', 'NONE', 'NONE', 'playlist', 'playlist', 'NONE']

AddToPlaylist
['add', 'clem', 'burke', 'in', 'my', 'playlist', 'pre-party', 'r&b', 'jams']
['NONE', 'artist', 'artist', 'NONE', 'playlist_owner', 'NONE', 'playlist', 'playlist', 'playlist']

AddToPlaylist
['add', 'live', 'from', 'aragon', 'ballroom', 'to', 'trapeo']
['NONE', 'entity_name', 'entity_name', 'entity_name', 'entity_name', 'NONE', 'playlist']

AddToPlaylist
['add', 'unite', 'and', 'win', 'to', 'my', 'night', 'out']
['NONE', 'entity_name', 'entity_name', 'entity_name', 'NONE', 'playlist_owner', 'playlist', 'playlist']

AddToPlaylist
['add', 'track', 'to', 'my', 'digster', 'future', 'hits']
['NONE', 'music_item', 'NONE', 'playlist_owner', 'playlist', 'playlist', 'playlist']

AddToPlaylist
['add', 'the', 'piano', 'bar', 'to', 'my', 'cindy', 'wilson']
['NONE', 'playlist', 'playlist', 'playlist', 'NONE', 'playlist_o

In [13]:
# Preview up to ten cleaned validation examples: intent, tokens, tags,
# followed by a blank separator line.
for row in range(min(10, len(val_intents))):
    for column in (val_intents, val_sents_c, val_tags_c):
        print(column[row])
    print()

SearchCreativeWork
['wish', 'to', 'find', 'the', 'movie', 'the', 'heart', 'beat']
['NONE', 'NONE', 'NONE', 'NONE', 'object_type', 'NONE', 'object_name', 'object_name']

SearchCreativeWork
['please', 'look', 'up', 'the', 'tv', 'show', 'vanity']
['NONE', 'NONE', 'NONE', 'NONE', 'object_type', 'object_type', 'object_name']

SearchCreativeWork
['get', 'me', 'the', "elvis'", 'christmas', 'album', 'tv', 'show']
['NONE', 'NONE', 'NONE', 'object_name', 'object_name', 'object_name', 'object_type', 'object_type']

SearchCreativeWork
['please', 'find', 'me', 'the', 'saga', 'the', 'deep', 'six']
['NONE', 'NONE', 'NONE', 'NONE', 'object_type', 'object_name', 'object_name', 'object_name']

SearchCreativeWork
['wish', 'to', 'see', 'the', 'photograph', 'with', 'the', 'name', 'live:', 'right', 'here']
['NONE', 'NONE', 'NONE', 'NONE', 'object_type', 'NONE', 'NONE', 'NONE', 'object_name', 'object_name', 'object_name']

SearchCreativeWork
['looking', 'for', 'a', 'novel', 'called', 'death', 'march']
['NONE

In [14]:
import numpy as np

# Shape check on the cleaned validation data. The sentence and tag lists are
# ragged (sentences differ in token count), and NumPy >= 1.24 raises ValueError
# when asked to infer an array from ragged nested lists, which is what a bare
# np.shape(list) does. Building object-dtype arrays explicitly avoids that.
tuple(
    np.shape(np.asarray(seq, dtype=object))
    for seq in (val_intents, val_sents_c, val_tags_c)
)

((700,), (700,), (700,))

In [15]:
# Persist the cleaned splits next to the raw data, one pickle file per list.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes.
for name, obj in [
    ('train_sents', train_sents_c),
    ('train_tags', train_tags_c),
    ('train_intents', train_intents),
    ('val_sents', val_sents_c),
    ('val_tags', val_tags_c),
    ('val_intents', val_intents),
]:
    with open('../00_data/snips/' + name + '.pkl', 'wb') as pkl_file:
        pickle.dump(obj, pkl_file)