In [128]:
from keras.models import load_model
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import json
import pandas as pd
from utils import get_train_data_from_csv, get_dev_data_from_csv, get_test_data_from_csv, Indexer, get_indexer
from nltk.tokenize import TweetTokenizer

In [129]:
# Load the trained Keras model from its HDF5 checkpoint; the path is relative
# to the notebook's working directory.
model = load_model('models/model_15_ds_50.h5')

In [137]:
# Set to True to also index the test split into X_test / Y_test / Y_test_true.
include_test = False

tknr = TweetTokenizer()
word_indexer = Indexer()
# "UNK" is the first entry added to the fresh word indexer; the inference cell
# falls back to index 0 for unseen tokens (presumably this entry's index --
# TODO confirm against utils.Indexer).
word_indexer.add_and_get_index("UNK")

# The label indexer must exist before any one-hot encoding happens: its length
# is the number of classes. Loading it here keeps the cell runnable on a fresh
# kernel instead of relying on a later cell having been executed first.
indexer = get_indexer('indexer_15_dups.csv')
num_classes = len(indexer)

train_data = get_train_data_from_csv('data/train_15_ds.csv')
dev_data = get_dev_data_from_csv('data/dev_15_ds.csv')
test_data = get_test_data_from_csv('data/test_15_ds.csv')


def _index_examples(examples):
    """Convert examples into word-index vectors and one-hot labels.

    Every token produced by ``tknr`` is registered in ``word_indexer`` (new
    tokens are added), so calling this function grows the vocabulary.

    Args:
        examples: iterable of objects exposing ``.text`` and ``.label``.

    Returns:
        A ``(vectors, onehots, labels)`` tuple of parallel lists: the list of
        word indices per example, the one-hot encoded label per example
        (``num_classes`` wide, float32), and the raw label per example.
    """
    vectors, onehots, labels = [], [], []
    for example in examples:
        tokens = tknr.tokenize(example.text)
        vectors.append([word_indexer.add_and_get_index(tok) for tok in tokens])
        onehots.append(to_categorical(example.label, num_classes, dtype='float32'))
        labels.append(example.label)
    return vectors, onehots, labels


# Raw labels are not needed for the training split, only vectors and one-hots.
X_train, Y_train = _index_examples(train_data)[:2]
print("indexed training data")

X_dev, Y_dev, Y_dev_true = _index_examples(dev_data)
print("indexed dev data")

# The test containers stay empty unless include_test is set, so no stray row
# from another split can end up in X_test / Y_test.
X_test, Y_test, Y_test_true = [], [], []
if include_test:
    X_test, Y_test, Y_test_true = _index_examples(test_data)
    print("indexed test data")

ix = len(X_train)
dix = len(X_dev)
# Pad all splits together so they share one sequence length. pad_sequences
# accepts the ragged list of lists directly; wrapping it in np.array first is
# unnecessary and fails on recent NumPy versions for ragged input.
X = pad_sequences(X_train + X_dev + X_test)

# Slice the padded matrix back into the individual splits.
X_train = np.array(X[:ix])
X_dev = np.array(X[ix:ix+dix])
X_test = np.array(X[ix+dix:])
Y_train = np.array(Y_train)
Y_dev = np.array(Y_dev)
Y_test = np.array(Y_test)

read train data from: data/train_15_ds.csv
read dev data from: data/dev_15_ds.csv
read test data from: data/test_15_ds.csv
indexed training data
indexed dev data


In [138]:
# Load the label indexer from CSV; the prediction-printing cell uses it to turn
# a predicted class index back into its emoji via get_object().
indexer = get_indexer('indexer_15_dups.csv')
# Bare last expression so the notebook displays the loaded indexer.
indexer

read indexer from: indexer_15_dups.csv


['😂', '💯', '😍', '🔥', '💕', '💀', '😩', '😭', '😊', '❤', '🙏', '👀', '🙄', '😘', '🎃']

In [171]:
# Free-text examples to run through the model.
to_predict = [
    "Water bottle",
    "iPhone",
    "I went to watch the Avengers last week",
    "you up?",
]

preds = []
for sentence in to_predict:
    token_ids = []
    for token in tknr.tokenize(sentence):
        token_id = word_indexer.index_of(token)
        # index_of returns -1 for tokens the word indexer has never seen;
        # those fall back to index 0.
        token_ids.append(0 if token_id == -1 else token_id)
    preds.append(np.array(token_ids))

# Sentinel row: one already-padded training sequence is appended so that the
# following pad_sequences call pads the new examples to the training width.
# The next cell strips this row off again.
preds.append(X[0])

In [172]:
# Pad every example to exactly the width of the training matrix X. Passing
# maxlen explicitly also truncates an input that is longer than the training
# sequences (pad_sequences drops leading tokens by default), so the result is
# never wider than the data the model was trained on.
preds = pad_sequences(preds, maxlen=X.shape[1])
# Drop the trailing sentinel row (X[0]) that the previous cell appended.
# NOTE: this cell is not idempotent -- re-running it alone removes one more row.
preds = np.array(preds[:-1])

In [173]:
# Predicted class index per example: argmax over the model's per-class output.
# This is what Sequential.predict_classes computed for multi-class outputs;
# that method is deprecated and was removed from tf.keras in TensorFlow 2.6,
# whereas predict + argmax works on old and new Keras alike.
predictions = np.argmax(model.predict(preds), axis=-1)


In [174]:
# Show each input text next to the emoji label the model predicted for it.
for position, class_id in enumerate(predictions):
    print("text:", to_predict[position])
    # Map the predicted class index back to its label through the indexer.
    emoji = indexer.get_object(class_id)
    print("pred:", emoji)
    print()

text: Water bottle
pred: 👀

text: iPhone
pred: 😍

text: I went to watch the Avengers last week
pred: 💀

text: you up?
pred: 😘

