In [11]:
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)

import re
import json
import numpy as np
import pandas as pd
import pickle as pkl

from collections import defaultdict 

from tensorflow import keras
from tensorflow import one_hot

from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import TimeDistributed

from tensorflow.keras.optimizers import Adam

# Number of records taken from the END of the dataset for the 'test' split (see get_data).
TEST = 550
# Number of passes over the training data given to model.fit.
EPOCHS = 35
# Every sentence is padded to this many tokens; also the model's input length.
MAX_LEN = 50
# Number of records taken from the START of the dataset for the 'train' split (see get_data).
TRAIN = 2400
# Mini-batch size given to model.fit.
BATCH_SIZE = 16
# Number of units in the LSTM layer.
HIDDEN_DIM = 100
# Size of each word vector produced by the Embedding layer.
VECTOR_DIM = 100

In [12]:
def _normalize_and_pad(tokens):
    """Mask usernames/links in ``tokens`` and force the result to exactly MAX_LEN items."""
    cleaned = []
    # Truncate first: the model has a fixed input length of MAX_LEN, so longer
    # sentences would otherwise yield ragged rows that cannot be stacked.
    for token in tokens[:MAX_LEN]:
        token = re.sub('@[^ ]+', '<username>', token)
        # NOTE(review): only http:// links are masked; https:// URLs pass through
        # unchanged. Confirm whether that is intended before widening the pattern.
        token = re.sub('http://[^ ]+', '<link>', token)
        cleaned.append(token)

    # Right-pad with empty-string tokens up to MAX_LEN.
    cleaned.extend([""] * (MAX_LEN - len(cleaned)))
    return cleaned


def get_data(path, name):
    """Load one split of the JSON dataset as fixed-length token sequences.

    The file at ``path`` must contain a JSON list of records, each with an
    'input' and an 'output' list of string tokens.

    Parameters
    ----------
    path : str
        Location of the JSON file.
    name : str
        'train' keeps the first TRAIN records, 'test' keeps the last TEST
        records; any other value keeps every record.

    Returns
    -------
    (X, Y) : tuple of list of list of str
        ``X`` holds the processed 'input' sequences and ``Y`` the processed
        'output' sequences. In every sequence, @mentions are replaced by
        '<username>' and http:// links by '<link>', and the sequence is
        truncated or padded with "" to exactly MAX_LEN tokens.

    Raises
    ------
    OSError
        If ``path`` cannot be opened.
    KeyError
        If a record lacks the 'input' or 'output' key.
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(path) as handle:
        data = json.load(handle)

    if name == 'train':
        data = data[:TRAIN]
    elif name == 'test':
        data = data[-TEST:]

    X = []
    Y = []
    for record in data:
        # Each side is processed on its own length, so an 'output' list that is
        # shorter or longer than 'input' is still cleaned and padded correctly.
        X.append(_normalize_and_pad(record['input']))
        Y.append(_normalize_and_pad(record['output']))

    return X, Y

In [13]:
def buildDict(data):
    """Assign a unique positive integer to every distinct token in ``data``.

    ``data`` is an iterable of token sequences. Tokens are numbered 1, 2, 3, ...
    in order of first appearance. The mapping is returned as a
    ``defaultdict(int)``, so looking up an unknown token yields 0.
    """
    vocab = defaultdict(int)

    for sentence in data:
        for token in sentence:
            if token not in vocab:
                # Ids are dense and 1-based: the next id is the current size + 1.
                vocab[token] = len(vocab) + 1

    return vocab

In [14]:
def buildDictInv(wordToNum):
    """Invert a token -> index mapping into an index -> token mapping.

    The result is a ``defaultdict(str)``, so looking up an index that has no
    token yields the empty string.
    """
    inverse = defaultdict(str)

    for token, index in wordToNum.items():
        inverse[index] = token

    return inverse

In [15]:
def tokenize(data, wordToNum):
    """Convert token sequences into arrays of vocabulary indices.

    Parameters
    ----------
    data : iterable of iterable of str
        Sentences, each a sequence of tokens.
    wordToNum : mapping of str to int
        Token -> index vocabulary. It is only read, never modified.

    Returns
    -------
    numpy.ndarray
        One row of float-typed indices per sentence. Tokens missing from
        ``wordToNum`` are encoded as 0.
    """
    tokenizedData = []

    for sent in data:
        # .get() rather than [] so that looking up an unseen word does not
        # insert it into a defaultdict vocabulary (which would silently grow
        # the caller's mapping) and so that plain dicts work as well.
        indices = [wordToNum.get(word, 0) for word in sent]
        tokenizedData.append(np.array(indices, dtype=float))

    return np.array(tokenizedData)

In [16]:
def getModel(VOCAB_SIZE):
    """Build and compile the sequence-labelling network.

    Layers, in order: Embedding -> LSTM (returning the full sequence) ->
    per-timestep Dense -> softmax. The model takes MAX_LEN token indices per
    sample and emits, for each position, a distribution over VOCAB_SIZE indices.

    Parameters
    ----------
    VOCAB_SIZE : int
        Number of distinct token indices. Used both as the Embedding input
        dimension and as the width of the output Dense layer.

    Returns
    -------
    The compiled Sequential model (adam optimizer, sparse categorical
    cross-entropy loss, sparse categorical accuracy metric).
    """
    model = Sequential()

    model.add(Input(shape=(MAX_LEN,)))
    model.add(Embedding(VOCAB_SIZE, output_dim=VECTOR_DIM, input_length=MAX_LEN, trainable=True))
    # return_sequences=True keeps one output vector per timestep, which the
    # TimeDistributed Dense layer then maps to vocabulary scores.
    model.add(LSTM(HIDDEN_DIM, return_sequences = True))
    model.add(TimeDistributed(Dense(VOCAB_SIZE)))
    model.add(Activation('softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])

    # summary() prints the table itself and returns None; wrapping it in
    # print() would add a stray "None" line to the output.
    model.summary()

    return model

In [17]:
def defineModel():
    """Create the untrained model and vocabulary files used by train().

    Reads the 'train' split from data/data.json, builds the token <-> index
    mappings from both the raw and the normalized sentences, builds the model,
    and writes three files: 'model.h5', 'data/wordToNum.pkl' and
    'data/numToWord.pkl'. Returns nothing.
    """
    raw, normalized = get_data('data/data.json', 'train')

    wordToNum = buildDict(raw + normalized)
    numToWord = buildDictInv(wordToNum)
    # buildDict numbers tokens from 1, so indices span 0..len(wordToNum).
    vocab_size = len(wordToNum) + 1

    model = getModel(vocab_size)

    model.save('model.h5')

    # Context managers guarantee the pickles are flushed and closed before
    # train() reads them back.
    with open('data/wordToNum.pkl', 'wb') as handle:
        pkl.dump(wordToNum, handle)
    with open('data/numToWord.pkl', 'wb') as handle:
        pkl.dump(numToWord, handle)

In [18]:
def train():
    """Fit the saved model on the 'train' split and save it again.

    Loads 'model.h5' and 'data/wordToNum.pkl', tokenizes the 'train' and
    'test' splits of data/data.json, fits for EPOCHS epochs with batch size
    BATCH_SIZE using the 'test' split as validation data, and overwrites
    'model.h5' with the fitted model. Returns nothing.
    """
    model = keras.models.load_model('model.h5')

    # NOTE: pickle.load can execute arbitrary code from the file. Only load
    # vocabulary files that this notebook wrote itself.
    with open('data/wordToNum.pkl', 'rb') as handle:
        wordToNum = pkl.load(handle)

    raw, normalized = get_data('data/data.json', 'train')
    rawValid, normalizedValid = get_data('data/data.json', 'test')

    raw = tokenize(raw, wordToNum)
    normalized = tokenize(normalized, wordToNum)
    rawValid = tokenize(rawValid, wordToNum)
    normalizedValid = tokenize(normalizedValid, wordToNum)

    x = raw
    y = normalized
    # Add a trailing axis of size 1 to the targets (one label per timestep).
    y = y.reshape(y.shape[0], y.shape[1], 1)

    xValid = rawValid
    yValid = normalizedValid
    yValid = yValid.reshape(yValid.shape[0], yValid.shape[1], 1)

    model.fit(x, y, validation_data=(xValid, yValid), batch_size = BATCH_SIZE, epochs = EPOCHS)

    model.save('model.h5')

In [19]:
# Build the untrained model and vocabulary; writes model.h5 and the two data/*.pkl files.
defineModel()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           1382600   
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           80400     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 50, 13826)         1396426   
_________________________________________________________________
activation_1 (Activation)    (None, 50, 13826)         0         
Total params: 2,859,426
Trainable params: 2,859,426
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
# Fit the saved model for EPOCHS epochs and overwrite model.h5 with the fitted model.
train()

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
