In [1]:
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)

import re
import os
import time
import json
import numpy as np
import pandas as pd
import pickle as pkl
from tensorflow import keras
import matplotlib.pyplot as plt
from collections import defaultdict 
import tensorflow.keras.backend as K

from tensorflow import one_hot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import TimeDistributed

from tensorflow.keras.optimizers import Adam

# Hyper-parameters and data-handling constants shared by the cells below.
TEST = 50            # not referenced by any of the visible cells
EPOCHS = 15          # epochs run by every train() call
LIMIT = 700          # number of records in one data slice (see get_data)
MAX_LEN = 50         # sentence length, in tokens, that get_data pads up to
BATCH_SIZE = 16      # mini-batch size handed to model.fit
HIDDEN_DIM = 100     # number of LSTM units
VECTOR_DIM = 100     # output dimension of the embedding layer
VALID_SPLIT = 0.15   # validation_split handed to model.fit

def get_data(path, setNum):
    """Load the (raw, normalized) token sequences stored in a JSON file.

    The file must contain a list of records, each holding an ``'input'`` and
    an ``'output'`` list of string tokens.  In every token, ``@mentions`` are
    replaced by ``<username>`` and http/https links by ``<link>``.  Each
    sequence is then truncated or right-padded with empty strings so that it
    is exactly ``MAX_LEN`` tokens long, which keeps the result rectangular.

    Args:
        path: location of the JSON file.
        setNum: 1-based index of the ``LIMIT``-sized slice of records to
            return; a falsy value (e.g. ``0``) returns every record.

    Returns:
        A tuple ``(X, Y)`` of lists of token lists, ``X`` built from the
        ``'input'`` fields and ``Y`` from the ``'output'`` fields.
    """

    def _prepare(tokens):
        # Mask mentions first, then links; slice so that over-long sentences
        # cannot produce rows of a different length.
        cleaned = [
            re.sub('https?://[^ ]+', '<link>', re.sub('@[^ ]+', '<username>', token))
            for token in tokens[:MAX_LEN]
        ]
        cleaned.extend([""] * (MAX_LEN - len(cleaned)))
        return cleaned

    # Context manager so the file handle is closed deterministically.
    with open(path, encoding='utf-8') as handle:
        data = json.load(handle)

    if setNum:
        data = data[LIMIT*(setNum-1):LIMIT*setNum]

    X = []
    Y = []

    for record in data:
        X.append(_prepare(record['input']))
        Y.append(_prepare(record['output']))

    return X, Y

def buildDict(data):
    """Number every distinct token of ``data`` in order of first appearance.

    Args:
        data: iterable of sentences, each an iterable of hashable tokens.

    Returns:
        A ``defaultdict(int)`` mapping token -> id.  Ids start at 1, so a
        lookup of a token that was never seen yields the default 0.
    """
    vocab = defaultdict(int)

    for token in (tok for sentence in data for tok in sentence):
        if token not in vocab:
            # Every stored id is >= 1, so the next free id is the size + 1.
            vocab[token] = len(vocab) + 1

    return vocab

def buildDictInv(wordToNum):
    """Invert a token -> id mapping.

    Args:
        wordToNum: mapping from token to integer id.

    Returns:
        A ``defaultdict(str)`` mapping id -> token; looking up an id that is
        not present yields the default empty string.
    """
    inverse = defaultdict(str)

    for word, index in wordToNum.items():
        inverse[index] = word

    return inverse

def tokenize(data, wordToNum):
    """Convert sentences of tokens into arrays of vocabulary ids.

    Unknown tokens are mapped to id 0.  The lookup uses ``.get`` so that the
    vocabulary is never modified: indexing a ``defaultdict`` with an unseen
    token would insert it and silently change ``len(wordToNum)``.

    Args:
        data: iterable of sentences, each an iterable of tokens.
        wordToNum: mapping from token to integer id.

    Returns:
        A NumPy array with one float row of ids per sentence.
    """
    tokenizedData = [
        np.array([wordToNum.get(word, 0) for word in sent], dtype=float)
        for sent in data
    ]

    return np.array(tokenizedData)

def getModel(VOCAB_SIZE):
    """Build and compile the token-level normalization network.

    The network maps a sequence of ``MAX_LEN`` token ids to ``MAX_LEN``
    softmax distributions over the vocabulary:
    Embedding -> LSTM (one output per time step) -> per-step Dense -> softmax.

    Args:
        VOCAB_SIZE: number of distinct ids; used as the embedding input
            dimension and as the width of the per-step output layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """

    model = Sequential()

    model.add(Input(shape=(MAX_LEN,)))
    model.add(Embedding(VOCAB_SIZE, output_dim=VECTOR_DIM, input_length=MAX_LEN, trainable=True))
    # return_sequences=True keeps one hidden state per input position.
    model.add(LSTM(HIDDEN_DIM, return_sequences = True))
    model.add(TimeDistributed(Dense(VOCAB_SIZE)))
    model.add(Activation('softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that appended a stray "None" to the output).
    model.summary()

    return model

def defineModel():
    """Create the vocabulary and an untrained model, saving the model to disk.

    Reads every record of ``data.json``, builds one vocabulary shared by the
    raw and the normalized sentences, builds a model sized for it and writes
    that model to ``model.h5``.

    Returns:
        A tuple ``(wordToNum, numToWord)``: the token -> id mapping and its
        inverse.
    """
    raw_sents, norm_sents = get_data('data.json', 0)

    # A single vocabulary covers both sides of the data.
    wordToNum = buildDict(raw_sents + norm_sents)
    numToWord = buildDictInv(wordToNum)

    # Ids start at 1, hence one extra slot for index 0.
    getModel(len(wordToNum) + 1).save('model.h5')

    return wordToNum, numToWord

# Build the vocabulary and write the untrained model to model.h5; the two
# lookup tables stay in the notebook namespace for the cells below.
wordToNum, numToWord = defineModel()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 100)           1634100   
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           80400     
_________________________________________________________________
time_distributed (TimeDistri (None, 50, 16341)         1650441   
_________________________________________________________________
activation (Activation)      (None, 50, 16341)         0         
Total params: 3,364,941
Trainable params: 3,364,941
Non-trainable params: 0
_________________________________________________________________
None


In [2]:
def train(setNum):
    """Continue training the saved model on one slice of the data set.

    Loads ``model.h5``, fits it on slice ``setNum`` of ``data.json`` for
    ``EPOCHS`` epochs and writes the updated model back to ``model.h5``.

    Args:
        setNum: 1-based index of the ``LIMIT``-sized slice to train on; a
            falsy value trains on every record (see ``get_data``).

    Raises:
        ValueError: if the requested slice contains no records.
    """
    model = keras.models.load_model('model.h5')

    raw, normalized = get_data('data.json', setNum)
    if not raw:
        raise ValueError("data slice %r of 'data.json' is empty" % (setNum,))

    x = tokenize(raw, wordToNum)
    # tokenize() yields float arrays, but one_hot needs integer indices.
    y = tokenize(normalized, wordToNum).astype('int32')

    # Use the model's own output width as the number of classes so the
    # targets always match the network, even if the global vocabulary
    # mapping has been modified since the model was built.
    num_classes = model.output_shape[-1]
    y = one_hot(y, num_classes, on_value=1, off_value=0)

    model.fit(x, y, batch_size = BATCH_SIZE, epochs = EPOCHS, validation_split = VALID_SPLIT)

    model.save('model.h5')

In [3]:
# Train on data slice 1, i.e. records [0, LIMIT); the model is loaded from
# and saved back to model.h5 inside train().
train(1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [4]:
# Train on data slice 2, i.e. records [LIMIT, 2*LIMIT).
train(2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [5]:
# Train on data slice 3, i.e. records [2*LIMIT, 3*LIMIT).
train(3)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [6]:
# Train on data slice 4, i.e. records [3*LIMIT, 4*LIMIT).
train(4)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [7]:
# Another round on data slice 1; training resumes from the saved model.h5.
train(1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [8]:
# Another round on data slice 2; training resumes from the saved model.h5.
train(2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [9]:
# Another round on data slice 3; training resumes from the saved model.h5.
train(3)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [10]:
# Another round on data slice 4; training resumes from the saved model.h5.
train(4)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [11]:
# Load the trained model from disk into the module-level name `model`,
# which predict() reads.
model = keras.models.load_model('model.h5')

def predict(raw):
    """Normalize a raw sentence with the module-level ``model``.

    The sentence is split on single spaces, cut to at most ``MAX_LEN`` tokens
    (the network input has a fixed length) and right-padded with empty
    tokens.  For every real input position the most probable vocabulary
    entry is taken and the entries are joined with spaces.

    Args:
        raw: the sentence to normalize, tokens separated by single spaces.

    Returns:
        The normalized sentence as one string.  Padding positions are not
        decoded, so the result carries no trailing padding blanks.
    """
    # Over-long input would not fit the fixed-length model input.
    tokens = raw.split(" ")[:MAX_LEN]
    n_tokens = len(tokens)
    padded = tokens + [""] * (MAX_LEN - n_tokens)

    pred = model.predict(tokenize([padded], wordToNum))

    # .get() avoids inserting unseen ids into the defaultdict numToWord.
    normalized = [
        numToWord.get(int(np.argmax(pred[0][i])), "")
        for i in range(n_tokens)
    ]

    return " ".join(normalized)

In [12]:
# Demo: normalize a sample sentence and print it next to the original.
raw = "U r not playin wif Me"

norm = predict(raw)
print("Raw : ", raw)
print("Norm: ", norm)

Raw :  U r not playin wif Me
Norm:  you are not playing with me                                            


In [13]:
# Demo: sample sentence containing abbreviations and punctuation.
raw = "omg u r so funny , LOL"

norm = predict(raw)
print("Raw : ", raw)
print("Norm: ", norm)

Raw :  omg u r so funny , LOL
Norm:  oh my god you are so funny , laughing out loud                                           


In [14]:
# Demo: sample sentence containing several abbreviations.
raw = "tbh idk anything abt it"

norm = predict(raw)
print("Raw : ", raw)
print("Norm: ", norm)

Raw :  tbh idk anything abt it
Norm:  to be honest i don't know anything about it                                             


In [15]:
# Demo: sample sentence with a missing apostrophe and informal spellings.
raw = "yea ... didnt Look lik it"

norm = predict(raw)
print("Raw : ", raw)
print("Norm: ", norm)

Raw :  yea ... didnt Look lik it
Norm:  yeah ... didn't look like it                                            
