In [1]:
import string
import re
import numpy as np
import pickle
import time

In [2]:
def clean_document(doco):
    punctuation = string.punctuation + '\n\n'
    punc_replace = ''.join([' ' for s in punctuation])
    doco_clean = doco.replace('-', ' ')
    doco_alphas = re.sub(r'\W +', '', doco_clean)
    trans_table = str.maketrans(punctuation, punc_replace)
    doco_clean = ' '.join([word.translate(trans_table) for word in doco_alphas.split(' ')])
    doco_clean = doco_clean.split(' ')
    doco_clean = [word.lower() for word in doco_clean if len(word) > 0]
    
    return doco_clean

In [3]:
def dataset_creator(file, inp_len):
    X = []
    label = []
    for line in file:
        cline = clean_document(line)
        length = len(cline)
        if length <= inp_len:
            continue
        for i in range(0, length - inp_len):
            X.append(cline[i:i+inp_len])
            label.append(cline[i+inp_len])
        X.append(cline[i+1:])
        label.append('<EOS>')
    return X, label

In [4]:
file_train = open('train.txt')
file_test = open('test.txt')

In [5]:
timesteps = 3

In [6]:

X_train, y_train = dataset_creator(file_train, timesteps)
X_test, y_test = dataset_creator(file_test, timesteps)

In [7]:
vocab = {}
i = 0
for row in X_train:
    for el in row:
        if el not in vocab:
            vocab[el] = i
            i += 1
            
for row in X_test:
    for el in row:
        if el not in vocab:
            vocab[el] = i
            i += 1
vocab['<EOS>'] = i

In [25]:
reverse_vocab = {}
for key in vocab.keys():
    reverse_vocab[vocab[key]] = key

In [30]:
reverse_vocab[0]

'the'

In [8]:
def one_hot_creator(X,y, vocab = vocab, input_len = timesteps):
    vocab_len = len(vocab)
    one_hotX = []
    one_hotY = []
    for row in X:
        temp = np.zeros(shape = (input_len, vocab_len))
        for (i,el) in enumerate(row):
            temp[i][vocab[el]] = 1
        one_hotX.append(temp)
            
    for row in y:
        temp = np.zeros(shape = (vocab_len,))
        temp[vocab[row]] = 1
        one_hotY.append(temp)
        
    return np.array(one_hotX), np.array(one_hotY)    

In [92]:
X_train_oh, y_train_oh = one_hot_creator(X_train[:128], y_train[:128])   #one_hot vectors
# # X_test_oh, y_test_oh = one_hot_creator(X_test, y_test)

In [93]:
z = 0
for i in y_train_oh[0]:
    if i == 1:
        z += 1
        
z

1

In [186]:
X_train[0]

['the', 'third', 'was']

In [45]:
t.shape

(3, 6683)

In [76]:
d = np.ones((3,1000))

In [77]:
e = np.ones((1000,5))
f = np.ones(5)

In [84]:
g = np.dot(d,e) + f
g

array([[1001., 1001., 1001., 1001., 1001.],
       [1001., 1001., 1001., 1001., 1001.],
       [1001., 1001., 1001., 1001., 1001.]])

In [123]:
a = np.ones((3,2))
b = np.ones((3,2))
c = np.random.uniform((3,2))

z = np.multiply(a,b)
np.multiply(z,c)

array([[2.31590559, 1.21920605],
       [2.31590559, 1.21920605],
       [2.31590559, 1.21920605]])

In [9]:
class GRU():
    def __init__(self,hidden_units, embed_len, batch_size, timesteps):
        
        self.hidden_units = hidden_units
        self.Wc = np.random.normal(size = (embed_len + self.hidden_units, self.hidden_units)) 
        self.Wu = np.random.normal(size = (embed_len + self.hidden_units, self.hidden_units)) 
        self.bc = np.random.normal(size = (1, self.hidden_units)) 
        self.bu = np.random.normal(size = (1, self.hidden_units))         
        self.batch_size = batch_size
        self.clist = []
        self.glist = []
        self.tlist = []
        self.c_initial = np.random.normal(size = (self.batch_size, self.hidden_units))
        self.timesteps = timesteps
        
    def forward(self,X):
        c = self.c_initial
        for i in range(self.timesteps):
            conc_inp = np.concatenate((X[:,i,:], c), axis = 1)
            
            tilda_inp = np.dot(conc_inp, self.Wc) + self.bc
            c_tilda = self.tanh(tilda_inp)
            self.tlist.append(tilda_inp)
            
            gamma_inp = np.dot(conc_inp, self.Wu) + self.bu
            gammau = self.sigmoid(gamma_inp)
            self.glist.append(gamma_inp)
            
            c = np.multiply(gammau, c_tilda) + np.multiply(1-gammau, c)
            self.clist.append(c)
        
        return c  
    
    def backward(self, prev_dev, X, lr, gmin, gmax):
        
        for i in range(self.timesteps):
            ind = self.timesteps - 1 - i
            if ind > 0:
                conc_inp = np.concatenate((X[:,ind,:], self.clist[ind-1]), axis = 1)
            else:
                conc_inp = np.concatenate((X[:,ind,:], self.c_initial), axis = 1)
            
            gammau = self.sigmoid(self.glist[ind])
            
            tanderv = 1 - (self.tanh(self.tlist[ind]))**2
            sigmaderv = gammau*(1-gammau)
            
            c_tilda = self.tanh(self.tlist[ind])
            inp_transpose = np.transpose(conc_inp)
            
            temp = np.multiply(gammau, tanderv)
            temp = np.multiply(temp, prev_dev)
            
            gradWc = np.clip(np.dot(inp_transpose, temp)/self.batch_size, gmin, gmax)
            gradbc = np.clip(np.sum(temp, axis = 0)/self.batch_size, gmin, gmax)
            
            if ind > 0:
                temp = np.multiply(c_tilda - self.clist[ind - 1], sigmaderv)
            else:
                temp = np.multiply(c_tilda - self.c_initial, sigmaderv)
            
            temp = np.multiply(temp, prev_dev) 
            
            gradWu = np.clip(np.dot(inp_transpose, temp)/self.batch_size, gmin, gmax)
            gradbu = np.clip(np.sum(temp, axis = 0)/self.batch_size, gmin, gmax)
            
            self.Wc -= lr*gradWc
            self.bc -= lr*gradbc
            self.Wu -= lr*gradWu
            self.bu -= lr*gradbu
        
        self.clist = []
        self.tlist = []
        self.glist = []
        
    def sigmoid(self, X):
        return ( 1/ (1 + np.exp(-X)))
    
    def tanh(self, X):
        p = np.exp(X)
        m = np.exp(-X)
        return((p-m)/(p+m))  
    
    def load_param(self, param):
        self.Wc = param['Wc']
        self.bc = param['bc']
        self.Wu = param['Wu']
        self.bu = param['bu']
        
    def save_param(self, param):
        param['Wc'] = self.Wc
        param['bc'] = self.bc
        param['Wu'] = self.Wu
        param['bu'] = self.bu
        return param

In [10]:
class Network():
    def __init__(self, hidden_units, embed_len, output_size, batch_size, timesteps):
        self.gru = GRU(hidden_units, embed_len, batch_size, timesteps)
        self.W = np.random.normal(size = (hidden_units, output_size))
        self.b = np.random.normal(size = (1, output_size))
        self.batch_size = batch_size
        self.c = 0
        self.o = 0
        
    def forward(self, X):
        self.c = self.gru.forward(X)
        self.o = self.softmax(np.dot(self.c, self.W) + self.b)
        return self.o
    
    def backward(self, X, y, lr, gmin, gmax):
        grad = self.o - y

        gradW = np.clip(np.dot(np.transpose(self.c), grad)/self.batch_size, gmin, gmax)
        gradb = np.clip(np.sum(grad, axis = 0)/self.batch_size, gmin, gmax)

        self.c = 0
        self.o = 0
                        
        grad_to_backprop = np.dot(grad, np.transpose(self.W))
        
        self.W -= lr*gradW
        self.b -= lr*gradb
        
        self.gru.backward(grad_to_backprop, X, lr, gmin, gmax)
        
    def softmax(self, X):
        exps = np.exp(X - np.reshape(np.max(X, axis = 1), (X.shape[0], 1)))
        return exps / np.reshape(np.sum(exps, axis = 1), (X.shape[0], 1))
    
    def load_param(self, param):
        self.W = param['W']
        self.b = param['b']
        self.gru.load_param(param)
        
    def save_param(self):
        param = {}
        param['W'] = self.W
        param['b'] = self.b
        return(self.gru.save_param(param))
        

In [11]:
def loss_calc(pred, actual):
    mult = np.multiply(np.log(pred), actual)
    return -np.sum(mult)/pred.shape[0]

In [12]:
def train(X, y, net, epochs, lr, loss_list, batch_size, gmin = -10, gmax = 10, time_to_save = 1, cont_from = 0):
    train_len = len(X)
    for i in range(1, epochs+1):
        tic = time.time()
        counter = 0
        
        for j in range(0, train_len, batch_size):
            
            if(j+batch_size > train_len):
                X_batch, y_batch = one_hot_creator(X[train_len-batch_size:], y[train_len-batch_size:])
            else:
                X_batch, y_batch = one_hot_creator(X[j:j+batch_size], y[j:j+batch_size])
                
            pred = net.forward(X_batch)
            loss = loss_calc(pred, y_batch)
            
            loss_list.append(loss)
            net.backward(X_batch, y_batch, lr, gmin, gmax)
            counter += 1
            
        if i % time_to_save == 0:
            param_dict = net.save_param()
            with open('param_epoch_' + str(i) + '.pkl', 'wb') as f:
                pickle.dump(param_dict, f)
        
        ep_time = time.time() - tic
        print("Epoch: %d --> Average Loss: %.3f completed in %.3f seconds" 
              %(cont_from + i, sum(loss_list[len(loss_list)-counter:]) / counter, ep_time))

In [48]:
net2 = Network(1024, len(vocab), len(vocab), 256, 3)

In [46]:
with open('param_epoch_44.pkl', 'rb') as f:
    tada = pickle.load(f)

In [49]:
net2.load_param(tada)

In [51]:
x,y = one_hot_creator(X_train[:256], y_train[:256])
p = net2.forward(x)

In [52]:
p.shape

(256, 6683)

In [57]:
yint = np.argmax(y, axis = 1)
pint = np.argmax(p, axis = 1)
for i in range(pint.shape[0]):
    print("predicted " + str(reverse_vocab[pint[i]]) + " actual " + str(reverse_vocab[yint[i]]) )

predicted artists actual being
predicted background actual run
predicted prisoner actual by
predicted quickly actual the
predicted kuwaiti actual head
predicted lighter actual of
predicted anyways actual an
predicted covering actual investment
predicted furnishings actual firm
predicted kenneally actual <EOS>
predicted nov15 actual he
predicted pay actual was
predicted kosher actual manipulating
predicted belt actual the
predicted rate actual market
predicted an actual with
predicted successors actual his
predicted flowers actual bombing
predicted jeez actual targets
predicted insist actual <EOS>
predicted carless actual engineer
predicted someday actual asi
predicted vsmiami actual ali
predicted gold actual from
predicted initiated actual tikrit
predicted control actual <EOS>
predicted mumbai actual also
predicted gently actual killed
predicted ham actual in
predicted drill actual the
predicted feeding actual attack
predicted hairdresser actual <EOS>
predicted blogosphere actual it
pr

In [54]:
yint[0]

3

In [56]:
summer = 0
for i in range(256):
    summer += np.log(p[i][yint[i]])
    
-summer/256

114.23882330678707

In [108]:
loss_list = []
train(X_train, y_train, net, 50, 0.01, loss_list, 128)

Epoch: 1 --> Average Loss: 56.538 completed in 110.000 seconds
Epoch: 2 --> Average Loss: 54.516 completed in 110.083 seconds
Epoch: 3 --> Average Loss: 53.196 completed in 110.757 seconds
Epoch: 4 --> Average Loss: 52.092 completed in 111.730 seconds
Epoch: 5 --> Average Loss: 51.070 completed in 108.668 seconds
Epoch: 6 --> Average Loss: 50.199 completed in 110.120 seconds
Epoch: 7 --> Average Loss: 49.365 completed in 111.043 seconds
Epoch: 8 --> Average Loss: 48.562 completed in 110.859 seconds
Epoch: 9 --> Average Loss: 47.839 completed in 111.537 seconds
Epoch: 10 --> Average Loss: 47.220 completed in 108.511 seconds
Epoch: 11 --> Average Loss: 46.601 completed in 110.023 seconds
Epoch: 12 --> Average Loss: 45.974 completed in 111.316 seconds
Epoch: 13 --> Average Loss: 45.424 completed in 109.929 seconds
Epoch: 14 --> Average Loss: 44.875 completed in 115.560 seconds
Epoch: 15 --> Average Loss: 44.303 completed in 111.134 seconds
Epoch: 16 --> Average Loss: 43.813 completed in 1

In [111]:
train(X_train, y_train, net, 50, 0.001, loss_list, 128, cont_from=50)

Epoch: 51 --> Average Loss: 35.246 completed in 104.009 seconds
Epoch: 52 --> Average Loss: 35.230 completed in 105.677 seconds


KeyboardInterrupt: 

In [None]:
loss_list2 = []
train(X_train, y_train, net2, 50, 0.01, loss_list2, 256)

Epoch: 1 --> Average Loss: 114.529 completed in 280.236 seconds
Epoch: 2 --> Average Loss: 111.436 completed in 278.928 seconds
Epoch: 3 --> Average Loss: 108.830 completed in 279.059 seconds
Epoch: 4 --> Average Loss: 106.986 completed in 278.852 seconds
Epoch: 5 --> Average Loss: 105.232 completed in 280.288 seconds
Epoch: 6 --> Average Loss: 103.848 completed in 278.417 seconds
Epoch: 7 --> Average Loss: 102.589 completed in 279.409 seconds
Epoch: 8 --> Average Loss: 101.249 completed in 279.429 seconds
Epoch: 9 --> Average Loss: 100.077 completed in 277.888 seconds
Epoch: 10 --> Average Loss: 99.031 completed in 277.133 seconds
Epoch: 11 --> Average Loss: 97.971 completed in 278.471 seconds
Epoch: 12 --> Average Loss: 96.830 completed in 256.688 seconds
Epoch: 13 --> Average Loss: 95.764 completed in 249.403 seconds
Epoch: 14 --> Average Loss: 94.895 completed in 250.579 seconds
Epoch: 15 --> Average Loss: 94.046 completed in 252.558 seconds
Epoch: 16 --> Average Loss: 93.177 compl

In [23]:
toc = time.time()

In [24]:
toc - tic

8.918617963790894

In [184]:
i = 0
j = 0.25
print("Epoch: %d --> Average Loss: %.3f" %(i, j))

Epoch: 0 --> Average Loss: 0.250


In [179]:
l = []
for i in range(1,10):
    l.append(i)

In [215]:
import pickle

In [223]:
j = 5
with open('param_' + str(j) + 'pkl', 'wb') as f:
    pickle.dump(vocab, f)

In [217]:
with open('param' + j + '.pkl', 'rb') as f:
    tada = pickle.load(f)

In [220]:
len(tada)

6683

In [17]:
def buildmodel(VOC_LEN, inp_len = input_len):
    model = Sequential()
    model.add(LSTM(256, input_shape = (inp_len, VOC_LEN), return_sequences = True))
    model.add(Dropout(0.2))
    model.add(Dense(VOC_LEN, activation = 'softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
    return model

In [18]:
model = buildmodel(len(vocab))

In [19]:
model.fit(X_train_oh, y_train_oh, epochs = 50, batch_size = 128)

ValueError: Error when checking target: expected dense_3 to have 2 dimensions, but got array with shape (1000, 6683, 1)