In [1]:
!nvidia-smi # check which GPU we have

Mon Jun 21 12:05:43 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Imports

In [2]:
import pandas as pd
import numpy as np
import re
import string
from collections import Counter, defaultdict
from itertools import islice

from keras.preprocessing.text import one_hot
from keras.layers import Embedding, Dense, Dropout, Flatten
from keras import Sequential 
from keras.preprocessing.sequence import pad_sequences

import tensorflow.keras.backend as K
import tensorflow as tf

from tensorflow.keras.callbacks import EarlyStopping

from keras.models import model_from_json
from pathlib import Path

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Data Processing

In [3]:
DIRECTORY = "" 

class Sentences(object):

    def __init__(self,filename, vocab = None) -> None:
        self.filename = filename
        if vocab is None:
          self.vocab = self.unk_handling(1)
        else: 
          self.vocab = vocab
        self.hash_to_word = defaultdict(lambda:"<UNK>")

    def unk_handling(self,threshold):
        counter = Counter()
        with open(DIRECTORY+self.filename,"rb") as file:
            for sentence in file:
                counter.update(Counter(str(sentence).lower().translate(str.maketrans('','',string.punctuation)).split()))

        return {k for k,c in counter.items() if c > threshold}

    def __iter__(self):
        vocab_length = len(self.vocab)+2
        with open(DIRECTORY + self.filename,"rb") as file:
            for sentence in file:
              encoded_arr = [one_hot("<s>",vocab_length)[0]]
              for word in [word if word in self.vocab else "<UNK>" for word in str(sentence).lower().translate(str.maketrans('','',string.punctuation)).split()]:
                  hashed_word = one_hot(word,vocab_length)
                  self.hash_to_word[hashed_word[0]] = word
                  encoded_arr.append(hashed_word[0])
              yield np.array(encoded_arr)

def subseqs(seq,window_length):
  return np.fromfunction(lambda i, j: seq[i + j], (len(seq) - window_length + 1, window_length),dtype=int)

train_sentences = Sentences("nchlt_text.nr.train")
val_sentences = Sentences("nchlt_text.nr.valid", train_sentences.vocab)
test_sentences = Sentences("nchlt_text.nr.test", train_sentences.vocab)

In [4]:
train = []
val = []
test = []

window_length = 3

for vec in train_sentences:
  train.extend(subseqs(vec,window_length))
for vec in val_sentences:
  val.extend(subseqs(vec,window_length))
for vec in test_sentences:
  test.extend(subseqs(vec,window_length))

train = pd.DataFrame(train)
val = pd.DataFrame(val)
test = pd.DataFrame(test)

In [5]:
X_train = np.array(train.iloc[:,0:window_length-1])
y_train = np.array(train.iloc[:,window_length-1])

X_val = np.array(val.iloc[:,0:window_length-1])
y_val = np.array(val.iloc[:,window_length-1])

X_test = np.array(test.iloc[:,0:window_length-1])
y_test = np.array(test.iloc[:,window_length-1])

In [6]:
X_train

array([[36325, 35579],
       [35579, 55139],
       [55139, 35730],
       ...,
       [32456, 47347],
       [47347, 56996],
       [56996, 15477]])

### Baseline Neural Network Model

In [7]:
vocab_size = len(train_sentences.vocab)+2
vocab_size

58108

In [8]:
# Perplexity metric
def perplexity(y_true, y_pred):
   scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
   perplexity = K.exp(scce(y_true, y_pred))
   return perplexity


custom_early_stopping = EarlyStopping(
    monitor='val_loss', 
    patience=2, 
    min_delta=0.0001 # amount of change to quantify an improvement
)

In [9]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 1000, input_length=window_length-1))
model.add(Flatten())
model.add(Dense(1024, activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(2048, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(vocab_size, activation='softmax'))

# compile the model
model.compile(optimizer='adam', # defualt params of [learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False]
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy', perplexity])
# summarize the model
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2, 1000)           58108000  
_________________________________________________________________
flatten (Flatten)            (None, 2000)              0         
_________________________________________________________________
dense (Dense)                (None, 1024)              2049024   
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 2048)              2099200   
_________________________________________________________________
dropout_1 (Dropout)          (None, 2048)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 58108)             1

In [10]:
epochs = 50
batch_size = 4096

In [11]:
# Train
model_history = model.fit(x = X_train, 
                                y = y_train, 
                                epochs=epochs,
                                batch_size=batch_size,
                                validation_data = (X_val,y_val),
                                callbacks=[custom_early_stopping],
                                verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50


In [12]:
model_structure=model.to_json()
f = Path("/content/drive/My Drive/NLP A2/trigram_structure.json")
f.write_text(model_structure)
model.save_weights('/content/drive/My Drive/NLP A2/trigram.h5')

In [None]:
model.load_weights('/content/drive/My Drive/NLP A2/trigram.h5')

In [17]:
# evaluate the model
loss, accuracy, perplexity = model.evaluate(X_test, y_test, verbose=0)
print('Perplexity: %f' % (np.exp(loss)))
print('Accuracy: %f' % (accuracy*100))

Perplexity: 8.757308
Accuracy: 60.357952
