In [3]:
import re
import os
import numpy as np
import pandas as pd
import tensorflow_datasets
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout,Embedding, LSTM, Bidirectional, Input, Dropout, GlobalAveragePooling1D
from tensorflow.keras import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer 
!pip install transformers
from transformers import *
from google.colab import drive
drive.mount('/content/drive')
nltk.download('punkt')

# tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
df_train = pd.read_csv("/content/drive/My Drive/ipython notebooks/COSC-572/full_prepped_training.csv")
df_test = pd.read_csv("/content/drive/My Drive/ipython notebooks/COSC-572/prepped_test.csv")

In [0]:
#before feeding into the model, preprocess text by using this function 
def prepare_text(text):
    porter_stemmer = PorterStemmer() 
    NON_ALPHA_NUMERIC = re.compile(r'\W+') 
    numbers = re.compile(r'\d+')
    whitespace = re.compile(r'\s+')
    
    preped_text = ''
    word_tokens = word_tokenize(text) 
    for token in word_tokens:
        #normalize all to lower case
        token = token.lower()
        #normalize special punctuations
        token = token.replace('–', '-')
        token = token.replace("'", ' ')
        # remove non-alpha numeric
        token = re.sub(NON_ALPHA_NUMERIC,' ', token)
        # stem words
        # token = porter_stemmer.stem(token)
        preped_text += token + ' '

    #normalize white spaces    
    preped_text = re.sub(whitespace, ' ', preped_text).strip()
    return preped_text

In [0]:
# process the texts using prepare_text()
df_train['edited_sentence'] = df_train['edited_sentence'].transform(func=prepare_text)
df_test['edited_sentence'] = df_test['edited_sentence'].transform(func=prepare_text)

In [7]:
# function for finding the length of the longest sentence in terms of number of words
def find_maxlen(df, column_name):
    return df[column_name].map(lambda x: len(x.split())).max()
print(find_maxlen(df_train, 'edited_sentence'))
print(find_maxlen(df_test, 'edited_sentence'))

23
22


In [0]:
maxlen = 35

In [9]:
bert_tokenizer_transformer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [0]:
def _get_segments(sentences):
    sentences_segments = []
    for sent in sentences:
      temp = []
      i = 0
      for token in sent.split(" "):
        temp.append(i)
        if token == "[SEP]":
          i += 1
      sentences_segments.append(temp)
    return sentences_segments

def _get_inputs(df,_maxlen,tokenizer,use_keras_pad=False):
    maxqnans = np.int((_maxlen-20)/2)
    pattern = '[^\w\s]+|\n' # remove everything including newline (|\n) other than words (\w) or spaces (\s)
    
    sentences = ["[CLS] " + " " .join(tokenizer.tokenize(samp))[:]  + " [SEP] " for samp in df['edited_sentence']]
    sentences_mask = [[1]*len(sent.split(" "))+[0]*(_maxlen - len(sent.split(" "))) for sent in sentences]

    sentences_padded = [sent + " [PAD]"*(_maxlen-len(sent.split(" "))) if len(sent.split(" "))!=_maxlen else sent for sent in sentences ]

    sentences_converted = [tokenizer.convert_tokens_to_ids(s.split(" ")) for s in sentences_padded]
    
    sentences_segment = _get_segments(sentences_padded)

    genLength = set([len(sent.split(" ")) for sent in sentences_padded])

    if _maxlen < 20:
      raise Exception("max length cannot be less than 20")
    elif len(genLength)!=1: 
      print(genLength)
      raise Exception("sentences are not of same size")


    return [tf.cast(sentences_converted,tf.int32), tf.cast(sentences_segment,tf.int32), tf.cast(sentences_mask,tf.int32)]

    
    

In [0]:
bert_inputs = _get_inputs(df=df_train,tokenizer=bert_tokenizer_transformer,_maxlen=maxlen)

In [0]:
Xtr = bert_inputs
ytr = df_train['meanGrade']

# # print(Xtr[0].shape)
# # print(Xtr[1].shape)
# # print(Xtr[2].shape)
# # print(ytr.shape)


Xte = _get_inputs(df_test,_maxlen=maxlen, tokenizer = bert_tokenizer_transformer )
yte = df_test['meanGrade']


# print(Xte[0].shape)
# print(Xte[1].shape)
# print(Xte[2].shape)
# print(yte.shape)

In [27]:
#Pad sentences and one_hot encode words to use for embeddings.
token_inputs = Input(shape=(maxlen,), dtype=tf.int32, name='input_word_ids')
mask_inputs = Input(shape=(maxlen,), dtype=tf.int32, name='input_masks')
seg_inputs = Input(shape=(maxlen,), dtype=tf.int32, name='input_segments')

bert_model = TFBertModel.from_pretrained("bert-base-uncased")

seq_output,_ = bert_model([token_inputs, mask_inputs, seg_inputs])
X = LSTM(2048, activation='tanh', recurrent_activation='sigmoid', use_bias=True, return_sequences=True)(seq_output)
X = Bidirectional(LSTM(2048, activation='tanh', recurrent_activation='sigmoid', use_bias=True, return_sequences=False))(X)
X = Dense(1024, activation='relu')(X)
X = Dropout(0.5)(X)
X = Dense(512, activation='relu')(X),
X = Dropout(0.5)(X)
X = Dense(128, activation='relu')(X),
X = Dropout(0.5)(X)
X = Dense(32, activation='relu')(X),
X = Dropout(0.5)(X)
output_= Dense(1, activation='relu', name='output')(X)

bert_model = Model([token_inputs, mask_inputs, seg_inputs],output_)
bert_model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 35)]         0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 35)]         0                                            
__________________________________________________________________________________________________
input_segments (InputLayer)     [(None, 35)]         0                                            
__________________________________________________________________________________________________
tf_bert_model_6 (TFBertModel)   ((None, 35, 768), (N 109482240   input_word_ids[0][0]             
                                                                 input_masks[0][0]          

In [0]:
opt = tf.keras.optimizers.Adam(learning_rate=3e-4)
bert_model.compile(optimizer=opt, loss='mean_squared_error', metrics=['mse'])

min_loss_change = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=5, verbose=1, restore_best_weights=True)


In [23]:
# print(Xte[0].shape)
# print(Xte[1].shape)
# print(Xte[2].shape)
print(yte)


0       1.2
1       0.4
2       1.0
3       1.6
4       0.4
       ... 
3019    1.8
3020    0.6
3021    0.4
3022    0.0
3023    0.8
Name: meanGrade, Length: 3024, dtype: float64


In [24]:
history = bert_model.fit( Xtr, ytr, 
                    epochs=20,
                    batch_size = 32,
                    callbacks = [min_loss_change], validation_data = (Xte, yte)
          )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
105/560 [====>.........................] - ETA: 7:45 - loss: 0.3848 - mse: 0.3848

KeyboardInterrupt: ignored