In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy

import tensorflow as tf

import keras
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding, Flatten
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential, load_model
from keras import initializers, regularizers, optimizers, layers
from keras.utils import plot_model
from keras.callbacks import ModelCheckpoint


# !pip install -q -U keras-tuner
# import kerastuner as kt

import IPython


from sklearn.model_selection import train_test_split

import re

from nltk import pos_tag
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer 

from gensim.models import word2vec

from IPython.display import display 

import seaborn as sns
import matplotlib.pyplot as plt
from src import load_text, get_word_index


# Notebook-wide display settings: seaborn "notebook" context with the
# "whitegrid" style, no truncation of long text columns, and at most 50 rows
# shown when a DataFrame is displayed.
sns.set(context = 'notebook', style = 'whitegrid')
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows',50)

https://towardsdatascience.com/hands-on-nlp-deep-learning-model-preparation-in-tensorflow-2-x-2e8c9f3c7633

GloVe embeddings thanks to Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. *GloVe: Global Vectors for Word Representation*.




Thanks to https://keras.io/examples/nlp/pretrained_word_embeddings/ and to Kefei Mo (https://towardsdatascience.com/hands-on-nlp-deep-learning-model-preparation-in-tensorflow-2-x-2e8c9f3c7633) for the approach used in the cells below.

In [None]:
# Load the corpus with the project helper and build the vocabulary mapping.
# NOTE(review): later cells read `df.Text` and `df.Grade` and iterate
# `word_index.items()` as (word, integer index) pairs; the exact contract of
# `load_text` / `get_word_index` lives in `src` -- confirm there.
df = load_text(sentences=True, grammarize=False)
word_index = get_word_index(df)

In [None]:
!python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')


num_words = len(word_index.keys())
print(f'total vocabulary length: {num_words}')


num_tokens = num_words + 1
embedding_dim = len(nlp('the').vector)
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():

    try:
        embedding_matrix[i+1] = nlp(word).vector
        hits += 1
    except:
        misses +=1
print(f'words converted: {hits}, words not found: {misses}')
tokens = df.Text.apply(lambda text: [word_index[word] for word in text.split()])
X = pad_sequences(tokens, padding='post')
y = df.Grade


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)
X_t, X_val, y_t, y_val = train_test_split(X_train, y_train, random_state = 42, test_size = 0.2)

In [None]:
# Show the dimensions of the padded training matrix.
X_train.shape

In [None]:
# Embedding layer initialised from the precomputed matrix and kept frozen
# (trainable=False) so the pretrained vectors are not updated during training.
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)
version = 0

In [None]:
def make_model():
    """Build and compile the baseline LSTM regression model.

    Architecture: the shared `embedding_layer`, an LSTM with 50 units that
    returns the full sequence, global max pooling over time, then two
    64-unit ReLU dense layers (L1/L2-regularised, with 0.5 dropout before
    each) and a single linear output unit.

    Returns:
        A compiled `Sequential` model using Adam (learning_rate=0.01,
        decay=1e-2) and mean absolute error as the loss.
    """

    def _regularised_dense(units):
        # Each call creates its own regulariser instance, as in a plain
        # layer-by-layer definition.
        return Dense(units, activation='relu',
                     kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4))

    network = Sequential([
        embedding_layer,
        LSTM(50, return_sequences=True),
        GlobalMaxPool1D(),
        Dropout(0.5),
        _regularised_dense(64),
        Dropout(0.5),
        _regularised_dense(64),
        Dense(1),
    ])

    network.compile(
        optimizer=optimizers.Adam(learning_rate=.01, decay=1e-2),
        loss='mean_absolute_error',
        metrics=None,
    )
    return network
# Build the baseline model and a checkpoint callback that writes the model to
# `filepath` only when the validation loss improves.
model = make_model()
filepath = 'model1-best.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                             save_best_only=True,
                             mode='min')
callbacks = [checkpoint]

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" after the table.
model.summary()

In [None]:
# Train the baseline model; the last 20% of the training data is held out for
# validation and the checkpoint callback keeps the best epoch on disk.
history = model.fit(X_train, y_train,
                    batch_size=100,
                    epochs=50,
                    validation_split=.2,
                    callbacks=callbacks)

# Learning curves, labelled so the figure can be read on its own.
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Model 1: training vs. validation loss')
plt.legend()
plt.show()

# Reload the best checkpoint (lowest validation loss) and score it on the
# held-out test set.
model = keras.models.load_model('model1-best.hdf5')
yhat = model.predict(X_test).ravel()
print('MAE = ', np.sum(np.abs(y_test-yhat))/len(y_test))
print('mean grade prediction = ',np.mean(model.predict(X_train)))
print('mean grade = ', np.mean(y_train))

# Select rows and columns in a single .loc call and attach predictions with
# assign(), which avoids chained indexing followed by column assignment
# (a SettingWithCopyWarning risk).
errors = df.loc[y_test.index, ['Text', 'Grade']].assign(**{'Predicted Grade': yhat})
errors.sample(5)

In [None]:
# Display the loaded DataFrame (output is truncated by the display.max_rows option).
df

In [None]:
def make_model2():
    """Build and compile a deeper variant of the LSTM regression model.

    Architecture: the shared `embedding_layer`, an LSTM with 50 units that
    returns the full sequence, global max pooling, 0.5 dropout, then four
    L1/L2-regularised ReLU dense layers (64, 64, 32, 16 units), each
    followed by dropout (0.5, 0.5, 0.3, 0.3), and a single linear output.

    Returns:
        A compiled `Sequential` model using Adam (learning_rate=0.01,
        decay=1e-3) and mean absolute error as the loss.
    """
    # (dense units, dropout rate applied after that dense layer)
    dense_stack = [(64, 0.5), (64, 0.5), (32, 0.3), (16, 0.3)]

    network = Sequential()
    network.add(embedding_layer)
    network.add(LSTM(50, return_sequences=True))
    network.add(GlobalMaxPool1D())
    network.add(Dropout(0.5))

    for units, drop_rate in dense_stack:
        penalty = regularizers.l1_l2(l1=1e-5, l2=1e-4)
        network.add(Dense(units, activation='relu', kernel_regularizer=penalty))
        network.add(Dropout(drop_rate))

    network.add(Dense(1))

    network.compile(
        optimizer=optimizers.Adam(learning_rate=.01, decay=1e-3),
        loss='mean_absolute_error',
        metrics=None,
    )
    return network
# Build model 2 and show its architecture. summary() prints the table itself
# and returns None, so it is not wrapped in print().
model = make_model2()
model.summary()
filepath = 'model2-best.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                             save_best_only=True,
                             mode='min')
callbacks = [checkpoint]

# Train with the last 20% of the training data held out for validation; the
# checkpoint callback keeps the best epoch on disk.
history = model.fit(X_train, y_train,
                    batch_size=100,
                    epochs=50,
                    validation_split=.2,
                    callbacks=callbacks)

plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Model 2: training vs. validation loss')
plt.legend()
plt.show()

# Load the checkpoint from the same path it was written to. A hard-coded
# '/content/...' path only resolves when the working directory is /content.
model = keras.models.load_model(filepath)
yhat = model.predict(X_test).ravel()
print('MAE = ', np.sum(np.abs(y_test-yhat))/len(y_test))
print('mean grade prediction = ',np.mean(model.predict(X_train)))
print('mean grade = ', np.mean(y_train))

# Single .loc selection plus assign() avoids chained indexing followed by
# column assignment (a SettingWithCopyWarning risk).
errors = df.loc[y_test.index, ['Text', 'Grade']].assign(**{'Predicted Grade': yhat})
errors.sample(5)

In [None]:
def make_model3():
    """Build and compile the widest variant of the LSTM regression model.

    Architecture: the shared `embedding_layer`, an LSTM with 100 units that
    returns the full sequence, global max pooling, 0.5 dropout, then four
    L1/L2-regularised ReLU dense layers (128, 64, 32, 16 units), each
    followed by dropout (0.5, 0.5, 0.3, 0.3), and a single linear output.

    Returns:
        A compiled `Sequential` model using Adam (learning_rate=0.01,
        decay=1e-3) and mean absolute error as the loss.
    """
    # (dense units, dropout rate applied after that dense layer)
    dense_stack = ((128, 0.5), (64, 0.5), (32, 0.3), (16, 0.3))

    network = Sequential()
    for front_layer in (embedding_layer,
                        LSTM(100, return_sequences=True),
                        GlobalMaxPool1D(),
                        Dropout(0.5)):
        network.add(front_layer)

    for units, drop_rate in dense_stack:
        network.add(Dense(units,
                          activation='relu',
                          kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4)))
        network.add(Dropout(drop_rate))

    network.add(Dense(1))

    adam_opt = optimizers.Adam(learning_rate=.01, decay=1e-3)
    network.compile(optimizer=adam_opt, loss='mean_absolute_error', metrics=None)
    return network
# Build model 3 and show its architecture. summary() prints the table itself
# and returns None, so it is not wrapped in print().
model = make_model3()
model.summary()
filepath = 'model3-best.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                             save_best_only=True,
                             mode='min')
callbacks = [checkpoint]

# Training is currently disabled; the evaluation below relies on a checkpoint
# written to `filepath` by an earlier run. Uncomment to retrain.
# history = model.fit(X_train, y_train,
#                      batch_size=100,
#                      epochs=200,
#                      validation_split=.2,
#                     callbacks=callbacks)

# plt.plot(history.history['loss'], label='Training Loss')
# plt.plot(history.history['val_loss'], label='Validation Loss')
# plt.legend()
# plt.show()

# Load the checkpoint from the same path the callback writes to. A hard-coded
# '/content/...' path only resolves when the working directory is /content.
model = keras.models.load_model(filepath)
yhat = model.predict(X_test).ravel()
print('MAE = ', np.sum(np.abs(y_test-yhat))/len(y_test))
# Reuse the predictions already computed instead of running predict() again.
print('mean grade prediction = ',np.mean(yhat))
print('mean grade = ', np.mean(y_test))

# Single .loc selection plus assign() avoids chained indexing followed by
# column assignment (a SettingWithCopyWarning risk).
errors = df.loc[y_test.index, ['Text', 'Grade']].assign(**{'Predicted Grade': yhat})
errors.sample(5)