In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy

import tensorflow as tf

import keras
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding, Flatten, \
Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.layers.experimental.preprocessing import TextVectorization
from keras.models import Sequential, load_model
from keras import initializers, regularizers, optimizers, layers
from keras.utils import plot_model
from keras.callbacks import ModelCheckpoint
from keras.initializers import Constant


# !pip install -q -U keras-tuner
# import kerastuner as kt

import IPython
from wordcloud import WordCloud
import seaborn as sns

from sklearn.model_selection import train_test_split

import re

from nltk import pos_tag
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer 

from gensim.models import word2vec

from IPython.display import display 

import seaborn as sns
import matplotlib.pyplot as plt
from src import load_text, get_word_index

!python -m spacy download en_core_web_lg
import en_core_web_lg


# Plot styling: seaborn's "notebook" context on a white grid.
sns.set(context='notebook', style='whitegrid')

# pandas display: never truncate cell text, and show at most 50 rows.
_display_options = {
    'display.max_colwidth': None,
    'display.max_rows': 50,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

https://towardsdatascience.com/hands-on-nlp-deep-learning-model-preparation-in-tensorflow-2-x-2e8c9f3c7633

GloVe embeddings thanks to Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. *GloVe: Global Vectors for Word Representation*.




Thanks to the Keras example https://keras.io/examples/nlp/pretrained_word_embeddings/ and to Kefei Mo (https://towardsdatascience.com/hands-on-nlp-deep-learning-model-preparation-in-tensorflow-2-x-2e8c9f3c7633) for the approach used in the cells below.

In [None]:
# Load the corpus through the project helper; the frame exposes a `Text`
# column (model input) and a `Grade` column (regression target).
df = load_text(sentences=True, grammarize=False)

# The original call passed `text`, a name that is only assigned further down
# by the word-cloud loop, so this cell raised NameError on a fresh kernel.
# Build the corpus string here instead.
# NOTE(review): assumes get_word_index accepts one whitespace-joined string,
# which is the kind of value `text` held when assigned by the word-cloud
# loop — confirm against src.get_word_index.
corpus_text = ' '.join(df.Text)
word_index = get_word_index(corpus_text)

X = df.Text
y = df.Grade

In [None]:
# Number of rows per grade. Both the labels and the heights come from the same
# sorted Series: `y.unique()` is in order of appearance while
# `y.value_counts()` is ordered by descending count, so pairing the two
# attached counts to the wrong grade labels.
grade_counts = y.value_counts().sort_index()
ax = sns.barplot(x=grade_counts.index, y=grade_counts.values)
ax.set(xlabel='Grade', ylabel='Count');

In [None]:
# Render one word cloud per grade, in ascending grade order.
grade_levels = sorted(df.Grade.unique())
for grade in grade_levels:
    # All texts of this grade, concatenated into one space-separated string.
    in_grade = df.Grade == grade
    text = ' '.join(df.loc[in_grade, 'Text'])
    wordcloud = WordCloud().generate(text)

    plt.figure(figsize=(15, 12))
    plt.imshow(wordcloud)
    plt.title('Grade {}'.format(grade))
    plt.axis('off')
    plt.show()

In [None]:
# Longest text in the corpus, measured in characters. This uses `X` rather
# than `X_train` because the train/test split that creates `X_train` only runs
# in a later cell, so referencing it here fails on a fresh kernel.
X.str.len().max()

In [None]:
# Hold out 20% of the data for testing, then 20% of the remaining training
# data for validation (fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
X_t, X_val, y_t, y_val = train_test_split(X_train, y_train, random_state=42, test_size=0.2)

# Fit the vectorizer on the training text and extract the corpus vocabulary.
# `output_sequence_length` is measured in tokens, not characters, so size it
# from the whitespace-delimited word count of the longest training text.
# (The previous character count padded every sequence far beyond any real
# sentence, so the LSTM downstream processed mostly padding.)
longest_sentence = int(X_train.str.split().str.len().max())
Vectorizer = TextVectorization(output_sequence_length=longest_sentence)
Vectorizer.adapt(X_train.to_numpy())
vocab = Vectorizer.get_vocabulary()
nlp = en_core_web_lg.load()

# Generate the embedding matrix: row i holds the spaCy vector of vocab[i].
num_tokens = len(vocab)
embedding_dim = len(nlp('The').vector)
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for i, word in enumerate(vocab):
    embedding_matrix[i] = nlp(word).vector

# Load the embedding matrix as the weights of the embedding layer and freeze
# it so the pretrained vectors are not updated during training.
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False)
# The model cell refers to `embedding_layer`; the original capitalised name is
# kept as an alias so any cell using it keeps working.
Embedding_layer = embedding_layer


In [None]:
# --- Bag-of-words MLP baseline -------------------------------------------
# Count-vectorize the raw strings over a vocabulary capped at 2000 tokens.
Vectorizer2 = TextVectorization(output_mode='count', max_tokens=2000)
Vectorizer2.adapt(X_train.to_numpy())

model = Sequential()
model.add(Input(shape=(1,), dtype=tf.string))
model.add(Vectorizer2)
# Ten ReLU hidden layers (4x128, 3x64, 3x32), each followed by 20% dropout.
for units in [128] * 4 + [64] * 3 + [32] * 3:
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(rate=.2))
# Linear output for regression, matching the LSTM model's output layer. A ReLU
# here can get stuck predicting a constant 0 with zero gradient if the
# pre-activation goes negative.
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_absolute_error')

# Keep only the weights with the lowest validation loss.
filepath = 'best-MLP'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                             save_best_only=True,
                             mode='min',
                             save_format='tf'
                             )
callbacks = [checkpoint]

history = model.fit(X_train,
            y_train,
            epochs=100,
            batch_size=100,
            validation_split = .2,
            verbose=1,
            callbacks=callbacks)

plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.show()

# Evaluate the best checkpoint (not the last epoch) on the held-out test set.
model = keras.models.load_model(filepath)
yhat = model.predict(X_test).ravel()
print('MAE = ', np.sum(np.abs(y_test-yhat))/len(y_test))
# Reuse `yhat` instead of running a second forward pass over the test set.
print('mean grade prediction = ', np.mean(yhat))
print('mean grade = ', np.mean(y_test))

# Explicit copy so that adding the prediction column does not write into a
# slice of `df` (avoids SettingWithCopyWarning).
errors = df.loc[y_test.index, ['Text', 'Grade']].copy()
errors['Predicted Grade'] = yhat
errors.sample(5)

In [None]:
# --- LSTM over frozen pretrained embeddings -------------------------------
filepath = 'model-best'
model = Sequential()
model.add(Input(shape=(1,), dtype=tf.string))
model.add(Vectorizer)
# The embedding layer is defined as `Embedding_layer` in the preprocessing
# cell; the lowercase name used here before did not exist (NameError).
model.add(Embedding_layer)
model.add(LSTM(100, return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.5))
# Regularised dense head: (units, dropout rate applied after the layer).
for units, drop_rate in [(128, 0.5), (64, 0.5), (32, 0.3), (16, 0.3)]:
    model.add(Dense(units, activation='relu',
                    kernel_regularizer = regularizers.l1_l2(l1=1e-5, l2=1e-4)))
    model.add(Dropout(drop_rate))

# Single linear unit: the model regresses the grade directly.
model.add(Dense(1))

adam = optimizers.Adam(learning_rate=.01, decay=1e-3)
model.compile(optimizer = adam, loss = 'mean_absolute_error', metrics = None)

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that emitted a stray "None").
model.summary()

# Keep only the weights with the lowest validation loss.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss',verbose=1,
                             save_best_only=True,
                             mode='min',
                             save_format='tf')
callbacks = [checkpoint]

history = model.fit(X_train, y_train,
                     batch_size=100,
                     epochs=100,
                     validation_split=.2,
                    callbacks=callbacks
                    )

plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.show()

# Evaluate the best checkpoint (not the last epoch) on the held-out test set.
model = keras.models.load_model(filepath)
yhat = model.predict(X_test).ravel()
print('MAE = ', np.sum(np.abs(y_test-yhat))/len(y_test))
# Reuse `yhat` instead of running a second forward pass over the test set.
print('mean grade prediction = ', np.mean(yhat))
print('mean grade = ', np.mean(y_test))

# Explicit copy so that adding the prediction column does not write into a
# slice of `df` (avoids SettingWithCopyWarning).
errors = df.loc[y_test.index, ['Text', 'Grade']].copy()
errors['Predicted Grade'] = yhat
errors.sample(5)