In [0]:
import re
import os
import numpy as np
import pandas as pd
import keras
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D
from keras.models import Model
from keras.callbacks.callbacks import EarlyStopping
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer 

from google.colab import drive
# Mount Google Drive at /content/drive; the CSV and GloVe files read later live under this mount point.
drive.mount('/content/drive')
# Download the NLTK 'punkt' tokenizer data (used by nltk's word_tokenize).
nltk.download('punkt')

Using TensorFlow backend.


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
# Locations of the prepared training and test splits on the mounted Drive.
train_csv = "/content/drive/My Drive/ipython notebooks/COSC-572/full_prepped_training.csv"
test_csv = "/content/drive/My Drive/ipython notebooks/COSC-572/prepped_test.csv"

# Load each split into its own DataFrame.
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [0]:
#before feeding into the model, preprocess text by using this function 
def prepare_text(text):
    """Normalize a sentence into lower-cased, space-separated word tokens.

    The text is split with ``word_tokenize``. Each token is lower-cased, en
    dashes are turned into hyphens, apostrophes into spaces, and every run of
    non-word characters (anything not matched by ``\\w``) is replaced by a
    single space. Tokens are not stemmed.

    Args:
        text: The raw sentence as a string.

    Returns:
        The cleaned sentence: tokens separated by single spaces, with no
        leading or trailing whitespace.
    """
    non_alpha_numeric = re.compile(r'\W+')
    whitespace = re.compile(r'\s+')

    cleaned_tokens = []
    for token in word_tokenize(text):
        # normalize all to lower case
        token = token.lower()
        # normalize special punctuation
        token = token.replace('–', '-')
        token = token.replace("'", ' ')
        # remove non-alphanumeric characters
        token = non_alpha_numeric.sub(' ', token)
        cleaned_tokens.append(token)

    # Join once, then collapse the whitespace runs left behind by the
    # substitutions above.
    return whitespace.sub(' ', ' '.join(cleaned_tokens)).strip()

In [0]:
# Clean the 'edited_sentence' column of both splits with prepare_text(),
# overwriting the column in place.
for frame in (df_train, df_test):
    frame['edited_sentence'] = frame['edited_sentence'].transform(func=prepare_text)

In [0]:
# function for finding the length of the longest sentence in terms of number of words
def find_maxlen(df, column_name):
    """Return the largest whitespace-separated word count in a text column.

    Args:
        df: DataFrame holding the text column.
        column_name: Name of the column whose values are strings.

    Returns:
        The maximum number of words found in any single value of the column.
    """
    def count_words(sentence):
        return len(sentence.split())

    words_per_row = df[column_name].map(count_words)
    return words_per_row.max()
# Report the longest sentence (in words) for the training split, then the test split.
for frame in (df_train, df_test):
    print(find_maxlen(frame, 'edited_sentence'))


23
22


In [0]:
# Fixed sequence length that every sentence is padded to; chosen above the
# longest sentences reported for the two splits (23 and 22 words).
max_length = 30

In [0]:
# Map every word to an integer index and pad each sentence to max_length,
# so the sequences can be fed to an embedding layer.
tokenizer = Tokenizer()
# The vocabulary is fitted on the training sentences first, then on the test sentences.
for frame in (df_train, df_test):
    tokenizer.fit_on_texts(frame['edited_sentence'])
# word_index numbering starts at 1, so one extra slot is needed for index 0.
vocab_size = len(tokenizer.word_index) + 1

train_sequences = tokenizer.texts_to_sequences(df_train['edited_sentence'])
test_sequences = tokenizer.texts_to_sequences(df_test['edited_sentence'])
# padding='post' appends the zeros after the words of each sentence.
edited_docs_train = pad_sequences(train_sequences, maxlen=max_length, padding='post')
edited_docs_test = pad_sequences(test_sequences, maxlen=max_length, padding='post')

print(f'max length {max_length}')
print(f'vocab size {vocab_size}')
for padded in (edited_docs_train, edited_docs_test):
    print(padded.shape)


max length 30
vocab size 17580
(17900, 30)
(3024, 30)


In [0]:
# Read the GloVe text file into a dict mapping each word to its float32 vector.
embeddings_index = dict()
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform's default text encoding.
with open('/content/drive/My Drive/ipython notebooks/Embeddings/GloVe/glove.42B.300d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <coef_1> ... <coef_n>", whitespace-separated.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [0]:
# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Rows for words missing from GloVe (and the unused row 0) stay all zeros.
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    # Stop as soon as an index falls outside the matrix.
    if i >= vocab_size:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

print(f'embedding matrix shape: {embedding_matrix.shape}')

embedding matrix shape: (17580, 300)


In [0]:
# Training inputs are the padded index sequences; targets are the 'meanGrade' column.
X_train, y_train = edited_docs_train, df_train['meanGrade']
for part in (X_train, y_train):
    print(part.shape)

(17900, 30)
(17900,)


In [0]:
# Test inputs are the padded index sequences; targets are the 'meanGrade' column.
X_test, y_test = edited_docs_test, df_test['meanGrade']
for part in (X_test, y_test):
    print(part.shape)

(3024, 30)
(3024,)


In [0]:
# The network consumes padded sequences of integer word indices, so the input
# is declared with an integer dtype.
edited_input = Input(shape=(max_length,), dtype='int32', name='edited_input')
# Frozen embedding layer initialised from the GloVe-based embedding_matrix.
edited_embedding = Embedding(input_dim=vocab_size, output_dim=300, input_length=max_length, weights=[embedding_matrix], trainable=False)(edited_input)

# One convolution branch per kernel width; every branch reads the same
# embedded sequence and is reduced to a fixed-size vector by max pooling.
convs = []
filter_sizes = [2,3,4,5,6]

for filter_size in filter_sizes:
    # A single 32-filter convolution per branch. A 128-filter layer used to be
    # created here too, but its output was overwritten immediately and never
    # reached the model, so it has been dropped.
    conv_for_filter = Conv1D(filters=32, kernel_size=filter_size, activation='relu')(edited_embedding)
    conv_for_filter = GlobalMaxPool1D()(conv_for_filter)
    conv_for_filter = Dropout(0.5)(conv_for_filter)
    convs.append(conv_for_filter)

# Merge the pooled branches and pass them through two dense + dropout stages.
combined_x = keras.layers.concatenate(convs, axis=1)
combined_x = Dense(128, activation='relu')(combined_x)
combined_x = Dropout(0.5)(combined_x)
combined_x = Dense(128, activation='relu')(combined_x)
combined_x = Dropout(0.5)(combined_x)

# Single-unit regression head.
# NOTE(review): ReLU keeps predictions non-negative, but a single ReLU output
# unit can get stuck predicting 0; consider a linear activation - confirm intent.
combined_output = Dense(1, activation='relu', name='combined_output')(combined_x)

model = Model(inputs=edited_input, outputs=combined_output)


In [0]:
# Print a layer-by-layer overview of the model (output shapes, parameter counts, connections).
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
edited_input (InputLayer)       (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 30, 300)      5274000     edited_input[0][0]               
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 29, 64)       38464       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 28, 64)       57664       embedding_1[0][0]                
____________________________________________________________________________________________

In [0]:
# Adam optimizer with a learning rate of 3e-4.
opt = keras.optimizers.Adam(learning_rate=3e-4)

In [0]:
# Train on mean squared error; the same quantity is also reported as the 'mse' metric.
model.compile(optimizer=opt, loss = "mean_squared_error", metrics = ['mse'])

In [0]:
# Stop training once val_loss has failed to improve by at least 1e-4 for
# 8 epochs, and restore the weights from the best epoch.
min_loss_change = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=8, verbose=1, restore_best_weights=True)

# NOTE(review): the test split doubles as the validation set here, so early
# stopping and best-weight selection are tuned on the data later used for
# evaluation; consider holding out part of the training data instead.
history = model.fit(
          {'edited_input': X_train},
          {'combined_output': y_train},
          epochs=30,
          batch_size=16,
          callbacks=[min_loss_change],
          # validation_data is documented as a tuple (x_val, y_val); a list is
          # not reliably unpacked the same way across Keras versions.
          validation_data=({'edited_input': X_test}, {'combined_output': y_test})
          )

Train on 17900 samples, validate on 3024 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Restoring model weights from the end of the best epoch
Epoch 00012: early stopping


In [0]:
# Persist the trained model to 'considerboth.h5' in the current working directory.
model.save('considerboth.h5')