In [None]:
# Code Citations
# Ridley, R., He, L., Dai, X., Huang, S., & Chen, J. (2020). Prompt agnostic essay scorer: a domain generalization approach to cross-prompt automated essay scoring. arXiv preprint arXiv:2008.01441.
# Pethani, M. (2019) Automated Essay Scoring: Kaggle Competition — End to End Project Implementation. Medium. Retrieved from https://medium.com/@mayurmorin/automated-essay-scoring-kaggle-competition-end-to-end-project-implementation-part-1-b75a043903c4

In [1]:
# Imports
import re
import os
import pandas as pd
import numpy as np
import readability
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from gensim.models import Word2Vec
from keras.layers import Input, Concatenate
from keras.models import Model
from keras.layers import LSTM, Dropout, Dense
from tqdm.keras import TqdmCallback
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import cohen_kappa_score
nltk.download('stopwords')
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Oscar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Oscar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
#General Settings
testSize = 0.1      #Validation set: passed as test_size to train_test_split
seed = 42           #Seed: random_state of the train/validation split
numEpochs = 500     #Epochs passed to model.fit
batchSize = 64      #Batch Size passed to model.fit
ignore_warnings = True  #When True, a later cell silences Python warnings

#Word vector settings (forwarded to gensim Word2Vec in the fit cell)
num_features =  500  #Dimension of the word vector (Word2Vec vector_size and model input width)
min_word_count = 5   #Minimum recurrence of word to be included (Word2Vec min_count)
num_workers = 4      #Word2Vec workers
context = 30         #Word2Vec window
downsampling = 1e-3  #Word2Vec sample
MAX_SENTLEN = 100    #Maximum tokens per sentence; read by text_tokenizer

#Writing features to use
#Names of readability sub-measures kept as handcrafted features
keepCats =  ['Kincaid' , 'complex_words', 'type_token_ratio', 'words', 'wordtypes', 'subordination', 'conjunction', 'preposition'] 

#File Names
saveName = 'DENSE_3004'  #Prefix of every file this notebook writes
#Training data; later cells read its 'Text' and 'Grade' columns
X = pd.read_csv('./Data/train.csv')

#Save settings
#keepCats is wrapped in an outer list so the settings frame has a single row
dsettings = {'num_features': num_features,'MAX_SENTLEN': MAX_SENTLEN, 'keepCats': [keepCats]}
sdf = pd.DataFrame(data=dsettings)
sdf.to_csv(saveName+ '_settings.csv')

In [5]:
#Ignore warnings
#Only runs when the ignore_warnings switch from the settings cell is truthy
if ignore_warnings:
    import warnings
    #Suppress every Python warning for the rest of the session
    warnings.filterwarnings("ignore")
    import os
    #NOTE(review): keras is already imported by the imports cell; TensorFlow
    #normally reads this variable when it is first loaded, so setting it here
    #may have no effect -- confirm, or move it above the keras import.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [7]:
#Writing Features Functions (Ridley et al., 2020)
#Seed the 'nonseq_input' column with a two-element list [Grade, Grade] per row.
#These values are placeholders: the feature-generation cell later stores the
#real per-essay feature list in this column (presumably the column is created
#here so that it already holds list objects -- confirm).
X["nonseq_input"] = X.apply(lambda x: [x["Grade"]] + [x["Grade"]], axis=1)

def replace_url(text):
    """Replace simple web addresses in ``text`` with the token ``<url>``.

    An address is an optional ``http://``/``https://`` prefix, an optional
    ``www.``, an alphanumeric host name, and one of the suffixes ``.com``,
    ``.com.cn`` or ``.org``. Anything after the suffix (a path, for example)
    is left in place.

    Args:
        text: Input string.

    Returns:
        A copy of ``text`` with every match replaced by ``<url>``.
    """
    # Raw string: the pattern contains ``\.`` which is an invalid escape
    # sequence in a normal string literal (a warning on recent Python versions).
    return re.sub(r'(http[s]?://)?((www)\.)?([a-zA-Z0-9]+)\.{1}((com)(\.(cn))?|(org))', '<url>', text)

def tokenize(string):
    """Word-tokenize ``string`` and glue each standalone '@' onto the next token.

    When a token is exactly '@' and another token follows it, the following
    token is rewritten as '@' plus its text up to (not including) the first
    digit, and the lone '@' is removed. A trailing '@' is left as is.

    Returns:
        The list of tokens after merging.
    """
    tokens = nltk.word_tokenize(string)
    pos = 0
    # The list shrinks while it is scanned, so its length is re-read each pass.
    while pos < len(tokens):
        following = pos + 1
        if tokens[pos] == '@' and following < len(tokens):
            tokens[following] = '@' + re.sub('[0-9]+.*', '', tokens[following])
            del tokens[pos]
        # After a merge the merged token now sits at ``pos``; it is stepped
        # over and not examined again.
        pos += 1
    return tokens

def shorten_sentence(sent, max_sentlen):
    """Split one sentence into token lists of at most ``max_sentlen`` tokens.

    The sentence is stripped and word-tokenized. If it already fits, it is
    returned as a single token list. Otherwise it is first cut in front of
    every occurrence of a split keyword ('because', 'but', 'so' and a set of
    capitalised pronouns); when no keyword occurs it is cut at multiples of
    ``max_sentlen``. Any resulting piece that is still too long is cut again
    into consecutive runs of ``max_sentlen`` tokens.

    Empty pieces are never emitted for an over-long sentence. (Previously an
    empty list was appended after every re-split piece, and also whenever a
    keyword was the first token or the rounded cut point lay past the end.)

    Args:
        sent: Sentence text.
        max_sentlen: Maximum number of tokens per returned list.

    Returns:
        A list of token lists. A sentence that fits is returned as
        ``[tokens]`` even when ``tokens`` is empty.
    """
    tokens = nltk.word_tokenize(sent.strip())
    if len(tokens) <= max_sentlen:
        return [tokens]

    split_keywords = ['because', 'but', 'so', 'You', 'He', 'She', 'We', 'It', 'They', 'Your', 'His', 'Her']
    k_indexes = [i for i, key in enumerate(tokens) if key in split_keywords]
    if not k_indexes:
        # No keyword to split on: fall back to cuts at multiples of max_sentlen.
        num = int(round(len(tokens) / max_sentlen))
        k_indexes = [(i+1)*max_sentlen for i in range(num)]

    # First pass: [0:k0], [k0:k1], ..., [k_last:]. A cut point beyond the end
    # simply produces an empty slice, which the second pass drops.
    boundaries = [0] + k_indexes + [len(tokens)]
    processed_tokens = [tokens[start:stop] for start, stop in zip(boundaries, boundaries[1:])]

    # Second pass: cut every piece into runs of max_sentlen; range() yields
    # nothing for an empty piece, so no empty list reaches the result.
    new_tokens = []
    for piece in processed_tokens:
        for start in range(0, len(piece), max_sentlen):
            new_tokens.append(piece[start:start + max_sentlen])
    return new_tokens

def tokenize_to_sentences(text, max_sentlength, create_vocab_flag=False):
    """Break ``text`` into sentences and return their tokens.

    The text is split at whitespace that follows '.', '!' or '?' (subject to
    two look-behind exclusions). A fragment in which a capitalised word
    directly follows '.', '!', '?' or ',' is split around that word, rejoined
    with spaces, and split into sentences once more.

    Args:
        text: Text to segment.
        max_sentlength: Token limit handed to ``shorten_sentence``.
        create_vocab_flag: If true, return one flat list of tokens produced
            by ``tokenize``; otherwise return a list of token lists produced
            by ``shorten_sentence``.
    """
    processed_sents = []
    for fragment in re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\!|\?)\s', text):
        glued_word = re.search(r'(?<=\.{1}|\!|\?|\,)(@?[A-Z]+[a-zA-Z]*[0-9]*)', fragment)
        if glued_word is None:
            processed_sents.append(fragment)
            continue
        # Put spaces around the capitalised word, then look for sentence
        # boundaries again in the re-spaced text.
        pieces = re.split(r'(?=.{2,})(?<=\.{1}|\!|\?|\,)(@?[A-Z]+[a-zA-Z]*[0-9]*)', fragment)
        respaced = " ".join(pieces)
        processed_sents.extend(re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\!|\?)\s', respaced))

    if create_vocab_flag:
        return [word for sentence in processed_sents for word in tokenize(sentence)]

    sent_tokens = []
    for sentence in processed_sents:
        sent_tokens.extend(shorten_sentence(sentence, max_sentlength))
    return sent_tokens

def text_tokenizer(text, replace_url_flag=True, tokenize_sent_flag=True, create_vocab_flag=False):
    """Normalise an essay and split it into sentence token lists.

    Steps: optionally replace URLs with ``<url>``, drop double quotes,
    collapse runs of '...', '??' and '!!' (including runs separated only by
    whitespace) to '...', '?' and '!', word-tokenize, then pass the
    space-joined tokens to ``tokenize_to_sentences`` with the module-level
    ``MAX_SENTLEN``.

    Args:
        text: Raw essay text.
        replace_url_flag: When true (default) URLs are replaced. This flag
            used to be ignored and URLs were always replaced.
        tokenize_sent_flag: Must be true; the non-sentence mode does not exist.
        create_vocab_flag: Forwarded to ``tokenize_to_sentences``.

    Returns:
        Whatever ``tokenize_to_sentences`` returns for the cleaned text.

    Raises:
        NotImplementedError: If ``tokenize_sent_flag`` is false.
    """
    if not tokenize_sent_flag:
        raise NotImplementedError

    if replace_url_flag:
        text = replace_url(text)
    text = text.replace(u'"', u'')
    # Each substitution is a no-op when the punctuation run is absent.
    text = re.sub(r'\.{3,}(\s+\.{3,})*', '...', text)
    text = re.sub(r'\?{2,}(\s+\?{2,})*', '?', text)
    text = re.sub(r'\!{2,}(\s+\!{2,})*', '!', text)

    tokens = tokenize(text)
    return tokenize_to_sentences(" ".join(tokens), MAX_SENTLEN, create_vocab_flag)

In [8]:
#Generate Writing Features (Ridley et al., 2020)
from spellchecker import SpellChecker
spell = SpellChecker()

# Defined before the loop so the summary print below also works for an empty frame.
cats = []
for index, row in X.iterrows():
    content = row['Text']

    # One tokenized sentence per line, which is the text handed to readability.
    sent_tokens = text_tokenizer(content, replace_url_flag=True, tokenize_sent_flag=True)
    sentences = ''.join(' '.join(sent) + '\n' for sent in sent_tokens)
    readability_scores = readability.getmeasures(sentences, lang='en')

    # Keep every sub-measure whose name is listed in keepCats. The same name
    # can occur under more than one readability group; it is then kept once
    # per group, so the feature count can exceed len(keepCats).
    features = []
    cats = []
    for cat in readability_scores.keys():
        for subcat in readability_scores[cat].keys():
            if subcat in keepCats:
                cats.append(subcat)
                features.append(readability_scores[cat][subcat])

    # Last feature: number of distinct words the spell checker does not know.
    # Words containing an apostrophe or shorter than four characters are skipped.
    # NOTE(review): replace('etc', '') also removes 'etc' inside longer words
    # (e.g. 'sketch'), which can create spurious misspellings -- confirm intent.
    sentences = sentences.replace('\n', ' ').replace('\r', '').replace('etc', '')
    words = sentences.split(" ")
    words = [x for x in words if '\'' not in x and len(x)>3]
    misspelled = spell.unknown(words)
    features.append(len(misspelled))

    # Write back under the row's own label rather than a separate running
    # counter, so the result is also correct when X does not have a 0..n-1 index.
    X.at[index, 'nonseq_input'] = features

print('Number features:', len(X.iloc[1]['nonseq_input']))
print('Categories:', cats)

Number features: 11
Categories: ['Kincaid', 'type_token_ratio', 'words', 'wordtypes', 'complex_words', 'conjunction', 'preposition', 'subordination', 'conjunction', 'preposition']


In [9]:
#Functions to extract word vectors (Pethani, M.,2019)
def essay_to_wordlist(essay_v, remove_stopwords):
    """Turn an essay into a list of lower-case alphabetic words.

    Every character other than a-z / A-Z is replaced by a space before the
    text is lower-cased and split on whitespace. When ``remove_stopwords`` is
    truthy, English stop words are filtered out.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", essay_v)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]

def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and word-tokenize each one.

    The stripped essay is segmented with the NLTK punkt English tokenizer;
    every non-empty sentence is converted with ``essay_to_wordlist``.

    Returns:
        A list with one word list per sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        essay_to_wordlist(chunk, remove_stopwords)
        for chunk in punkt.tokenize(essay_v.strip())
        if len(chunk) > 0
    ]

def makeFeatureVec(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one essay.

    Args:
        words: Iterable of word strings.
        model: Object exposing ``wv.index_to_key`` (the vocabulary) and
            ``wv.get_vector(word)``.
        num_features: Length of a word vector.

    Returns:
        The mean vector of all words found in the vocabulary. If none of the
        words is in the vocabulary (or ``words`` is empty) the zero vector is
        returned; dividing by the zero word count would otherwise yield a
        vector of NaNs.
    """
    featureVec = np.zeros((num_features,),dtype="float32")
    num_words = 0.
    index2word_set = set(model.wv.index_to_key)
    for word in words:
        if word in index2word_set:
            num_words += 1
            featureVec = np.add(featureVec,model.wv.get_vector(word))
    if num_words == 0:
        # Nothing to average: keep the zeros instead of producing 0/0 = NaN.
        return featureVec
    return np.divide(featureVec,num_words)

def getAvgFeatureVecs(essays, model, num_features):
    """Build the matrix of averaged word vectors, one row per essay.

    Args:
        essays: Sequence of word lists.
        model: Word-vector model forwarded to ``makeFeatureVec``.
        num_features: Length of a word vector.

    Returns:
        A float32 array of shape ``(len(essays), num_features)``.
    """
    essayFeatureVecs = np.zeros((len(essays), num_features), dtype="float32")
    for row, essay in enumerate(essays):
        essayFeatureVecs[row] = makeFeatureVec(essay, model, num_features)
    return essayFeatureVecs

In [10]:
# Dense Neural network
# Width of the handcrafted-feature input, taken from the stored feature list of one row.
num_non_seq_features = len(X.iloc[1]['nonseq_input'])
def get_model(num_features):
    """Build and compile the two-input dense scoring network.

    The averaged word vector goes through two ReLU dense layers
    (``num_features`` and 256 units), each followed by 50% dropout. The result
    is concatenated with the handcrafted features (width taken from the
    module-level ``num_non_seq_features``), passed through a 128-unit ReLU
    layer with 30% dropout, and mapped to a single sigmoid output.

    Args:
        num_features: Length of the averaged word vector.

    Returns:
        A compiled Keras ``Model`` taking ``[word_vectors, handcrafted]`` and
        trained with binary cross-entropy, the Adam optimizer and an MAE metric.
    """
    #Input
    input_layer = Input(shape=(num_features, ))

    #First Layers
    hidden_layer = Dense(num_features, activation='relu')(input_layer)
    dropout_layer = Dropout(0.5)(hidden_layer)
    hidden_layer = Dense(256, activation='relu')(dropout_layer)
    dropout_layer = Dropout(0.5)(hidden_layer)
    # A Dense(64) layer used to be created here, but its output was never
    # connected to anything that reaches the model output, so it was not part
    # of the model. It has been removed; the network is unchanged.
    # NOTE(review): if the 64-unit layer was meant to feed the concatenation,
    # wire it in deliberately and retrain.

    #Concatenate
    non_seq_input = Input(shape=(num_non_seq_features,))
    concat_layer = Concatenate()([dropout_layer, non_seq_input])

    #Second layers
    hidden_layer = Dense(128, activation='relu')(concat_layer)
    dropout_layer = Dropout(0.3)(hidden_layer)

    #Output
    output_layer = Dense(1, activation='sigmoid')(dropout_layer)

    model = Model(inputs=[input_layer, non_seq_input], outputs=output_layer)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['mae'])
    return model

In [11]:
#Train/Validation Split
# The target is the 'Grade' column; the whole frame is split so that the essay
# text and the handcrafted features stay aligned with it.
y = X['Grade']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=testSize, random_state=seed
)

# Essay text for the word-vector branch
train_essays, test_essays = X_train['Text'], X_test['Text']

# Per-essay feature lists converted to arrays
# (2-D when every row holds the same number of features)
train_X_nonseq = np.asarray(X_train["nonseq_input"].tolist())
test_X_nonseq = np.asarray(X_test["nonseq_input"].tolist())

In [12]:
#Fit model
print('Train Essays:', len(X_train))
print('Test Essays:', len(X_test))

# Word2Vec is trained on sentence-level word lists from the training essays only.
sentences = []
for essay in train_essays:
    sentences += essay_to_sentences(essay, remove_stopwords = True)

print("Training Word2Vec Model...")
model = Word2Vec(sentences, workers=num_workers, vector_size=num_features, min_count = min_word_count, window = context, sample = downsampling)
# NOTE(review): init_sims is deprecated in gensim 4 (the API this cell already
# uses via vector_size / index_to_key) -- confirm whether the in-place
# normalisation it performs is still wanted before replacing or removing it.
model.init_sims(replace=True)
model.wv.save_word2vec_format(saveName + '_voc.bin', binary=True)
print('Vocabulary Size:', len(model.wv.index_to_key))
print('Saved Word2Vec:', saveName + '_voc.bin')

# One averaged word vector per training essay
clean_train_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in train_essays]
trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)

trainDataVecs = np.array(trainDataVecs)
print(trainDataVecs.shape)

# The variable keeps its historical name; get_model builds a dense network.
lstm_model = get_model(num_features)
lstm_model.fit([trainDataVecs, train_X_nonseq], y_train, batch_size=batchSize, epochs=numEpochs, verbose=0, callbacks=[TqdmCallback(verbose=1)])

# Report the save only after it has succeeded.
lstm_model.save(saveName + '.h5')
print('Saved Model:', saveName + '.h5')

Train Essays: 1575
Test Essays: 175
Training Word2Vec Model...
Vocabulary Size: 3164
Saved Word2Vec: DENSE_3004_voc.bin
(1575, 500)


100%|██████████| 500/500 [01:04<00:00,  7.78epoch/s, loss=0.602, mae=0.0651]


Saved Model: DENSE_3004.h5


In [13]:
#Predicting Validation Essays
# Clean the held-out essays the same way as the training essays, average their
# word vectors, and score them together with their handcrafted features.
clean_test_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in test_essays]
testDataVecs = np.array(getAvgFeatureVecs(clean_test_essays, model, num_features))
y_pred = lstm_model.predict([testDataVecs, test_X_nonseq])



In [14]:
#MAE
# Flatten the predictions to 1-D and build one NaN mask that is applied to
# BOTH predictions and targets. Filtering only the predictions (as before)
# shifts every later prediction onto the wrong target as soon as a single NaN
# is dropped.
y_pred_flat = np.ravel(y_pred)
valid_mask = ~np.isnan(y_pred_flat)

nanValuesTrack = len(y_pred)
y_pred_clean = y_pred_flat[valid_mask]
y_test_clean = [target for target, keep in zip(y_test.to_list(), valid_mask) if keep]
print('NaN predictions:', nanValuesTrack - len(y_pred_clean))
print(len(y_pred_clean))

# Both values are multiplied by 10 before comparison and reporting.
inspectionList = []
mae = 0
for pred, test in zip(y_pred_clean, y_test_clean):
    mae += np.abs(pred*10 - test*10)
    inspectionList.append([pred*10, test*10])
# Avoid a ZeroDivisionError when every prediction was NaN.
mae = mae/len(y_pred_clean) if len(y_pred_clean) else float('nan')

print('MAE:', mae)
print(inspectionList)

NaN predictions: 0
175
MAE: 0.6638128166539329
[[7.386829257011414, 6.0], [8.092232942581177, 9.0], [7.799837589263916, 8.0], [6.654412746429443, 6.0], [7.3316216468811035, 8.0], [6.957600712776184, 7.0], [7.610688209533691, 7.0], [7.364897727966309, 8.0], [8.478503227233887, 8.0], [8.042458295822144, 8.0], [7.087007164955139, 5.0], [6.671549677848816, 7.0], [6.897796988487244, 8.0], [6.955739855766296, 7.0], [7.606330513954163, 9.0], [5.678170323371887, 6.0], [5.470612645149231, 3.0], [7.103715538978577, 6.0], [7.26502537727356, 7.0], [7.212899923324585, 8.0], [7.175543904304504, 7.0], [5.862725377082825, 5.0], [7.857930064201355, 8.0], [7.1925950050354, 8.0], [5.244874358177185, 4.0], [3.3785131573677063, 6.0], [6.2250107526779175, 6.0], [6.938796043395996, 6.0], [5.68595290184021, 6.0], [8.300674557685852, 8.0], [6.568998098373413, 7.0], [7.437593340873718, 8.0], [7.832106351852417, 8.0], [7.519814968109131, 8.0], [6.985836029052734, 8.0], [7.067549228668213, 6.0], [7.05170214176178

In [15]:
#QWK
# Pair each prediction with its own target: the NaN mask is computed from the
# raw predictions and applied to both sides, so dropping a NaN prediction can
# no longer shift the remaining predictions onto the wrong targets.
pred_values = np.ravel(y_pred)
valid_mask = ~np.isnan(pred_values)

# Scores are multiplied by 10 and rounded to integers before agreement is measured.
preds = [int(round(i*10)) for i in pred_values[valid_mask]]
actuals = [int(round(i*10)) for i, keep in zip(y_test.to_list(), valid_mask) if keep]
print(list(zip(preds,actuals)))
print('QWK:', cohen_kappa_score(preds, actuals, weights='quadratic'))

[(7, 6), (8, 9), (8, 8), (7, 6), (7, 8), (7, 7), (8, 7), (7, 8), (8, 8), (8, 8), (7, 5), (7, 7), (7, 8), (7, 7), (8, 9), (6, 6), (5, 3), (7, 6), (7, 7), (7, 8), (7, 7), (6, 5), (8, 8), (7, 8), (5, 4), (3, 6), (6, 6), (7, 6), (6, 6), (8, 8), (7, 7), (7, 8), (8, 8), (8, 8), (7, 8), (7, 6), (7, 6), (9, 9), (6, 6), (7, 8), (7, 8), (7, 6), (7, 8), (8, 7), (6, 4), (7, 7), (8, 8), (7, 7), (8, 8), (7, 7), (7, 8), (6, 6), (9, 9), (7, 7), (8, 6), (7, 6), (8, 8), (8, 6), (6, 5), (5, 4), (7, 8), (7, 8), (7, 8), (8, 8), (7, 6), (8, 8), (7, 5), (5, 5), (7, 8), (5, 4), (8, 9), (8, 9), (7, 7), (7, 7), (7, 8), (7, 7), (7, 6), (5, 4), (8, 8), (7, 8), (5, 6), (6, 6), (6, 4), (7, 7), (8, 8), (2, 3), (7, 8), (7, 7), (8, 8), (7, 6), (6, 6), (7, 7), (7, 7), (7, 8), (7, 7), (8, 7), (7, 7), (8, 8), (7, 6), (7, 6), (7, 6), (8, 8), (7, 7), (5, 5), (6, 5), (6, 6), (7, 7), (6, 4), (4, 5), (7, 7), (5, 6), (8, 8), (5, 4), (8, 8), (7, 8), (8, 8), (7, 5), (7, 8), (7, 8), (7, 7), (8, 9), (7, 6), (8, 7), (8, 8), (8, 8),