In [None]:
# Code Citations
# Ridley, R., He, L., Dai, X., Huang, S., & Chen, J. (2020). Prompt agnostic essay scorer: a domain generalization approach to cross-prompt automated essay scoring. arXiv preprint arXiv:2008.01441.
# Pethani, M. (2019) Automated Essay Scoring: Kaggle Competition — End to End Project Implementation. Medium. Retrieved from https://medium.com/@mayurmorin/automated-essay-scoring-kaggle-competition-end-to-end-project-implementation-part-1-b75a043903c4

In [1]:
#Imports
import numpy as np
import pandas as pd
import os
import re
import ast
import readability
import tensorflow as tf
import math
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from gensim.models import KeyedVectors
import gensim.models
from sklearn.metrics import cohen_kappa_score
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from itertools import chain, combinations
ignore_warnings = True
if ignore_warnings:
    import warnings
    warnings.filterwarnings("ignore")
    import os
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Oscar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Oscar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#Load Settings
loadName = 'DENSE_3004'
includesGrades = True
maxSentences = 15

#Load Models (scoring network and word2vec vocabulary share the loadName prefix)
lstm_model = tf.keras.models.load_model(f'{loadName}.h5')
model = gensim.models.KeyedVectors.load_word2vec_format(f'{loadName}_voc.bin', binary=True)

#Load the preprocessing settings saved next to the model (first row only)
settingsdf = pd.read_csv(f'{loadName}_settings.csv')
num_features = settingsdf.at[0, 'num_features']
MAX_SENTLEN = settingsdf.at[0, 'MAX_SENTLEN']
keepCats_raw = settingsdf.at[0, 'keepCats']
print(num_features, MAX_SENTLEN, keepCats_raw)
# keepCats is stored in the CSV as the text of a Python list; parse it back safely
keepCats = ast.literal_eval(keepCats_raw)

#Load Test Data
X = pd.read_csv('./Data/test.csv')

500 100 ['Kincaid', 'complex_words', 'type_token_ratio', 'words', 'wordtypes', 'subordination', 'conjunction', 'preposition']


In [3]:
#Adjust dataframe
OGcount = X.shape[0]
if not includesGrades:
    # No grades supplied with the test data: fill the column with -1 so it exists
    X['Grade'] = -1
# Create the nonseq_input column holding the list [Grade, Grade] for every row
X["nonseq_input"] = X.apply(lambda row: [row["Grade"], row["Grade"]], axis=1)

In [4]:
#Generate all possible Coalitions of sentences                  
def powerset(iterable):
    """Return an iterator over every subset of ``iterable`` as a tuple.

    Subsets are produced by increasing size, starting with the empty tuple:
    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    pool = list(iterable)
    sizes = range(len(pool) + 1)
    subsets_by_size = (combinations(pool, size) for size in sizes)
    return chain.from_iterable(subsets_by_size)

# Build, for every essay, one row per non-empty coalition (subset) of its sentences.
# The per-essay frames are collected in a list and concatenated once at the end:
#   * no dependence on the first index label of X being exactly 0,
#   * the result always has a clean 0..n-1 index (also when X holds a single essay),
#   * no quadratic re-copying from calling pd.concat inside the loop.
essay_frames = []
for index, row in X.iterrows():
    essay = row['Text']
    grade = row['Grade']
    trackIndex = row['Index']

    # Split after "? ", "! " or ". "; the capture group keeps each terminator in the
    # result so it can be glued back onto the sentence it ends.
    pieces = re.split(r'(\? |! |\. )', essay)
    lastS = pieces[-1]
    sentences = ["".join(pair) for pair in zip(pieces[::2], pieces[1::2])]
    sentences = sentences + [lastS]

    # The number of coalitions is 2**len(sentences), so cap the sentence count by
    # repeatedly merging the adjacent pair with the smallest combined length
    # (np.argmin returns the first such pair on ties).
    while len(sentences) > maxSentences:
        lengths = [len(sentences[i]) + len(sentences[i + 1]) for i in range(len(sentences) - 1)]
        indexMin = int(np.argmin(lengths))
        sentences[indexMin:indexMin + 2] = [sentences[indexMin] + sentences[indexMin + 1]]

    # Named sentence_ids rather than `all` so the builtin all() is not shadowed.
    sentence_ids = list(np.arange(len(sentences)))
    # powerset() yields the empty tuple first; skip it because it has no text to score.
    coalitions = list(powerset(sentence_ids))[1:]

    # Text of a coalition = its sentences, in order, joined with '.'
    coalition_texts = ['.'.join(sentences[x] for x in indices) for indices in coalitions]

    d = {'essay_id': trackIndex, 'sentence_out': coalitions, 'Text': coalition_texts, 'Grade': grade}
    essay_frames.append(pd.DataFrame(data=d))

Xs = pd.concat(essay_frames, ignore_index=True, axis=0)

Xs['Set'] = -1
Xs["nonseq_input"] = Xs.apply(lambda x: [x["Set"]] + [x["Set"]], axis=1)
# From here on X is the coalition-level frame, not the raw test set.
X = Xs

In [6]:
#Functions Word2Vec (Pethani, M., 2019)
def essay_to_wordlist(essay_v, remove_stopwords):
    """Turn a text into a list of lower-case alphabetic words.

    Every character that is not an ASCII letter is replaced by a space before
    splitting on whitespace. When ``remove_stopwords`` is truthy, words found in
    NLTK's English stop-word list are dropped.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", essay_v)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return (tokens)
    stop_set = set(stopwords.words("english"))
    return [tok for tok in tokens if tok not in stop_set]

def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and word-tokenize each one.

    The stripped essay is sentence-tokenized with NLTK's English punkt tokenizer;
    every non-empty sentence is passed to essay_to_wordlist(). Returns a list of
    word lists, one per sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        essay_to_wordlist(chunk, remove_stopwords)
        for chunk in punkt.tokenize(essay_v.strip())
        if len(chunk) > 0
    ]

def makeFeatureVec(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one essay.

    Parameters
    ----------
    words : iterable of str
        Tokens of the essay; tokens missing from the model vocabulary are ignored.
    model : gensim KeyedVectors-like object
        Must expose ``key_to_index`` (mapping with the vocabulary as keys) and
        ``get_vector(word)``.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``num_features`` holding the mean word vector.
        If none of the words is in the vocabulary the mean is 0/0 and every
        entry is NaN; callers have to deal with that case themselves.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    num_words = 0.
    # Test membership against the model's own key->index mapping instead of
    # rebuilding set(model.index_to_key) on every call, which cost O(vocabulary)
    # per essay.
    vocab = model.key_to_index
    for word in words:
        if word in vocab:
            num_words += 1
            featureVec = np.add(featureVec, model.get_vector(word))
    # With no known words this is 0/0 -> NaN. The value is kept as before; only
    # numpy's floating-point warning for this one division is silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        featureVec = np.divide(featureVec, num_words)
    return featureVec

def getAvgFeatureVecs(essays, model, num_features):
    """Build the matrix of averaged word vectors, one row per essay.

    ``essays`` is a sequence of word lists; row ``k`` of the returned float32
    array of shape (len(essays), num_features) is makeFeatureVec() of essay ``k``.
    """
    essayFeatureVecs = np.zeros((len(essays), num_features), dtype="float32")
    for row_idx, word_list in enumerate(essays):
        essayFeatureVecs[row_idx] = makeFeatureVec(word_list, model, num_features)
    return essayFeatureVecs

In [7]:
#Writing Features Functions (Ridley et al., 2020)
def replace_url(text):
    """Replace URL-like substrings of ``text`` with the placeholder ``<url>``.

    A match is an optional ``http://`` or ``https://`` scheme, an optional
    ``www.`` prefix, one alphanumeric host label and one of the endings
    ``.com``, ``.com.cn`` or ``.org``. Anything after that ending (such as a
    path) is left in place. Returns the text with every match substituted.
    """
    # Raw string literal: the pattern is unchanged, but the previous non-raw
    # literal relied on invalid escape sequences such as "\." which newer Python
    # versions warn about at compile time.
    url_pattern = r'(http[s]?://)?((www)\.)?([a-zA-Z0-9]+)\.{1}((com)(\.(cn))?|(org))'
    replaced_text = re.sub(url_pattern, '<url>', text)
    return replaced_text

def tokenize(string):
    """Word-tokenize ``string`` with NLTK and merge each '@' token into the token after it.

    A stand-alone '@' token followed by another token is removed, and the
    following token becomes '@' plus that token cut off at its first digit
    (the digits and everything after them are dropped). A trailing '@' with no
    successor is left as is. Returns the resulting list of tokens.
    """
    tokens = nltk.word_tokenize(string)
    for index, token in enumerate(tokens):
        if token == '@' and (index+1) < len(tokens):
            # Rewrite the successor first, then delete the '@' itself.
            tokens[index+1] = '@' + re.sub('[0-9]+.*', '', tokens[index+1])
            # The list shrinks while it is being enumerated: after the pop the merged
            # token sits at `index`, so the next iteration (index+1) skips over it.
            tokens.pop(index)
    return tokens

def shorten_sentence(sent, max_sentlen):
    """Tokenize one sentence and cut it into chunks of at most ``max_sentlen`` tokens.

    Returns a list of token lists. A sentence with at most ``max_sentlen`` tokens
    comes back as a single-element list. Longer sentences are first split in front
    of every token that is one of ``split_keywords`` (or, if none occurs, at
    multiples of ``max_sentlen``); any chunk that is still too long is then sliced
    into pieces of ``max_sentlen`` tokens.

    NOTE(review): the result can contain empty lists, see the comments below.
    """
    new_tokens = []
    sent = sent.strip()
    tokens = nltk.word_tokenize(sent)
    if len(tokens) > max_sentlen:
        split_keywords = ['because', 'but', 'so', 'You', 'He', 'She', 'We', 'It', 'They', 'Your', 'His', 'Her']
        # Positions of the keyword tokens; each one starts a new chunk.
        k_indexes = [i for i, key in enumerate(tokens) if key in split_keywords]
        processed_tokens = []
        if not k_indexes:
            # No keyword found: fall back to cut points at multiples of max_sentlen.
            # len(tokens) > max_sentlen here, so num is at least 1.
            num = len(tokens) / max_sentlen
            num = int(round(num))
            k_indexes = [(i+1)*max_sentlen for i in range(num)]

        # Chunk before the first cut point (empty when the first token is a keyword).
        processed_tokens.append(tokens[0:k_indexes[0]])
        len_k = len(k_indexes)
        for j in range(len_k-1):
            processed_tokens.append(tokens[k_indexes[j]:k_indexes[j+1]])
        # Remainder after the last cut point.
        processed_tokens.append(tokens[k_indexes[-1]:])

        # Here `token` is a whole chunk (a list of tokens), not a single token.
        for token in processed_tokens:
            if len(token) > max_sentlen:
                num = len(token) / max_sentlen
                num = int(np.ceil(num))
                s_indexes = [(i+1)*max_sentlen for i in range(num)]

                len_s = len(s_indexes)
                new_tokens.append(token[0:s_indexes[0]])
                for j in range(len_s-1):
                    new_tokens.append(token[s_indexes[j]:s_indexes[j+1]])
                # s_indexes[-1] = ceil(len/max)*max >= len(token), so this slice is
                # always an empty list.
                new_tokens.append(token[s_indexes[-1]:])

            else:
                new_tokens.append(token)
    else:
        return [tokens]

    return new_tokens

def tokenize_to_sentences(text, max_sentlength, create_vocab_flag=False):
    """Split ``text`` into sentences and return their tokens.

    With ``create_vocab_flag`` set, returns one flat list of tokens produced by
    tokenize() over all sentences. Otherwise every sentence goes through
    shorten_sentence() and the result is a list of token lists, each chunk
    limited by ``max_sentlength``.
    """
    # Split on whitespace that follows '.', '!' or '?', except after the two
    # look-behind patterns (presumably abbreviations such as "e.g." or "Mr." -
    # TODO confirm against the original Ridley et al. code).
    sents = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\!|\?)\s', text)
    processed_sents = []
    for sent in sents:
        # A capitalised word (optionally '@'-prefixed) glued directly onto '.', '!',
        # '?' or ',' marks a missing space: separate it and split again.
        if re.search(r'(?<=\.{1}|\!|\?|\,)(@?[A-Z]+[a-zA-Z]*[0-9]*)', sent):
            s = re.split(r'(?=.{2,})(?<=\.{1}|\!|\?|\,)(@?[A-Z]+[a-zA-Z]*[0-9]*)', sent)
            ss = " ".join(s)
            ssL = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\!|\?)\s', ss)

            processed_sents.extend(ssL)
        else:
            processed_sents.append(sent)

    if create_vocab_flag:
        # Vocabulary mode: flat token list, no length limiting.
        sent_tokens = [tokenize(sent) for sent in processed_sents]
        tokens = [w for sent in sent_tokens for w in sent]
        return tokens

    sent_tokens = []
    for sent in processed_sents:
        # shorten_sentence returns a list of chunks, hence extend rather than append.
        shorten_sents_tokens = shorten_sentence(sent, max_sentlength)
        sent_tokens.extend(shorten_sents_tokens)
    return sent_tokens

def text_tokenizer(text, replace_url_flag=True, tokenize_sent_flag=True, create_vocab_flag=False):
    """Normalise an essay and tokenize it into sentences.

    Parameters
    ----------
    text : str
        Raw essay text.
    replace_url_flag : bool
        When true (the default), URL-like substrings are replaced through
        replace_url(). The flag used to be ignored and the replacement always ran;
        it is now honoured, which changes nothing for callers using the default.
    tokenize_sent_flag : bool
        Must be true; the non-sentence mode is not implemented.
    create_vocab_flag : bool
        Passed on to tokenize_to_sentences().

    Returns
    -------
    list
        Whatever tokenize_to_sentences() returns for the normalised text, using
        the module-level MAX_SENTLEN as the length limit.

    Raises
    ------
    NotImplementedError
        If ``tokenize_sent_flag`` is false.
    """
    if replace_url_flag:
        text = replace_url(text)
    text = text.replace(u'"', u'')
    # Collapse repeated punctuation (also when the runs are separated by whitespace).
    if "..." in text:
        text = re.sub(r'\.{3,}(\s+\.{3,})*', '...', text)
    if "??" in text:
        text = re.sub(r'\?{2,}(\s+\?{2,})*', '?', text)
    if "!!" in text:
        text = re.sub(r'\!{2,}(\s+\!{2,})*', '!', text)

    tokens = tokenize(text)
    if tokenize_sent_flag:
        # Re-join with single spaces so the sentence splitter sees normalised spacing.
        text = " ".join(tokens)
        sent_tokens = tokenize_to_sentences(text, MAX_SENTLEN, create_vocab_flag)
        return sent_tokens
    else:
        raise NotImplementedError

In [8]:
#Generate Writing Features (Ridley et al., 2020)
# For every row of X: tokenize the text, compute the readability measures listed in
# keepCats, append the number of unknown (possibly misspelled) words and store the
# resulting list in X['nonseq_input'].
i_ = 0
from spellchecker import SpellChecker
spell = SpellChecker()

for index, row in X.iterrows():
    # Report progress once every 1000 rows. The previous test `if i_ % 1000:` was
    # inverted and printed on every row except the multiples of 1000.
    if i_ % 1000 == 0:
        print(index,'/', len(X), end='\r')
    content = row['Text']

    sent_tokens = text_tokenizer(content, replace_url_flag=True, tokenize_sent_flag=True)
    # One sentence per line, tokens separated by single spaces.
    sentences = ''.join(' '.join(sent) + '\n' for sent in sent_tokens)
    readability_scores = readability.getmeasures(sentences, lang='en')

    features = []
    #keepCats =['Kincaid', 'ARI', 'Coleman-Liau', 'FleschReadingEase', 'GunningFogIndex', 'LIX', 'SMOGIndex', 'RIX', 'DaleChallIndex', 'characters_per_word', 'syll_per_word', 'words_per_sentence', 'sentences_per_paragraph', 'type_token_ratio', 'directspeech_ratio', 'characters', 'syllables', 'words', 'wordtypes', 'sentences', 'paragraphs', 'long_words', 'complex_words', 'complex_words_dc', 'tobeverb', 'auxverb', 'conjunction', 'pronoun', 'preposition', 'nominalization', 'pronoun', 'interrogative', 'article', 'subordination', 'conjunction', 'preposition']

    # Keep the selected measures in the order in which readability reports them.
    for cat in readability_scores.keys():
        for subcat in readability_scores[cat].keys():
            if subcat in keepCats:
                features.append(readability_scores[cat][subcat])

    # find those words that may be misspelled
    # (the substring 'etc' is stripped everywhere; words containing an apostrophe or
    #  having at most 3 characters are not checked)
    sentences = sentences.replace('\n', ' ').replace('\r', '').replace('etc', '')
    words = sentences.split(" ")
    words = [x for x in words if '\'' not in x and len(x)>3]
    misspelled = spell.unknown(words)
    features.append(len(misspelled))

    # Write by the row's own label; the positional counter i_ only matches the label
    # when X happens to have a 0..n-1 index.
    X.at[index, 'nonseq_input' ] = features
    i_ += 1

664477 / 664478/ 664478 / 664478 / 664478 664478 664478 / 664478/ 664478664478/ 664478664478/ 664478 / 664478 / 664478/ 664478/ 664478 / 664478/ 664478 / 664478 / 664478 664478664478/ 664478/ 664478 / 664478 / 664478/ 664478 / 664478

In [10]:
#Predicting Test Set
test_essays = X['Text']
# Second model input: the per-row feature lists stacked into an array
test_X_nonseq = np.asarray(X["nonseq_input"].tolist())

# First model input: averaged word vectors of the stop-word-filtered essay words
clean_test_essays = [
    essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in test_essays
]
testDataVecs = np.array(getAvgFeatureVecs(clean_test_essays, model, num_features))
y_pred = lstm_model.predict([testDataVecs, test_X_nonseq])



In [11]:
#Adding Predictions to Df
# NaN predictions become 0 (presumably these come from texts whose averaged word
# vector is NaN because no word was in the vocabulary - TODO confirm)
y_pred = np.nan_to_num(y_pred)
X['predGrade'] = 10 * y_pred
X[['Text', 'predGrade']].to_csv('nan.csv')

X['predGrade'] = X['predGrade'].round(3)
# The coalition column is called c_set from here on
X = X.rename(columns={"sentence_out": "c_set"})

In [12]:
#Generate SHAP values based on predictions (Brute-Force)
# For every essay and every sentence ("item"), accumulate the Shapley-weighted
# difference between the prediction of each coalition containing the sentence and
# the prediction of the same coalition without it.
#
# Changes compared with the previous version of this cell (results are the same):
#   * predictions/texts are looked up in per-essay dicts instead of filtering the
#     essay frame twice for every (item, coalition) pair,
#   * rows are collected in a list and the frame is built once, without the dummy
#     seed row that also ended up in the checkpoint CSV,
#   * the checkpoint CSV is written once per essay instead of once per sentence,
#   * the builtin all() is no longer shadowed by a variable named `all`.
shap_columns = ['essay_id', 'item', 'Marginal', 'sentence']


def _shap_frame(rows):
    """Build the result frame from the collected rows, labelled 1..n as before."""
    frame = pd.DataFrame(rows, columns=shap_columns)
    frame.index = range(1, len(frame) + 1)
    return frame


shap_rows = []
essay_ids = X['essay_id'].unique()
n_essays = len(essay_ids)

for g_, essayid in enumerate(essay_ids, start=1):
    #Loc values of specific essay
    df = X.loc[X['essay_id'] == essayid]
    # The last row of an essay holds its largest coalition, i.e. all sentence ids.
    players = list(df.iloc[-1]['c_set'])
    p = len(players)
    coalitions = list(powerset(players))[1:]

    # Coalition tuple -> prediction / text for this essay.
    pred_of = dict(zip(df['c_set'], df['predGrade']))
    text_of = dict(zip(df['c_set'], df['Text']))
    if len(pred_of) != len(df):
        # The old `.item()` lookups also failed when a coalition matched several rows.
        raise ValueError('essay_id %r has duplicate coalition rows' % (essayid,))

    #Calculate contribution of sentence X
    for i_, item in enumerate(players):
        print(g_ , '/', n_essays, '|', i_, '/', p, end= '\r')
        marginal = 0
        for coal in coalitions:
            if item not in coal:
                continue
            without_item = tuple(x for x in coal if x != item)
            s = len(without_item)
            if s == 0:
                # NOTE(review): the term for the single-sentence coalition is not
                # added, exactly as before. X holds no row (prediction) for the
                # empty coalition, so v({item}) - v(empty) cannot be evaluated here.
                continue
            weight = round((math.factorial(s) * math.factorial(p - s - 1)) / math.factorial(p), 18)
            marginal += weight * (pred_of[coal] - pred_of[without_item])

        shap_rows.append({
            'essay_id': essayid,
            'item': item,
            'Marginal': marginal,
            'sentence': text_of[tuple([item])],
        })

    # Checkpoint after each finished essay so a long run can be inspected.
    _shap_frame(shap_rows).to_csv('./Data/InspectSHAP.csv')

mdf = _shap_frame(shap_rows)
mdf.head()

50 / 50 | 11 / 12

Unnamed: 0,essay_id,item,Marginal,sentence
1,575,0,0.307852,Libraries have always had interesting material...
2,575,1,0.288676,Libraries are one of the best places to find t...
3,575,2,0.38954,I do not believe that libraries shoul...
4,575,3,0.217903,Thats all that libraries like are people inter...
5,575,4,0.325146,If some people find different books offensive ...


In [13]:
#Normalize for words in the sentence
def calculate_score_per_word(row):
    """Return the row's 'Marginal' divided by the word count of its 'sentence'.

    Words are the whitespace-separated pieces of the sentence. A sentence without
    any word yields 0 instead of dividing by zero.
    """
    n_words = len(row['sentence'].split())
    if n_words == 0:
        return 0
    return row['Marginal'] / n_words

# Apply the function to each row and store the result in a new column
mdf['Marginal_per_word'] = mdf.apply(calculate_score_per_word, axis=1)
# Overwrites the CSV written by the SHAP cell, now including Marginal_per_word
mdf.to_csv('./Data/InspectSHAP.csv')