In [None]:
# Code Citations
# Ridley, R., He, L., Dai, X., Huang, S., & Chen, J. (2020). Prompt agnostic essay scorer: a domain generalization approach to cross-prompt automated essay scoring. arXiv preprint arXiv:2008.01441.
# Pethani, M. (2019) Automated Essay Scoring: Kaggle Competition — End to End Project Implementation. Medium. Retrieved from https://medium.com/@mayurmorin/automated-essay-scoring-kaggle-competition-end-to-end-project-implementation-part-1-b75a043903c4

In [1]:
#Imports
import numpy as np
import pandas as pd
import os
import re
import ast
import readability
import tensorflow as tf
import math
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from gensim.models import KeyedVectors
import gensim.models
from sklearn.metrics import cohen_kappa_score
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
ignore_warnings = True
if ignore_warnings:
    import warnings
    warnings.filterwarnings("ignore")
    import os
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import lime
import lime.lime_tabular
import docx 
from docx.enum.text import WD_COLOR_INDEX 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Oscar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Oscar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#Run configuration
#loadName is the common file-name prefix of the saved model ('.h5'), the word2vec
#vocabulary ('_voc.bin') and the settings file ('_settings.csv') loaded below.
loadName = 'DENSE_3004'
#When False, the next cell overwrites the 'Grade' column with -1.
includesGrades = True

#Load Models
#lstm_model is the Keras scoring model; it is later called with [word vectors, writing features].
lstm_model = tf.keras.models.load_model(loadName + '.h5')
#model holds the word vectors (gensim KeyedVectors in binary word2vec format).
model = gensim.models.KeyedVectors.load_word2vec_format(loadName + '_voc.bin', binary=True)

#Load Settings
#Only the first row of the settings file is read.
settingsdf = pd.read_csv(loadName + '_settings.csv')
print(settingsdf['num_features'][0], settingsdf['MAX_SENTLEN'][0], settingsdf['keepCats'][0])
#num_features is passed to getAvgFeatureVecs as the length of each averaged word vector.
num_features = settingsdf['num_features'][0]
#MAX_SENTLEN is read as a global by text_tokenizer (maximum tokens per sentence chunk).
MAX_SENTLEN = settingsdf['MAX_SENTLEN'][0]
#keepCats is stored in the CSV as the text of a Python list; literal_eval turns it back into a list.
keepCats = ast.literal_eval(settingsdf['keepCats'][0])

#Load Test Data
#X is the set to score and explain; Xtrain is later merged with X as the reference data for LIME.
X = pd.read_csv('./Data/test.csv')
Xtrain = pd.read_csv('./Data/train.csv')

500 100 ['Kincaid', 'complex_words', 'type_token_ratio', 'words', 'wordtypes', 'subordination', 'conjunction', 'preposition']


In [3]:
#Adjust Dataframe
def _grade_pair(row):
    """Return [grade, grade] for one row; a placeholder value for 'nonseq_input'."""
    grade = row["Grade"]
    return [grade] + [grade]

#Both frames get the same treatment: an optional dummy grade of -1, then a list-valued
#'nonseq_input' column that the feature-generation cell overwrites with the real features.
for frame in (X, Xtrain):
    if not includesGrades:
        frame['Grade'] = -1
    frame["nonseq_input"] = frame.apply(_grade_pair, axis=1)

In [4]:
#Calculate writing features for explanation categories
#NOTE: 'pronoun', 'conjunction' and 'preposition' appear twice in this list. The list is only
#used to create columns and for membership tests, so the duplicates have no effect.
explainCategories =  ['Kincaid', 'ARI', 'Coleman-Liau', 'FleschReadingEase', 'GunningFogIndex', 'LIX', 
 'SMOGIndex', 'RIX', 'DaleChallIndex', 'characters_per_word', 'syll_per_word', 'words_per_sentence', 'sentences_per_paragraph', 
 'type_token_ratio', 'directspeech_ratio', 'characters', 'syllables', 'words', 'wordtypes', 'sentences', 'paragraphs', 'long_words', 
 'complex_words', 'complex_words_dc', 'tobeverb', 'auxverb', 'conjunction', 'pronoun', 'preposition', 'nominalization', 'pronoun', 
 'interrogative', 'article', 'subordination', 'conjunction', 'preposition'] 

#Initialise the score columns as floats: the readability scores written into them later are
#fractional, and writing a float into an int64 column cell-by-cell is deprecated in recent
#pandas versions (it relies on a silent dtype upcast).
for cat in explainCategories:
    X[cat] = 0.0
    Xtrain[cat] = 0.0

#These two columns only ever receive integer counts, so they stay integer typed.
X['spelling_mistakes'] = 0
Xtrain['spelling_mistakes'] = 0

X['word_count'] = 0
Xtrain['word_count'] = 0


In [5]:
#Functions Word2Vec (Pethani, M., 2019)
def essay_to_wordlist(essay_v, remove_stopwords):
    """Turn an essay into a list of lower-case, purely alphabetic words.

    Every character that is not an ASCII letter is replaced by a space, the text is
    lower-cased and split on whitespace. When `remove_stopwords` is truthy, words found in
    NLTK's English stop-word list are dropped.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", essay_v)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]

def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and word-tokenize each one.

    The stripped essay is split with NLTK's English punkt tokenizer; every non-empty
    sentence is passed to essay_to_wordlist(). Returns a list of word lists.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        essay_to_wordlist(chunk, remove_stopwords)
        for chunk in punkt.tokenize(essay_v.strip())
        if len(chunk) > 0
    ]

def makeFeatureVec(words, model, num_features):
    """Average the word vectors of all in-vocabulary words of one essay.

    Args:
        words: iterable of word strings.
        model: object exposing `index_to_key` (the vocabulary) and `get_vector(word)`.
        num_features: length of the word vectors.

    Returns:
        The element-wise mean of the vectors of the words found in the vocabulary
        (repeated words count once per occurrence). If no word is in the vocabulary the
        all-zero vector is returned instead of dividing by zero, which used to yield a
        NaN vector that then propagated into the model's predictions.
    """
    featureVec = np.zeros((num_features,),dtype="float32")
    num_words = 0.
    index2word_set = set(model.index_to_key)
    for word in words:
        if word in index2word_set:
            num_words += 1
            featureVec = np.add(featureVec,model.get_vector(word))
    if num_words == 0:
        # Nothing to average: keep the zero vector.
        return featureVec
    return np.divide(featureVec,num_words)

def getAvgFeatureVecs(essays, model, num_features):
    """Build the matrix of averaged word vectors, one row per essay.

    `essays` is a sequence of word lists; row i of the returned float32 array of shape
    (len(essays), num_features) is makeFeatureVec() of essay i.
    """
    averaged = np.zeros((len(essays), num_features), dtype="float32")
    for row, essay_words in enumerate(essays):
        averaged[row] = makeFeatureVec(essay_words, model, num_features)
    return averaged

In [6]:
#Writing Features Functions (Ridley et al., 2020)
def replace_url(text):
    """Replace simple web addresses in `text` with the token '<url>'.

    A match is an optional 'http://' or 'https://', an optional 'www.', an alphanumeric
    name, a dot, and one of the endings 'com', 'com.cn' or 'org'. Anything after the
    ending (e.g. a path) is left in place.
    """
    # Raw string: the pattern contains '\.' which is an invalid escape sequence in a
    # normal string literal (a warning today, a syntax error in a future Python).
    replaced_text = re.sub(r'(http[s]?://)?((www)\.)?([a-zA-Z0-9]+)\.{1}((com)(\.(cn))?|(org))', '<url>', text)
    return replaced_text

def tokenize(string):
    """Word-tokenize `string` with NLTK and glue a standalone '@' onto the following token.

    For every '@' token that is not the last token, the next token is rewritten as '@'
    followed by that token with its first digit and everything after it removed; the '@'
    token itself is then dropped. Returns the resulting list of tokens.
    """
    tokens = nltk.word_tokenize(string)
    for index, token in enumerate(tokens):
        if token == '@' and (index+1) < len(tokens):
            # Cut the following token at its first digit and prefix it with '@'.
            tokens[index+1] = '@' + re.sub('[0-9]+.*', '', tokens[index+1])
            # The list is modified while it is being enumerated: after this pop the merged
            # token sits at `index`, so the loop continues with the token after it.
            tokens.pop(index)
    return tokens

def shorten_sentence(sent, max_sentlen):
    """Tokenize one sentence and cut it into chunks of at most `max_sentlen` tokens.

    Returns a list of token lists. A sentence with at most `max_sentlen` tokens is returned
    as a single chunk. A longer sentence is first cut in front of every keyword token
    (see `split_keywords`), or at multiples of `max_sentlen` when it contains no keyword;
    any resulting chunk that is still too long is cut again at multiples of `max_sentlen`.

    NOTE(review): chunks can be empty lists, e.g. the first chunk when a keyword is the
    very first token, or the tail chunk when a length is an exact multiple of `max_sentlen`.
    """
    new_tokens = []
    sent = sent.strip()
    tokens = nltk.word_tokenize(sent)
    if len(tokens) > max_sentlen:
        # The comparison is case-sensitive: pronouns only match when capitalised.
        split_keywords = ['because', 'but', 'so', 'You', 'He', 'She', 'We', 'It', 'They', 'Your', 'His', 'Her']
        k_indexes = [i for i, key in enumerate(tokens) if key in split_keywords]
        processed_tokens = []
        if not k_indexes:
            # No keyword: fall back to cut points at multiples of max_sentlen.
            # len(tokens) > max_sentlen here, so the rounded ratio is at least 1.
            num = len(tokens) / max_sentlen
            num = int(round(num))
            k_indexes = [(i+1)*max_sentlen for i in range(num)]

        # First pass: slice the token list at the cut points (head, middle parts, tail).
        processed_tokens.append(tokens[0:k_indexes[0]])
        len_k = len(k_indexes)
        for j in range(len_k-1):
            processed_tokens.append(tokens[k_indexes[j]:k_indexes[j+1]])
        processed_tokens.append(tokens[k_indexes[-1]:])

        # Second pass: each `token` here is a chunk (a list of tokens), not a single token.
        for token in processed_tokens:
            if len(token) > max_sentlen:
                # Chunk still too long: cut it at multiples of max_sentlen.
                num = len(token) / max_sentlen
                num = int(np.ceil(num))
                s_indexes = [(i+1)*max_sentlen for i in range(num)]

                len_s = len(s_indexes)
                new_tokens.append(token[0:s_indexes[0]])
                for j in range(len_s-1):
                    new_tokens.append(token[s_indexes[j]:s_indexes[j+1]])
                new_tokens.append(token[s_indexes[-1]:])

            else:
                new_tokens.append(token)
    else:
        # Short enough already: a single chunk.
        return [tokens]

    return new_tokens

def tokenize_to_sentences(text, max_sentlength, create_vocab_flag=False):
    """Split `text` into sentences and tokenize them.

    Returns a flat list of tokens when `create_vocab_flag` is truthy; otherwise a list of
    token lists, one per sentence chunk, as produced by shorten_sentence() with
    `max_sentlength`.
    """
    # Split at whitespace that follows '.', '!' or '?'. The two negative lookbehinds block
    # the split after some patterns containing inner dots and after a capitalised
    # two-letter word followed by '.' (presumably abbreviations such as 'e.g.' or 'Mr.'
    # -- TODO confirm intent).
    sents = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\!|\?)\s', text)
    processed_sents = []
    for sent in sents:
        # A capitalised word (optionally prefixed with '@') directly after '.', '!', '?' or
        # ',' with no space in between: separate it with spaces and split the sentence again.
        if re.search(r'(?<=\.{1}|\!|\?|\,)(@?[A-Z]+[a-zA-Z]*[0-9]*)', sent):
            s = re.split(r'(?=.{2,})(?<=\.{1}|\!|\?|\,)(@?[A-Z]+[a-zA-Z]*[0-9]*)', sent)
            ss = " ".join(s)
            ssL = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\!|\?)\s', ss)

            processed_sents.extend(ssL)
        else:
            processed_sents.append(sent)

    if create_vocab_flag:
        # Vocabulary mode: one flat token list, no length limit applied.
        sent_tokens = [tokenize(sent) for sent in processed_sents]
        tokens = [w for sent in sent_tokens for w in sent]
        return tokens

    # Normal mode: every sentence is tokenized and cut into chunks by shorten_sentence().
    sent_tokens = []
    for sent in processed_sents:
        shorten_sents_tokens = shorten_sentence(sent, max_sentlength)
        sent_tokens.extend(shorten_sents_tokens)
    return sent_tokens

def text_tokenizer(text, replace_url_flag=True, tokenize_sent_flag=True, create_vocab_flag=False):
    """Normalise an essay and tokenize it into sentences.

    Args:
        text: the raw essay.
        replace_url_flag: when truthy, web addresses are replaced by '<url>' first.
            (This flag used to be ignored and URLs were always replaced; the default
            keeps that behaviour.)
        tokenize_sent_flag: must be truthy; only sentence-level tokenization exists.
        create_vocab_flag: passed on to tokenize_to_sentences().

    Returns:
        Whatever tokenize_to_sentences() returns for the normalised text, using the
        module-level MAX_SENTLEN as the sentence length limit.

    Raises:
        NotImplementedError: if `tokenize_sent_flag` is falsy.
    """
    if not tokenize_sent_flag:
        # Fail before doing any work; the original raised the same error after tokenizing.
        raise NotImplementedError

    if replace_url_flag:
        text = replace_url(text)
    text = text.replace(u'"', u'')
    # Collapse runs of '...', '??' and '!!' (also when separated by whitespace).
    if "..." in text:
        text = re.sub(r'\.{3,}(\s+\.{3,})*', '...', text)
    if "??" in text:
        text = re.sub(r'\?{2,}(\s+\?{2,})*', '?', text)
    if "!!" in text:
        text = re.sub(r'\!{2,}(\s+\!{2,})*', '!', text)

    # Tokenize once and re-join with single spaces so the sentence splitter sees a
    # uniformly spaced text.
    tokens = tokenize(text)
    text = " ".join(tokens)
    return tokenize_to_sentences(text, MAX_SENTLEN, create_vocab_flag)

In [7]:
#Generate Writing Features (Ridley et al., 2020)
from spellchecker import SpellChecker
spell = SpellChecker()

def _add_writing_features(df):
    """Compute the writing features of every essay in `df` and store them in place.

    For each row the essay in 'Text' is tokenized into sentences and scored with
    readability.getmeasures(). The function then writes:
      * one value per column listed in explainCategories (a sub-score that occurs in more
        than one readability category is summed),
      * 'word_count' and 'spelling_mistakes',
      * 'nonseq_input': the list of sub-scores named in keepCats, in the order readability
        returns them, followed by the number of misspelled words.

    Rows are addressed by their index label (not by a running counter), and the scores are
    assigned rather than accumulated into the frame, so re-running the cell gives the same
    result.
    """
    for row_label, row in df.iterrows():
        sent_tokens = text_tokenizer(row['Text'], replace_url_flag=True, tokenize_sent_flag=True)
        sentences = ''.join(' '.join(sent) + '\n' for sent in sent_tokens)
        readability_scores = readability.getmeasures(sentences, lang='en')

        features = []
        explain_totals = {}
        for cat in readability_scores.keys():
            for subcat in readability_scores[cat].keys():
                ind_score = readability_scores[cat][subcat]
                if subcat in explainCategories:
                    explain_totals[subcat] = explain_totals.get(subcat, 0) + ind_score
                if subcat in keepCats:
                    features.append(ind_score)
        for subcat, total in explain_totals.items():
            df.at[row_label, subcat] = total

        # find those words that may be misspelled
        # NOTE(review): replace('etc', '') removes the letters 'etc' inside any word as
        # well; left unchanged so the features stay identical to what the saved model
        # has seen so far -- confirm before altering.
        flat_text = sentences.replace('\n', ' ').replace('\r', '').replace('etc', '')
        words = flat_text.split(" ")
        df.at[row_label, 'word_count'] = len(words)
        # Words containing an apostrophe and words of three or fewer characters are not checked.
        candidates = [w for w in words if '\'' not in w and len(w)>3]
        misspelled = spell.unknown(candidates)
        features.append(len(misspelled))
        df.at[row_label, 'spelling_mistakes'] = len(misspelled)

        df.at[row_label, 'nonseq_input'] = features

_add_writing_features(X)
_add_writing_features(Xtrain)

In [8]:
#Predicting Test set and saving the scores
test_essays = X['Text']
test_X_nonseq = np.asarray(list(X["nonseq_input"]))

#Word-vector input: the averaged embedding of each essay's stop-word-free word list
clean_test_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in test_essays]
testDataVecs = np.array(getAvgFeatureVecs(clean_test_essays, model, num_features))
y_pred = lstm_model.predict([testDataVecs, test_X_nonseq])

#The first output of each prediction is multiplied by 10 before it is stored
d = {
    'Index': list(X['Index'].values),
    'Score': [prediction[0]*10 for prediction in y_pred],
}
scoreDf = pd.DataFrame(data=d)
scoreDf.to_csv('./Data/scoreDf.csv', index=False)



In [10]:
#Including the training data for explanations
#The test essays are appended to the training essays. ignore_index=True gives the combined
#frame a fresh 0..n-1 index; without it the row labels of X repeat labels already used by
#Xtrain, which makes any label-based access or alignment on the combined frame ambiguous.
#NOTE: this cell is not idempotent -- running it a second time appends X again.
Xtrain = pd.concat((Xtrain, X), ignore_index=True)

Xtrain_nonseq = np.asarray(list(Xtrain['nonseq_input']))
Xtrain_essays = Xtrain['Text']

clean_essays = []
for essay_v in Xtrain_essays:
    clean_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True ))

DataVecs = getAvgFeatureVecs(clean_essays, model, num_features )
DataVecs = np.array(DataVecs)
y_pred_train = lstm_model.predict([DataVecs, Xtrain_nonseq])

#pGrade is the model's prediction for every essay (training and test); the explanation
#features below are compared against it.
Xtrain['pGrade'] = y_pred_train



In [13]:
#Add some custom explanation features

#Argumentative words
def _count_argumentation_markers(essay_text):
    """Sum the case-sensitive substring occurrences of the argumentation markers in one essay."""
    markers = ('because', 'Because', 'therefore', 'Therefore', 'but', 'But',
               'believe', 'think', 'perspective', 'then', 'Then', 'example')
    return sum(essay_text.count(marker) for marker in markers)

Xtrain['countWords_argumentation'] = Xtrain['Text'].apply(_count_argumentation_markers)

#Prompt adherence
#Each essay's score is its mean TF-IDF cosine similarity to all essays (itself included).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(Xtrain['Text'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim_df = pd.DataFrame(cosine_sim, columns=Xtrain.index, index=Xtrain.index)
average_similarity_per_text = np.mean(cosine_sim, axis=1)
Xtrain['sim'] = average_similarity_per_text

In [15]:
#Selection of writing features for groupings
#Only the two 'spelling_mistakes' lines change data. Every other line assigns a column to
#itself: it documents which columns feed which group and fails with a KeyError if the
#column is missing.
#NOTE: this cell is not idempotent -- re-running it flips the sign of 'spelling_mistakes'
#again and divides by 'word_count' a second time.

#Writing Quality
#Negated so that a larger value means fewer mistakes, then expressed per word.
Xtrain['spelling_mistakes'] = Xtrain['spelling_mistakes'] * -1
Xtrain['spelling_mistakes'] = Xtrain['spelling_mistakes'] / Xtrain['word_count']
Xtrain['Coleman-Liau'] = Xtrain['Coleman-Liau']

#Word Usage
Xtrain['wordtypes'] = Xtrain['wordtypes']
Xtrain['complex_words'] = Xtrain['complex_words']  
Xtrain['characters_per_word'] = Xtrain['characters_per_word']

#Argumentation
#'complex_words' is listed here as well as under Word Usage; the group definitions in the
#"Create interpretable groups" cell only add it to wordUsage.
Xtrain['preposition'] = Xtrain['preposition'] 
Xtrain['complex_words'] = Xtrain['complex_words']

#Prompt adherence
Xtrain['sim'] = Xtrain['sim']


In [16]:
#Rank all variables column-wise for normalization
#Each column from position 6 onwards is replaced by its integer rank within the column;
#method='first' breaks ties by order of appearance, so the ranks are all distinct.
#NOTE(review): the offset 6 depends on the column layout of the input CSVs -- confirm.
columns_to_rank = Xtrain.columns[6:]
for column in columns_to_rank:
    Xtrain[column] = Xtrain[column].rank(method='first').astype(int)

In [17]:
#Inspect variables selected
#Correlation of each candidate feature with the predicted grade, printed per heading.
inspected = [
    ('Writing Quality', [('CW', 'characters_per_word'), ('SPEL', 'spelling_mistakes')]),
    ('Word Usage', [('TTR', 'type_token_ratio'), ('complex', 'complex_words')]),
    ('Argumentation', [('PREP', 'preposition'), ('ARG', 'countWords_argumentation')]),
    ('Prompt adherence', [('sim', 'sim')]),
]
for position, (heading, entries) in enumerate(inspected):
    if position > 0:
        print()
    print(heading)
    for label, feature in entries:
        print(label, Xtrain['pGrade'].corr(Xtrain[feature]))


Writing Quality
CW 0.45795265163147686
SPEL 0.6448969356677785

Word Usage
TTR -0.359701944352452
complex 0.847760113506208

Argumentation
PREP 0.6985830201387924
ARG 0.29513676804632755

Prompt adherence
sim 0.42816066507016415


In [18]:
#Create interpretable groups
d = {'Index': np.repeat(-1, len(X)), 'wordUsage': -1, 'writingQuality': -1, 'textLength': -1, 'argumentation': -1}

#Each group is the plain sum of its member columns.
#Note that the group named 'textLength' is built from the similarity column 'sim'.
group_members = {
    'wordUsage': ['characters_per_word', 'wordtypes', 'complex_words'],
    'writingQuality': ['Coleman-Liau', 'spelling_mistakes'],
    'textLength': ['sim'],
    'argumentation': ['preposition', 'countWords_argumentation'],
}
for group_name, member_columns in group_members.items():
    combined = Xtrain[member_columns[0]]
    for member in member_columns[1:]:
        combined = combined + Xtrain[member]
    Xtrain[group_name] = combined

#Correlation of every group with the predicted grade
for label, group_name in (('WU', 'wordUsage'), ('WQ', 'writingQuality'), ('TL', 'textLength'), ('ARG', 'argumentation')):
    print(label, Xtrain['pGrade'].corr(Xtrain[group_name]))

#From here on only the four base columns and the four groups are kept.
Xtrain = Xtrain[['Index','Text','Grade','nonseq_input', 'wordUsage', 'writingQuality', 'textLength', 'argumentation']]

WU 0.8532541284785863
WQ 0.6919491503164783
TL 0.42816066507016415
ARG 0.5794891015400212


In [19]:
#Rank groups
#The group columns (everything after the four base columns) are replaced by their integer
#rank; method='first' breaks ties by order of appearance.
group_columns = Xtrain.columns[4:]
for column in group_columns:
    Xtrain[column] = Xtrain[column].rank(method='first').astype(int)

In [20]:
#Retrieve test data
#X is replaced by the rows of the combined, ranked frame whose 'Index' value occurs in the
#old X, so it now has the same columns as Xtrain.
#NOTE(review): this assumes 'Index' values of test essays never occur among the training
#essays; otherwise those training rows are selected too -- confirm.
X = Xtrain.loc[Xtrain['Index'].isin(X['Index'].values)]

In [21]:
#Find closest values
def find_closest_values(array2D, differenceDf):
    """Find, for every query row, the closest row of `differenceDf`.

    Distance is the sum of absolute differences between the query values and the columns
    of `differenceDf` from position 4 onwards (query value i is compared with the i-th of
    those columns). Ties for the smallest distance are broken with np.random.choice.

    Returns a list with the 'Index' value of the chosen row for each query in `array2D`.
    """
    feature_columns = [differenceDf[name].values for name in differenceDf.columns[4:]]

    closest = []
    for query in array2D:
        distance = 0
        for position, column_values in enumerate(feature_columns):
            distance += np.abs(column_values - query[position])

        tied_rows = np.where(distance == distance.min())[0]
        chosen_row = np.random.choice(tied_rows)
        closest.append(differenceDf.iloc[chosen_row]['Index'])

    return closest

In [22]:
#Generate Explanations with LIME
#text_features is not used by the code below.
text_features = Xtrain[['Text', 'nonseq_input']].values
#custom_features holds the ranked group columns; it is the data the LIME explainer is built on.
custom_features = Xtrain[Xtrain.columns[4:]].values

def predict_fn(data):
    """Prediction function handed to LIME.

    LIME works on the interpretable group values, but the model needs an essay. Every row
    of `data` is therefore mapped to a real essay with the closest group values and the
    model's prediction for that essay is returned:
      * row 0 is matched against the test essays in X (LIME is expected to pass the
        explained instance as the first row -- confirm against the LIME documentation),
      * all remaining rows (the perturbations) are matched against Xtrain.

    Returns the model predictions multiplied by 10, one row per matched essay.
    """
    testCase = find_closest_values([data[0]], X)
    Xtest = X.loc[X['Index'] == testCase[0]]

    pertubations = data[1:]
    mostSimilar = find_closest_values(pertubations, Xtrain)
    # Collect the matching rows first and concatenate once. Growing a DataFrame with
    # pd.concat inside the loop copies all earlier rows on every iteration, which is
    # quadratic in the number of perturbations.
    matches = [Xtrain.loc[Xtrain['Index'] == index] for index in mostSimilar]
    Xcompute = pd.concat([Xtest] + matches, ignore_index=True, axis=0)

    X_nonseq = np.asarray(list(Xcompute['nonseq_input']))
    X_essays = Xcompute['Text']

    clean_essays = [essay_to_wordlist(essay_v, remove_stopwords=True) for essay_v in X_essays]

    DataVecs = getAvgFeatureVecs(clean_essays, model, num_features )
    DataVecs = np.array(DataVecs)
    y_pred_ = lstm_model.predict([DataVecs, X_nonseq])
    return y_pred_*10
#catFeat lists the position of every group column; it is not passed to the explainer below.
catFeat = list(np.arange(len(Xtrain.columns[4:])))
#Kernel width: sqrt(number of group columns) * 0.75
#NOTE(review): this appears to equal LIME's documented default kernel width -- confirm.
width = math.sqrt(len(Xtrain.columns[4:])) * 0.75 
#Regression explainer over the ranked group columns; continuous values are discretized into quartiles.
explainer = lime.lime_tabular.LimeTabularExplainer(custom_features, feature_names=Xtrain.columns[4:], mode='regression', kernel_width=width, discretize_continuous= True, discretizer='quartile')


In [23]:
#Save results
#Splits a LIME condition such as '450.75 < wordUsage <= 900.50' at its comparison signs
pattern = r'[<>]'
scoreDf = pd.read_csv('./Data/scoreDf.csv')

#Names of the interpretable groups that LIME explains (the columns after the four base columns)
groupNames = list(Xtrain.columns[4:])

def _group_of_condition(condition, known_groups):
    """Return the group name a LIME condition refers to, or None if none is recognised.

    The condition text is split at '<' and '>'; the first piece that, after removing
    surrounding spaces and '=' signs, equals one of `known_groups` is returned.
    """
    for part in re.split(pattern, condition):
        candidate = part.strip(' =')
        if candidate in known_groups:
            return candidate
    return None

#One record per explained essay; the DataFrame is built once at the end so that float
#weights are never written into integer placeholder columns.
limeRows = []
for sample_instance_index in range(len(X)):
    print('Essay:', sample_instance_index, '/', len(X))

    essay_row = X.iloc[sample_instance_index]
    sample_instance = essay_row[Xtrain.columns[4:]]
    trackID = essay_row['Index']
    print('Sample:', trackID)
    text = essay_row['Text']

    exp = explainer.explain_instance(sample_instance, predict_fn, num_features=len(groupNames), num_samples=5000)
    expList = exp.as_list()

    #Create Overview: total LIME weight per group for this essay
    contributions = {name: 0.0 for name in groupNames}
    for item in expList:
        print('item:', item)
        #item is (condition text, weight). The group is matched by exact name; the
        #previous "category in 'argumentation'" test also accepted any substring of that
        #word, including an empty string.
        category = _group_of_condition(item[0], groupNames)
        if category is None:
            print('Wrong:', item[0])
        else:
            contributions[category] += item[1]

    record = {'Index': trackID}
    record.update(contributions)
    limeRows.append(record)

limeDf = pd.DataFrame(limeRows, columns=['Index'] + groupNames)
     
        


Essay: 0 / 50
Sample: 575


item: ('writingQuality > 1350.25', 0.39825719500674334)
item: ('textLength > 1350.25', 0.22537897713885613)
item: ('450.75 < wordUsage <= 900.50', -0.18686204350897465)
item: ('450.75 < argumentation <= 900.50', -0.10488411836496435)
Essay: 1 / 50
Sample: 1512
item: ('writingQuality > 1350.25', 0.3986045731682742)
item: ('argumentation <= 450.75', -0.3042128730104195)
item: ('900.50 < wordUsage <= 1350.25', 0.26611706495187015)
item: ('textLength <= 450.75', -0.2565966708781153)
Essay: 2 / 50
Sample: 896
item: ('wordUsage <= 450.75', -0.7161786646621319)
item: ('writingQuality <= 450.75', -0.43092314829901013)
item: ('450.75 < textLength <= 900.50', -0.11469018699415322)
item: ('450.75 < argumentation <= 900.50', -0.07698199567078882)
Essay: 3 / 50
Sample: 1349
item: ('wordUsage <= 450.75', -0.746970968147116)
item: ('textLength <= 450.75', -0.2911646261932919)
item: ('900.50 < writingQuality <= 1350.25', 0.1568624110129333)
item: ('450.75 < argumentation <= 900.50', -0.069985933155580

In [27]:
#Persist the per-essay LIME contributions. This runs before 'positive_count' is added
#below, so the CSV does not contain that column.
limeDf.to_csv('./Data/limeDf.csv', index=False)

def count_positive_values(row):
    """Return how many values in `row` are strictly greater than zero."""
    positives = [value for value in row if value > 0]
    return len(positives)

#Number of groups with a positive LIME contribution, per essay.
#Only the contribution columns are inspected. Counting the whole row and subtracting 1 for
#'Index' is wrong whenever 'Index' is not positive, and on a re-run it also counted the
#previous 'positive_count' value, inflating every non-zero count by one.
contribution_columns = [name for name in limeDf.columns if name not in ('Index', 'positive_count')]
limeDf['positive_count'] = limeDf[contribution_columns].apply(count_positive_values, axis=1)
limeDf


Unnamed: 0,Index,wordUsage,writingQuality,textLength,argumentation,positive_count
0,575,-0.186862,0.398257,0.225379,-0.104884,3
1,1512,0.266117,0.398605,-0.256597,-0.304213,3
2,896,-0.716179,-0.430923,-0.11469,-0.076982,0
3,1349,-0.746971,0.156862,-0.291165,-0.069986,2
4,1795,-0.723339,-0.430234,-0.107689,-0.295101,0
5,438,-0.19061,-0.066515,-0.082003,-0.328384,0
6,1178,-0.182588,0.413589,-0.061156,-0.052503,2
7,1468,-0.71698,-0.081229,-0.081505,-0.073549,0
8,461,-0.744794,-0.434095,0.121277,-0.264683,2
9,845,-0.730682,0.1428,-0.016025,-0.323195,2


In [25]:
#Inspection only: show the (condition, weight) pairs of `exp`, which at this point is the
#explanation left over from the last iteration of the explanation loop.
expList = exp.as_list()
expList

[('writingQuality > 1350.25', 0.326440983475948),
 ('argumentation <= 450.75', -0.27770559050880156),
 ('450.75 < wordUsage <= 900.50', -0.1848107723919241),
 ('450.75 < textLength <= 900.50', -0.09176521850165788)]

In [26]:
#Boolean Series marking test essays whose four group values repeat those of an earlier row.
#The result is only stored here, not displayed.
duplicate_rows = X.duplicated(subset=['wordUsage', 'writingQuality', 'textLength', 'argumentation'])