In [8]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pathlib
from collections import Counter
from statistics import mean

import language_tool_python
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns; 
import textstat as ts
from textblob import TextBlob
from textblob import Word
# Fetch the NLTK data packages one after another (presumably the resources
# TextBlob relies on later in the notebook -- TODO confirm), then apply the
# seaborn plotting theme.
for nltk_package in ('punkt', 'brown', 'averaged_perceptron_tagger'):
    nltk.download(nltk_package)
sns.set(style="ticks", color_codes=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yeshu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\yeshu\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\yeshu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [9]:
import os
import pandas as pd

# Directory that holds the dataset files.
DATASET_DIR = 'data/'

# Read the tab-separated training set with the ISO-8859-1 encoding.
X = pd.read_csv(
    os.path.join(DATASET_DIR, 'training_set_rel3.tsv'),
    sep='\t',
    encoding='ISO-8859-1',
)

# Keep the score column aside before any columns are removed.
y = X['domain1_score']

# Discard every column that contains at least one missing value, then remove
# the two per-rater score columns to obtain the working frame.
X = X.dropna(axis='columns')
df = X.drop(['rater1_domain1', 'rater2_domain1'], axis='columns')

In [10]:
# Show the working frame left after the column drops; as the cell's last
# expression it is rendered as the cell output.
df

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8
...,...,...,...,...
12971,21626,8,In most stories mothers and daughters are eit...,35
12972,21628,8,I never understood the meaning laughter is th...,32
12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",40
12974,21630,8,Trippin' on fen...,40


In [11]:
# Names of the engineered feature columns, grouped by theme.
meta_features = ['essay_length', 'avg_sentence_length', 'avg_word_length']
grammar_features = ['sentiment', 'noun_phrases', 'syntax_errors']
redability_features = ['readability_index', 'difficult_words']

# Add the feature columns explicitly. DataFrame.reindex returns a new frame
# (so a bare call changes nothing), and reindexing to only the feature names
# would drop 'essay' and the other existing columns. NaN (float) marks rows
# whose features have not been computed yet, e.g. after an interrupted run.
feature_columns = meta_features + grammar_features + redability_features
df = df.assign(**{column: np.nan for column in feature_columns})

# Raw essay texts as an array, addressed by row position.
essays = df['essay'].values

In [13]:
def add_meta_feature_columns(index, df, blob):
    """Write length and sentiment features for one essay into ``df``.

    Sets ``essay_length``, ``avg_sentence_length``, ``avg_word_length`` and
    ``sentiment`` at row label ``index``.

    Parameters
    ----------
    index : row label of ``df`` to write to.
    df : pandas.DataFrame, modified in place.
    blob : object exposing ``words``, ``sentences`` and
        ``sentiment.polarity`` (a ``TextBlob`` in this notebook).

    Notes
    -----
    An essay without words or sentences gets 0.0 for the corresponding
    average instead of raising ``statistics.StatisticsError``.
    """
    words = blob.words

    # Essay length (number of words)
    df.at[index, 'essay_length'] = len(words)

    # Average sentence length, counted in single-space separated tokens
    sentence_lengths = [len(sentence.split(' ')) for sentence in blob.sentences]
    df.at[index, 'avg_sentence_length'] = (
        mean(sentence_lengths) if sentence_lengths else 0.0
    )

    # Average word length (characters per word)
    word_lengths = [len(word) for word in words]
    df.at[index, 'avg_word_length'] = mean(word_lengths) if word_lengths else 0.0

    # Sentiment polarity of the whole essay
    df.at[index, 'sentiment'] = blob.sentiment.polarity

# LanguageTool instances shared across calls, keyed by language code.
# Building one per essay is what made the original loop so slow, and the
# instances were never closed.
_LANGUAGE_TOOLS = {}


def _get_language_tool(language='en-US'):
    """Return the cached LanguageTool for ``language``, creating it on first use."""
    if language not in _LANGUAGE_TOOLS:
        _LANGUAGE_TOOLS[language] = language_tool_python.LanguageTool(language)
    return _LANGUAGE_TOOLS[language]


def add_grammar_feature_columns(index, df, blob, essay, tool=None):
    """Write noun-phrase and error-count features for one essay into ``df``.

    Sets ``noun_phrases`` and ``syntax_errors`` at row label ``index`` and
    prints a single-line progress message.

    Parameters
    ----------
    index : integer row label of ``df`` to write to (also used in the
        progress message as ``index + 1``).
    df : pandas.DataFrame, modified in place.
    blob : object exposing ``noun_phrases`` (a ``TextBlob`` in this notebook).
    essay : str, raw essay text handed to the checker.
    tool : optional object with a ``check(text)`` method returning a sized
        collection of matches. Defaults to one shared
        ``language_tool_python.LanguageTool('en-US')`` that is created once
        and reused for every essay.
    """
    # Number of noun phrases
    df.at[index, 'noun_phrases'] = len(blob.noun_phrases)

    # Number of possible spelling and grammatical mistakes.
    # Still the slowest step, but the checker is now started only once.
    print("Processed %5d essays for correctness..." % (index + 1), end="\r")
    checker = tool if tool is not None else _get_language_tool()
    df.at[index, 'syntax_errors'] = len(checker.check(essay))

def add_redability_feature_columns(index, df, essay):
    """Store textstat readability measures for one essay in ``df``.

    Writes ``readability_index`` and ``difficult_words`` at row label
    ``index``; ``df`` is modified in place and nothing is returned.
    """
    # Automated Readability Index computed by textstat for the essay text
    ari_score = ts.automated_readability_index(essay)
    df.at[index, 'readability_index'] = ari_score

    # Value returned by textstat's difficult_words for the essay text
    hard_word_count = ts.difficult_words(essay)
    df.at[index, 'difficult_words'] = hard_word_count
print("Adding feature Columns...")

# Visit every row by position; the position is also passed on as the row
# label (assumes df keeps its default 0..n-1 index -- TODO confirm).
row_count = df.shape[0]
for i in range(row_count):
    essay_text = essays[i]
    blob = TextBlob(essay_text)

    # Fill the three feature groups for this essay, in the same order as before.
    add_meta_feature_columns(i, df, blob)
    add_grammar_feature_columns(i, df, blob, essay_text)
    add_redability_feature_columns(i, df, essay_text)

print("\nDone!")

Adding feature Columns...
Processed     1 essays for correctness...

Downloading LanguageTool 5.7: 100%|██████████| 225M/225M [00:16<00:00, 13.7MB/s] 
Unzipping C:\Users\yeshu\AppData\Local\Temp\tmp5nkj765b.zip to C:\Users\yeshu\.cache\language_tool_python.
Downloaded https://www.languagetool.org/download/LanguageTool-5.7.zip to C:\Users\yeshu\.cache\language_tool_python.


Processed    12 essays for correctness...

KeyboardInterrupt: 