In [11]:
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize,RegexpTokenizer
import re
import enchant

[nltk_data] Downloading package stopwords to C:\Users\siliang
[nltk_data]     zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\siliang
[nltk_data]     zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\siliang zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
dataset_root = './dataset/'
filename = 'globalenglish_essay_scoring.csv'

df = pd.read_csv(dataset_root + filename,encoding='latin-1')
print("=========================================")
print(df.head(5))

print("total number of essay: {}".format(len(df.index)))

sample_essay = ''
for index in range(df.shape[0]):
    sample_essay += df['essay'][index]
print("=========================================")
print(sample_essay[:20])

   essay_id  essay_set                                              essay  \
0         1          1  Dear local newspaper, I think effects computer...   
1         2          1  Dear @CAPS1 @CAPS2, I believe that using compu...   
2         3          1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...   
3         4          1  Dear Local Newspaper, @CAPS1 I have found that...   
4         5          1  Dear @LOCATION1, I know having computers has a...   

   rater1_domain1  rater2_domain1  domain1_score  
0             4.0             4.0            8.0  
1             5.0             4.0            9.0  
2             4.0             3.0            7.0  
3             5.0             5.0           10.0  
4             4.0             4.0            8.0  
total number of essay: 12978
Dear local newspaper


In [4]:


word_tokens = word_tokenize(sample_essay)

#remove punctuation
nonPunct = re.compile('.*[A-Za-z0-9].*')

word_tokens = [w for w in word_tokens if nonPunct.match(w)]

#lowercase all words
word_tokens = [w.lower() for w in word_tokens]

custom_stop_words = set(["''","``","'s","n't"])

stop_words = set(stopwords.words('english')) | custom_stop_words
filtered_words = [w for w in word_tokens if w not in stop_words]

print(filtered_words[:20])
print(len(filtered_words))

#count the frequency of words
fdist = nltk.FreqDist(filtered_words)

most_frequent_words = []
for word, freq in fdist.most_common(100):
    most_frequent_words.append(word)
    
print(most_frequent_words)

['dear', 'local', 'newspaper', 'think', 'effects', 'computers', 'people', 'great', 'learning', 'skills/affects', 'give', 'us', 'time', 'chat', 'friends/new', 'people', 'helps', 'us', 'learn', 'globe']
1404175
['people', 'would', 'caps1', 'computers', 'like', 'one', 'time', 'computer', 'get', 'could', 'books', 'also', 'think', 'building', 'things', 'many', 'go', 'caps2', 'family', 'book', 'even', 'way', 'author', 'parents', 'know', 'life', 'make', 'friends', 'good', 'going', 'offensive', 'want', 'take', 'story', 'us', 'caps3', 'see', 'read', 'new', 'day', 'something', 'home', 'said', 'much', 'got', 'mood', 'library', 'back', 'state', 'dirigibles', 'use', 'num1', 'children', 'help', 'another', 'music', 'cyclist', 'find', 'person1', 'always', 'need', 'really', 'empire', 'around', 'thing', 'say', 'kids', 'caps4', 'person', 'world', 'first', 'right', 'bad', 'made', 'movies', 'libraries', 'someone', 'reason', 'month1', 'paragraph', 'mast', 'never', 'everyone', 'school', 'learn', 'away', 'wat

In [6]:
def pos_tagging_feature(essay):
    """
        we only need 
        'NN/NNS/NNP/NNPS' -> noun
        'VB/VBD/VBG/VBN/VBP/VBZ' -> verb
        'JJ/JJR/JJS' -> adjective
    """
    word_tokens = word_tokenize(essay)
    words_tagged = nltk.pos_tag(word_tokens)
    
    tag_freq = nltk.FreqDist([tag for (word, tag) in words_tagged])
    
    tag_list = tag_freq.most_common()
    print(tag_list)
    tag_dict = {}
    tag_dict['NN'] = sum([ pair[1] for pair in tag_list if re.match(r"NN.*",pair[0])])
    tag_dict['VB'] = sum([pair[1] for pair in tag_list if re.match(r"VB.*",pair[0])])
    tag_dict['JJ'] = sum([pair[1] for pair in tag_list if re.match(r"JJ.*",pair[0])])
    
    return tag_dict

one_essay = df['essay'][0]
tag_dict = pos_tagging_feature(one_essay)
print(tag_dict)

[('NN', 55), ('IN', 53), ('PRP', 29), ('VB', 21), ('RB', 21), ('JJ', 20), ('DT', 20), (',', 18), ('NNS', 17), ('.', 16), ('VBP', 14), ('VBZ', 14), ('CC', 14), ('VBG', 14), ('NNP', 12), ('PRP$', 12), ('TO', 10), ('MD', 5), ('WRB', 4), ('VBN', 4), ('RP', 3), (':', 2), ('JJR', 2), ('(', 1), (')', 1), ('EX', 1), ('VBD', 1), ('WP', 1), ('POS', 1)]
{'NN': 84, 'VB': 68, 'JJ': 22}


In [15]:
def statistical_feature(essay):
    
    #average sentence length
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sent_list = sent_detector.tokenize(essay.strip())
    
    tokenizer = RegexpTokenizer(r'\w+')
    
    #sentence count
    num_sentence = len(sent_list)

    #average length
    average_sent_length = sum([len(tokenizer.tokenize(sentence)) for sentence in sent_list ]) / num_sentence
    
    
    word_tokens = tokenizer.tokenize(essay)
    
    #lower case
    word_tokens = [w.lower() for w in word_tokens]
    
    #remove stopwords

    stop_words = set(stopwords.words('english'))
    
    word_tokens = [w for w in word_tokens if w not in stop_words]
    
    #words count
    num_words = len(word_tokens)
    
    statis_dict = {}
    statis_dict['num_words'] = num_words
    statis_dict['num_sentences'] = num_sentence
    statis_dict['average_sent_length'] = average_sent_length

    return statis_dict
stat = statistical_feature(one_essay)
print(stat)

{'num_words': 166, 'num_sentences': 16, 'average_sent_length': 21.875}


In [None]:
def orth_feature(essay):
    