In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data files read further down live under it.
drive.mount('/content/drive')

In [None]:
import os
import csv
import copy
import numpy as np
import nltk
import nltk.tokenize as tk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Work from the root of the mounted Drive.
os.chdir('/content/drive/My Drive')
# Tokenizer models for nltk.tokenize. Older NLTK releases load 'punkt', newer
# ones load 'punkt_tab'; fetching both keeps word_tokenize / sent_tokenize
# usable whichever NLTK version the runtime ships.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

Load data

In [None]:
# Read the Set 1 essays (tab-separated). The file is opened in a `with` block so
# the handle is closed as soon as pandas has parsed it.
with open(r'/content/drive/My Drive/TrainData/set1.txt', 'r') as set1_file:
    set_data = pd.read_csv(set1_file, sep='\t')
# Convert the frame to a dict mapping each column name to a list of values
# (row order is kept as read; nothing is shuffled here).
set_data = set_data.to_dict(orient='list')
# Number of essays, taken from the length of the score column.
size = len(set_data['domain1_score'])
# Pre-computed spelling-error features; merged into df_features on 'id' later on.
df_spell_errors = pd.read_csv('/content/drive/My Drive/Handcrafted Features/spell_errors.txt', sep='\t')

Initialize df_features

In [None]:
# Placeholder table: one row per essay, every feature column filled with 0
# until vanilla_features overwrites it.
zero = [0] * size
feature_columns = [
    'id',
    'word_avg_len', 'word_var_len',
    'sent_avg_len', 'sent_var_len',
    'essay_len_word', 'essay_len_char',
    'label',
]
df_features = pd.DataFrame({column: zero for column in feature_columns})

Compute the hand-crafted features: id, word length, sentence length, essay length, etc.

In [None]:
def vanilla_features(item):
    """Fill one row of the feature table with length statistics of its essay.

    Intended for ``df_features.apply(vanilla_features, axis=1)``. The essay is
    looked up in the module-level ``set_data`` by the row's index label
    (``item.name``) rather than by a global call counter, so the result does
    not depend on how many times pandas invokes the function per row.
    Assumes the frame still carries its default 0..n-1 index, aligned with
    the lists in ``set_data``.

    Parameters
    ----------
    item : pandas.Series
        One row of the feature table; ``item.name`` is the essay position.

    Returns
    -------
    pandas.Series
        An object-dtype copy of ``item`` with these fields set:
        ``id`` ("set1_<position>"), ``label`` (``domain1_score``),
        ``essay_len_word`` (token count), ``essay_len_char`` (character count),
        ``word_avg_len`` / ``sent_avg_len`` (mean token length in characters /
        mean sentence length in tokens) and ``word_var_len`` / ``sent_var_len``
        (max minus min of those lengths, i.e. a range, not a variance).
        For an essay with no tokens the averages are 0.0 and the ranges 0.
    """
    position = item.name
    essay = set_data['essay'][position]
    words = tk.word_tokenize(essay)
    word_len = [len(word) for word in words]
    sent_len = [len(tk.word_tokenize(sent)) for sent in tk.sent_tokenize(essay)]

    # Work on an object-dtype copy: the row arrives with integer placeholders
    # but has to hold a string id and float averages, and the caller's row
    # should not be modified in place.
    row = item.astype(object)
    row['id'] = "set1_" + str(position)
    row['label'] = set_data['domain1_score'][position]
    row['essay_len_word'] = len(words)
    row['essay_len_char'] = len(essay)
    # Guard against essays that tokenize to nothing (empty / whitespace only).
    row['word_avg_len'] = sum(word_len) / len(word_len) if word_len else 0.0
    row['word_var_len'] = max(word_len) - min(word_len) if word_len else 0
    row['sent_avg_len'] = sum(sent_len) / len(sent_len) if sent_len else 0.0
    row['sent_var_len'] = max(sent_len) - min(sent_len) if sent_len else 0

    return row

In [None]:
# Reset the global row counter, then build the feature table one row at a time.
cnt = 0
df_features = df_features.apply(vanilla_features, axis=1)

In [None]:
# Preview the first four rows of the feature table.
df_features.head(4)

Combine these features with those generated from the two dictionaries

In [None]:
# Outer join on 'id': keeps every essay id that appears in either table.
df_features = df_features.merge(df_spell_errors, how='outer', on='id')

Grammar errors: based on bigrams and trigrams

In [None]:
# Two count vectorizers that differ only in n-gram order: word bigrams and word trigrams.
shared_vectorizer_options = dict(lowercase=True, min_df=1)
vec_2 = CountVectorizer(ngram_range=(2, 2), **shared_vectorizer_options)
vec_3 = CountVectorizer(ngram_range=(3, 3), **shared_vectorizer_options)

In [None]:
# Learn the n-gram vocabularies from the essays and count each n-gram per essay
# (one row per essay, one column per n-gram).
essays = set_data['essay']
vec2_fit = vec_2.fit_transform(essays)
vec3_fit = vec_3.fit_transform(essays)

In [None]:
# Dense copies of the count matrices (essays x n-grams). These can be large:
# every essay gets a cell for every n-gram in the vocabulary.
vec2_array, vec3_array = (counts.toarray() for counts in (vec2_fit, vec3_fit))

In [None]:
# Corpus-wide total of each n-gram: add the per-essay counts down every column.
vec2_sum = vec2_array.sum(axis=0)
vec3_sum = vec3_array.sum(axis=0)

In [None]:
# Sanity check: dividing two NumPy arrays works element by element.
np.divide(np.array([1, 0, 3]), np.array([1, 2, 1]))

In [None]:
# Placeholder table for the n-gram based grammar features, one row per essay.
grammar_placeholders = {
    'id': ['0'] * size,
    'bigram_word': [0] * size,
    'trigram_word': [0] * size,
}
df_grammar = pd.DataFrame(grammar_placeholders)

In [None]:
# Grammar-error score per essay: for every n-gram take tf / TF, i.e. its count
# in the essay divided by its count over the whole corpus, and add the ratios up.
# The higher the score, the more likely the essay contains grammar errors
# (an n-gram that occurs only in this essay contributes a full 1.0).
#
# Summing tf / TF over all n-grams is the count matrix times the vector 1 / TF,
# so all essays are scored in one product on the sparse count matrices instead
# of iterating over dense rows in Python. Every vocabulary n-gram occurs at
# least once in the corpus, so the totals are never zero.
# Whole columns are assigned at once so pandas stores the scores as floats
# rather than writing floats one cell at a time into integer placeholders.
df_grammar['id'] = ["set1_" + str(i) for i in range(size)]
df_grammar['bigram_word'] = vec2_fit.dot(1.0 / vec2_sum)
df_grammar['trigram_word'] = vec3_fit.dot(1.0 / vec3_sum)
  

In [None]:
# Outer join on 'id': attach the bigram / trigram scores to the feature table.
df_features = df_features.merge(df_grammar, how='outer', on='id')

In [None]:
# Preview the first five rows of the combined feature table.
df_features.head(5)