# Data Preprocessing

In [19]:
import re
import string

def clean_text(essay):
    essay=str(essay)
    result = re.sub(r'http[^\s]*','',essay)
    # Replace sentence that is http followed till non-whitespace character, with blank space.
    # [^abc] means Find any character NOT between the brackets
    # \s means whitespace character
    # [^\s] means Find any non-whitespace NOT between the brackets
    result = re.sub('[0-9]+','', result).lower()
    # Replace any number between 0-9. + mean one or more
    result = re.sub('@[a-z0-9]+', '', result)
    return re.sub('[%s]*' % string.punctuation, '',result)

In [51]:
print(clean_text('hello http:\\www.yahoo.com cool asf98t 123'))

hello  cool asft 


In [4]:
# This function will remove any emojis in essay
def deEmojify(essay):
    return essay.encode('ascii', 'ignore').decode('ascii')

In [5]:
print(deEmojify('hello 😍😍😍😍😍😍😍😝'))

hello 


In [27]:
# Function to calculate number of characters in a  essay
def char_count(essay):
    clean_essay = re.sub(r'\s', '', str(essay).lower())
    #Remove any whitespace
    return len(clean_essay)

In [29]:
print(char_count('hello I am good'))

12


In [6]:
import nltk
nltk.download('punkt')

# Function to calculate number of words in a  essay
def word_count(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    # Replace a non-word character with space
    # \W  mean to find a non-word character
    words = nltk.word_tokenize(clean_essay)
    # A sentence or data can be split into words using the method
    return len(words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sahil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [44]:
print(word_count("All work 😍 and no play makes jack a dull boy, all work and no play"))
# 'a' and 😍 is not a token

15


In [45]:
def sent_count(essay):
    sentences = nltk.sent_tokenize(essay)
    return len(sentences)

In [50]:
print(sent_count('All work and no play makes jack dull boy. All work and no play makes jack a dull boy. I am, good'))

3


In [64]:
# Function to return average length of words in sentence
def avg_word_len(essay):
    clean_essay = re.sub(r'\W', ' ', essay)
    words = nltk.word_tokenize(clean_essay)
    #z = 0
    #for word in words:
    #    z = z + len(word)
    #print(z/len(words))
    return sum(len(word) for word in words) / len(words)

In [65]:
print(avg_word_len("All work and no play makes jack dull boy. All work and no play makes jack a dull boy."))

3.4210526315789473


In [87]:
# Function to get tokenize words in a sentence
def get_wordlist(sentence): 
    clean_sentence = re.sub("[^A-Z0-9a-z]"," ", sentence)
    # Replace non-alphanumeric with space.
    wordlist = nltk.word_tokenize(clean_sentence)
    return wordlist

In [88]:
get_wordlist('All?work and no#play makes jack dull boy.')

['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'dull', 'boy']

In [90]:
# Function to tokenize whole essay into words.
def tokenize(essay):
    stripped_essay = essay.strip()
    # Remove trailing and leading spaces.
    
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(stripped_essay)
    # Tokenize into sentences
    
    tokenized_sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            tokenized_sentences.append(get_wordlist(raw_sentence))
    
    return tokenized_sentences

In [108]:
essay = "  All work and no play makes jack dull boy. Goal is nice. Hello me. To the group. I am nice. "
tokenize(essay)

[['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'dull', 'boy'],
 ['Goal', 'is', 'nice'],
 ['Hello', 'me'],
 ['To', 'the', 'group'],
 ['I', 'am', 'nice']]

In [138]:
from nltk.corpus import wordnet
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# pos_tag require nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger')

# Function to create lemmas and return the number of lemma in essay
def count_lemmas(essay):
    
    tokenized_sentences = tokenize(essay)
    # Return word token of each sentence
    
    lemmas = []
    wordnet_lemmatizer = WordNetLemmatizer()
    #Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item.
    #So it links words with similar meaning to one word.
    #Examples of lemmatization:
    #rocks : rock
    #corpora : corpus
    #better : good
    for sentence in tokenized_sentences:
        tagged_tokens = nltk.pos_tag(sentence)
        # Return words in a sentence w/ POS tag 
        for token_tuple in tagged_tokens:
            pos_tag = token_tuple[1]
            if pos_tag.startswith('N'): 
                pos = wordnet.NOUN
                # Lemmatize take simple pos tags. NN will be given as n and so on for others. 
                lemmas.append(wordnet_lemmatizer.lemmatize(token_tuple[0], pos))
                # Lemmatize word w/ pos tag to get similar meaning even if words are diffrent
            elif pos_tag.startswith('J'):
                pos = wordnet.ADJ
                lemmas.append(wordnet_lemmatizer.lemmatize(token_tuple[0], pos))
            elif pos_tag.startswith('V'):
                pos = wordnet.VERB
                lemmas.append(wordnet_lemmatizer.lemmatize(token_tuple[0], pos))
            elif pos_tag.startswith('R'):
                pos = wordnet.ADV
                lemmas.append(wordnet_lemmatizer.lemmatize(token_tuple[0], pos))
            else:
                pos = wordnet.NOUN
                lemmas.append(wordnet_lemmatizer.lemmatize(token_tuple[0], pos))  
    lemma_count = len(set(lemmas))
    # Set remove repeating items
    return lemma_count

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sahil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sahil\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [139]:
essay = "  All work and no play makes jack dull good boy. Goal is nice. Hello me. To the better group. I am nice. "
tc = tokenize(essay)


z = nltk.pos_tag(tc[0]) 

print(z)
print(z[0])
print(z[0][1])

print(wordnet.NOUN)

print(WordNetLemmatizer().lemmatize('better', 'a'))

count_lemmas(essay)

[('All', 'DT'), ('work', 'NN'), ('and', 'CC'), ('no', 'DT'), ('play', 'NN'), ('makes', 'VBZ'), ('jack', 'NN'), ('dull', 'JJ'), ('good', 'JJ'), ('boy', 'NN')]
('All', 'DT')
DT
n
good


19

In [136]:
set([1,1,2,3,4,2,4,2,5])

{1, 2, 3, 4, 5}

In [147]:
import collections

# Function to count the number of mispell words in our essay
def count_spell_error(essay):    
    clean_essay = re.sub(r'\W', ' ', str(essay).lower())
    clean_essay = re.sub(r'[0-9]', '', clean_essay)
    #big.txt: It is a concatenation of public domain book excerpts from Project Gutenberg and lists of most frequent words from Wiktionary and the British National Corpus.It contains about a million words.
    data = open('big.txt').read()
    words_ = re.findall('[a-z]+', data.lower())
    word_dict = collections.defaultdict(lambda: 0)
    for word in words_:
        word_dict[word] += 1
    clean_essay = re.sub(r'\W', ' ', str(essay).lower())
    clean_essay = re.sub(r'[0-9]', '', clean_essay)
    mispell_count = 0
    words = clean_essay.split()   
    for word in words:
        if not word in word_dict:
            mispell_count += 1
    return mispell_count

In [148]:
count_spell_error("I am wronf. What shoulf I do?")

2

In [142]:
# Function to calculate number of nouns, adjectives, verbs and adverbs in an essay
def count_pos(essay):
    tokenized_sentences = tokenize(essay)
    noun_count = 0
    adj_count = 0
    verb_count = 0
    adv_count = 0
    for sentence in tokenized_sentences:
        tagged_tokens = nltk.pos_tag(sentence)
        for token_tuple in tagged_tokens:
            pos_tag = token_tuple[1]
            if pos_tag.startswith('N'): 
                noun_count += 1
            elif pos_tag.startswith('J'):
                adj_count += 1
            elif pos_tag.startswith('V'):
                verb_count += 1
            elif pos_tag.startswith('R'):
                adv_count += 1
    return noun_count, adj_count, verb_count, adv_count

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
# getiing Bag of Words (BOW) counts
def get_count_vectors(essays):
    vectorizer=CountVectorizer(max_features=10000,ngram_range=(1,3))
    # Convert a collection of text documents to a matrix of token counts
    # If ‘english’, a built-in stop word list for English is used. 
    count_vectors = vectorizer.fit_transform(essays)
    #Learn the vocabulary dictionary and return term-document matrix.
    #Shows which documents contain which terms and how many times they appear.
    feature_names = vectorizer.get_feature_names()
    return feature_names, count_vectors

# Training

In [22]:
import pandas as pd
dataframe = pd.read_csv('essays_and_scores.csv', encoding = 'latin-1')
data = dataframe[['essay_set','essay','domain1_score']].copy()
data.shape

(12978, 3)

In [21]:
from sklearn.model_selection import train_test_split
#splitting data into train data and test data (70/30)
feature_names_cv,count_vectors=get_count_vectors(data[data['essay_set']==1]['essay'])
X_cv=count_vectors.toarray()
y_cv=data[data['essay_set']==1]['domain1_score'].as_matrix()
X_train,X_test,y_train,y_test=train_test_split(X_cv,y_cv,test_size=0.3)

  """


In [17]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score
import numpy as np
# Training a Linear Regression model using only Bag of Words (BOW)
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)
y_pred = linear_regressor.predict(X_test)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print('Variance score: %.2f' % linear_regressor.score(X_test, y_test))
print('Cohen\'s kappa score: %.2f' % cohen_kappa_score(np.rint(y_pred), y_test))
# The Kappa statistic (or value) is a metric that compares an Observed Accuracy with an Expected Accuracy (random chance).

Mean squared error: 1.76
Variance score: 0.32
Cohen's kappa score: 0.17


In [15]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
# Training a Lasso Regression model using only Bag of Words (BOW)
alphas = np.array([3, 1, 0.3, 0.1, 0.03, 0.01])
lasso_regressor = Lasso()
grid = GridSearchCV(estimator = lasso_regressor, param_grid = dict(alpha=alphas))
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print('Variance score: %.2f' % grid.score(X_test, y_test))
print('Cohen\'s kappa score: %.2f' % cohen_kappa_score(np.rint(y_pred), y_test))



Mean squared error: 1.06
Variance score: 0.59
Cohen's kappa score: 0.24


In [166]:
# extracting essay features
def extract_features(data):
    features = data.copy()
    features['char_count'] = features['essay'].apply(char_count)
    features['word_count'] = features['essay'].apply(word_count)
    features['sent_count'] = features['essay'].apply(sent_count)
    features['avg_word_len'] = features['essay'].apply(avg_word_len)
    features['lemma_count'] = features['essay'].apply(count_lemmas)
    features['spell_err_count'] = features['essay'].apply(count_spell_error)
    features['noun_count'], features['adj_count'], features['verb_count'], features['adv_count'] = zip(*features['essay'].map(count_pos))
    return features

In [167]:
features_set1=extract_features(data[data['essay_set']==1])

In [168]:
# splitting data (BOW + other features) into train data and test data (70/30)
X = np.concatenate((features_set1.iloc[:, 3:].as_matrix(), X_cv), axis = 1)
y = features_set1['domain1_score'].as_matrix()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

  This is separate from the ipykernel package so we can avoid doing imports until
  """


In [169]:
# Training a Lasso Regression model using all the features(BOW+other features)
alphas = np.array([3, 1, 0.3, 0.1])
lasso_regressor = Lasso()
grid = GridSearchCV(estimator = lasso_regressor, param_grid = dict(alpha=alphas))
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print('Variance score: %.2f' % grid.score(X_test, y_test))
print('Cohen\'s kappa score: %.2f' % cohen_kappa_score(np.rint(y_pred), y_test))



0.7102698076299961
0.1
Mean squared error: 0.65
Variance score: 0.71
Cohen's kappa score: 0.37


In [170]:
# splitting data (only 10 features) into train data and test data (70/30)
X = features_set1.iloc[:, 3:].as_matrix()
y = features_set1['domain1_score'].as_matrix()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

  This is separate from the ipykernel package so we can avoid doing imports until
  """


In [171]:
# Training a Lasso Regression model using only 10 features
alphas = np.array([3, 1, 0.3, 0.1, 0.3])
lasso_regressor = Lasso()
grid = GridSearchCV(estimator = lasso_regressor, param_grid = dict(alpha=alphas))
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print('Variance score: %.2f' % grid.score(X_test, y_test))
print('Cohen\'s kappa score: %.2f' % cohen_kappa_score(np.rint(y_pred), y_test))



0.7158628075449887
0.3
Mean squared error: 0.71
Variance score: 0.69
Cohen's kappa score: 0.37
