In [289]:
import pandas as pd
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

from string import punctuation

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import re

In [290]:
# Load the titled writing samples; only the grade label and the text column are read.
df = pd.read_csv(
    '../data/samples_with_titles.csv',
    skipinitialspace=True,
    sep=',',
    quotechar='"',
    escapechar='\\',
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # `on_bad_lines='warn'` keeps the old behaviour: report malformed rows and skip them.
    on_bad_lines='warn',
    usecols=['Grade', 'Text'],
)
# Drop the first and the last character of every text.
# NOTE(review): presumably these are stray wrapping characters left over in the CSV - confirm.
df['Text'] = df['Text'].str[1:-1]
df.head()

Unnamed: 0,Grade,Text
0,1,How Much I Know About Space I had just gone to...
1,1,My Favorite Pet My cat is fluffy. His name is ...
2,1,Sweet Spring Spring is sweet because we can go...
3,1,A Happy Day One day baby Josh came home. He wa...
4,1,My Trip to Mexico One time I went to Mexico. I...


In [291]:
# Load the same kind of samples from the file without titles (grade label and text only).
df_no_titles = pd.read_csv(
    '../data/samples_no_titles.csv',
    skipinitialspace=True,
    sep=',',
    quotechar='"',
    escapechar='\\',
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # `on_bad_lines='warn'` keeps the old behaviour: report malformed rows and skip them.
    on_bad_lines='warn',
    usecols=['Grade', 'Text'],
)
df_no_titles

Unnamed: 0,Grade,Text
0,1,I had just gone to Chobot Space and Science Ce...
1,1,My cat is fluffy. His name is Buzz. He is my f...
2,1,Spring is sweet because we can go boat riding ...
3,1,One day baby Josh came home. He was in a yello...
4,1,One time I went to Mexico. It was a blast! I m...
...,...,...
284,0,I like Make Way for Ducklings the best because...
285,0,My mom is special because she plays with me.
286,11,To whom it may concern: A group of parents ...
287,9,To Teachers and Whom It May Concern: A prop...


In [292]:
# Print the column dtypes, non-null counts and memory usage of the titled samples.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289 entries, 0 to 288
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Grade   289 non-null    int64 
 1   Text    289 non-null    object
dtypes: int64(1), object(1)
memory usage: 4.6+ KB


In [293]:
# Number of samples per grade, listed in ascending grade order.
df['Grade'].value_counts().sort_index()

0     18
1     30
2     37
3     34
4     36
5     18
6     47
7      8
8      8
9     44
10     2
11     4
12     3
Name: Grade, dtype: int64

The `nltk_tag_to_wordnet_tag()` helper below is courtesy of [Gaurav Gupta](https://medium.com/@gaurav5430/using-nltk-for-lemmatizing-sentences-c1bfff963258).

In [294]:
def unique_characters(series):
    """Collect every distinct character that occurs in any text of ``series``.

    Parameters
    ----------
    series : iterable of str
        The texts to scan (e.g. a pandas Series of strings).

    Returns
    -------
    set
        The set of all characters seen; empty when ``series`` is empty.
    """
    seen = set()
    for text in series:
        # Updating with a string adds each of its characters individually.
        seen.update(text)
    return seen

def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map a POS tag string to the corresponding ``wordnet`` POS constant.

    Parameters
    ----------
    nltk_tag : str
        A part-of-speech tag (in this notebook, the tags produced by ``pos_tag``).

    Returns
    -------
    The ``wordnet`` constant for adjectives, verbs, nouns or adverbs when the
    tag starts with 'J', 'V', 'N' or 'R' respectively; ``None`` otherwise.
    """
    # (tag prefix, attribute name on `wordnet`), checked in this order.
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant in prefix_to_constant:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, constant)
    return None
    
def retag(tagged_list):
    """Convert the tag of every ``(token, tag)`` pair to its wordnet equivalent.

    Parameters
    ----------
    tagged_list : iterable of (str, str)
        Pairs of token and POS tag.

    Returns
    -------
    list of tuple
        ``(token, wordnet_tag)`` pairs in the original order; ``wordnet_tag``
        is whatever ``nltk_tag_to_wordnet_tag`` returns for the original tag.
    """
    converted = []
    for pair in tagged_list:
        token = pair[0]
        wordnet_tag = nltk_tag_to_wordnet_tag(pair[1])
        converted.append((token, wordnet_tag))
    return converted

def lemma(tagged_tokens):
    """Lemmatize a sequence of ``(token, wordnet_tag)`` pairs.

    Parameters
    ----------
    tagged_tokens : iterable of (str, tag-or-None)
        Tokens paired with a wordnet POS tag, or ``None`` when no tag applies.

    Returns
    -------
    list
        One lemma per input pair, in order. Tokens whose tag is ``None`` are
        lemmatized without a POS argument; all others pass their tag along.
    """
    wnl = WordNetLemmatizer()
    return [
        wnl.lemmatize(pair[0]) if pair[1] is None else wnl.lemmatize(pair[0], pair[1])
        for pair in tagged_tokens
    ]

def lower_case(string):
    """Return ``string`` converted to lower case."""
    lowered = string.lower()
    return lowered

def letters(string):
    """Split ``string`` into its maximal runs of lower-case ASCII letters.

    Everything that is not in ``a-z`` (digits, punctuation, whitespace,
    upper-case letters) acts as a separator and is discarded.

    Returns
    -------
    list of str
        The letter runs in order of appearance; empty if there are none.
    """
    return [match.group(0) for match in re.finditer('[a-z]+', string)]

def clean_tokenize(series):
    """Lower-case every text in ``series`` and split it into letter-only tokens.

    Returns a Series whose elements are the token lists produced by
    ``letters`` on the lower-cased text.
    """
    lowered = series.apply(lower_case)
    tokens = lowered.apply(letters)
    return tokens

def lemmatize(series):
    """Turn a Series of token lists into a Series of lemma lists.

    Each token list is POS-tagged with ``pos_tag``, the tags are converted
    with ``retag``, and the tagged tokens are lemmatized with ``lemma``.
    """
    pos_tagged = series.apply(pos_tag)
    wordnet_tagged = pos_tagged.apply(retag)
    return wordnet_tagged.apply(lemma)

def replace_urls(series):
    """Replace URL-like tokens in every text with the placeholder ``*url``.

    The literal substring ``'www.'`` is removed first; afterwards every match
    of ``\\w*\\.\\w{2,}`` (optional word characters, a dot, then at least two
    word characters) is replaced by ``*url``.

    NOTE(review): the pattern also matches any other "word.word" sequence,
    e.g. a sentence end without a following space or a decimal such as
    ``3.14`` - confirm that this is acceptable for the corpus.

    Parameters
    ----------
    series : pandas.Series of str

    Returns
    -------
    pandas.Series
        A new Series with the substitutions applied.
    """
    # Raw string: '\w' and '\.' are invalid escape sequences in a normal string
    # literal and trigger a DeprecationWarning/SyntaxWarning on current Python.
    url_like = r'\w*\.\w{2,}'
    without_www = series.str.replace('www.', '', regex=False)
    # Series.replace (not .str.replace) with regex=True substitutes every
    # match found inside each string element.
    return without_www.replace(url_like, value="*url", regex=True)

def isolate_punctuation(string):
    """Surround every punctuation character in ``string`` with single spaces.

    All characters of ``string.punctuation`` except ``'*'`` are processed one
    after another: each occurrence is padded with a space on both sides and
    the double spaces this creates are collapsed once per character.

    NOTE(review): '*' is presumably skipped so that a ``*url`` placeholder
    stays in one piece - confirm.
    """
    symbols = [ch for ch in punctuation if ch != '*']
    spaced = string
    for symbol in symbols:
        spaced = spaced.replace(symbol, ' ' + symbol + ' ')
        # Collapse the double spaces introduced by the padding.
        spaced = spaced.replace('  ', ' ')
    return spaced

def tag_uppercase(series):
    """Mark capitalised words with a leading ``'+ '`` and lower-case them.

    Every match of ``[A-Z]\\w+`` (an upper-case ASCII letter followed by at
    least one word character) is replaced by ``'+ '`` plus the lower-cased
    match. Because the pattern needs at least two characters, single-letter
    words such as ``I`` or ``A`` are left untouched.

    Parameters
    ----------
    series : pandas.Series of str

    Returns
    -------
    pandas.Series
        A new Series with the capitalised words tagged.
    """
    def mark(match):
        # Replacement callback: receives the regex match for one word.
        return f'+ {match.group(0).lower()}'

    # Raw string: '\w' is an invalid escape sequence in a normal string
    # literal and triggers a DeprecationWarning/SyntaxWarning on current Python.
    return series.str.replace(r'[A-Z]\w+', mark, regex=True)

def remove_weird_chars(string):
    """Return ``string`` with a fixed set of unwanted characters removed.

    The characters dropped are listed in ``unwanted`` below; all other
    characters are kept in their original order.
    """
    unwanted = frozenset(('¦', '©', '±', '³', '½', 'Â', 'Ã', 'â', '“', '”', '€'))
    kept = filter(lambda ch: ch not in unwanted, string)
    return ''.join(kept)

def grammar_text(series):
    """Build the "grammarized" representation of every text in ``series``.

    Steps, in order: replace URL-like tokens (``replace_urls``), strip the
    unwanted characters (``remove_weird_chars``), pad punctuation with spaces
    (``isolate_punctuation``) and tag capitalised words (``tag_uppercase``).
    """
    without_urls = replace_urls(series)
    cleaned = without_urls.apply(remove_weird_chars)
    spaced = cleaned.apply(isolate_punctuation)
    return tag_uppercase(spaced)

def lemma_text(series):
    """Build the lemmatized representation of every text in ``series``.

    The texts are tokenized with ``clean_tokenize`` and the resulting token
    lists are passed to ``lemmatize``.
    """
    return lemmatize(clean_tokenize(series))

In [295]:
# Derive both text representations from the raw text and store them as new columns.
df['lemmatized'] = lemma_text(df['Text'])
df['grammarized'] = grammar_text(df['Text'])
df

Unnamed: 0,Grade,Text,lemmatized,grammarized
0,1,How Much I Know About Space I had just gone to...,"[how, much, i, know, about, space, i, have, ju...",+ how + much I + know + about + space I had ju...
1,1,My Favorite Pet My cat is fluffy. His name is ...,"[my, favorite, pet, my, cat, be, fluffy, his, ...",+ my + favorite + pet + my cat is fluffy . + h...
2,1,Sweet Spring Spring is sweet because we can go...,"[sweet, spring, spring, be, sweet, because, we...",+ sweet + spring + spring is sweet because we ...
3,1,A Happy Day One day baby Josh came home. He wa...,"[a, happy, day, one, day, baby, josh, come, ho...",A + happy + day + one day baby + josh came hom...
4,1,My Trip to Mexico One time I went to Mexico. I...,"[my, trip, to, mexico, one, time, i, go, to, m...",+ my + trip to + mexico + one time I went to +...
...,...,...,...,...
284,0,Make Way for Ducklings I like Make Way for Duc...,"[make, way, for, duckling, i, like, make, way,...",+ make + way for + ducklings I like + make + w...
285,0,My mom is special because she plays with me.,"[my, mom, be, special, because, she, play, wit...",+ my mom is special because she plays with me .
286,11,To whom it may concern: A group of parents and...,"[to, whom, it, may, concern, a, group, of, par...",+ to whom it may concern : A group of parents ...
287,9,To Teachers and Whom It May Concern: A proposa...,"[to, teacher, and, whom, it, may, concern, a, ...",+ to + teachers and + whom + it + may + concer...


In [306]:
# Write the current `df` to the file 'processed_text' without the index column.
# NOTE(review): this cell's execution counter (In [306]) is higher than that of the
# cells below it, so it was last run out of order. Under Restart & Run All the file
# is written before `Grade` is recoded in the split cell - confirm which version of
# `df` is meant to be saved.
df.to_csv('processed_text', index=False)

In [297]:
## Split train and test sets
# Model input: each sample's lemma list joined back into one space-separated string.
X = df['lemmatized'].str.join(' ')
# Use the grades as string class labels and pool grades 10, 11 and 12 into '10'.
df['Grade'] = df['Grade'].astype(str)
df.loc[df['Grade'].isin(['10', '11', '12']), 'Grade'] = '10'
y = df['Grade']
# A fixed random_state makes the stratified split identical on every run;
# without it the train/test membership changes each time the cell executes.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [298]:
# Class distribution of the test labels.
y_test.value_counts()

6     12
9     11
3      9
4      9
2      9
1      8
5      5
0      4
8      2
10     2
7      2
Name: Grade, dtype: int64

In [299]:
# Class distribution of the training labels.
y_train.value_counts()

6     35
9     33
2     28
4     27
3     25
1     22
0     14
5     13
10     7
8      6
7      6
Name: Grade, dtype: int64

Relative Error Size is  0.3970149253731343
