In [289]:
import pandas as pd
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

from string import punctuation

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import re

In [310]:
# Load the titled samples, keeping only the grade label and the essay text.
df_with_titles = pd.read_csv(
    '../data/samples_with_titles.csv',
    skipinitialspace=True,
    sep=',',
    quotechar='"',
    escapechar='\\',
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # `on_bad_lines='warn'` keeps the same behaviour: malformed rows are
    # skipped and a warning is emitted for each of them.
    on_bad_lines='warn',
    usecols=['Grade', 'Text'],
)
# Drop the first and the last character of every text.
# NOTE(review): presumably stray wrapping quotes left in the file - confirm.
df_with_titles['Text'] = df_with_titles['Text'].str[1:-1]
df_with_titles.tail()

Unnamed: 0,Grade,Text
284,0,Make Way for Ducklings I like Make Way for Duc...
285,0,My mom is special because she plays with me.
286,11,To whom it may concern: A group of parents and...
287,9,To Teachers and Whom It May Concern: A proposa...
288,6,Tom Sawyer Having complicated characters in a ...


In [309]:
# Inspect the frame loaded above. `df` is only created in a later cell, so
# calling `df.info()` here fails with NameError on a fresh kernel; the output
# recorded below (289 rows) corresponds to `df_with_titles`.
df_with_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289 entries, 0 to 288
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Grade   289 non-null    int64 
 1   Text    289 non-null    object
dtypes: int64(1), object(1)
memory usage: 4.6+ KB


In [315]:
# Load the untitled samples, keeping only the grade label and the essay text.
df = pd.read_csv(
    '../data/samples_no_title.csv',
    skipinitialspace=True,
    sep=',',
    quotechar='"',
    escapechar='\\',
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # `on_bad_lines='warn'` keeps the same behaviour: malformed rows are
    # skipped and a warning is emitted for each of them.
    on_bad_lines='warn',
    usecols=['Grade', 'Text'],
)
# Collapse grades 10, 11 and 12 into the single label 10.
df.loc[df['Grade'].isin([10, 11, 12]), 'Grade'] = 10
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 323 entries, 0 to 322
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Grade   323 non-null    int64 
 1   Text    323 non-null    object
dtypes: int64(1), object(1)
memory usage: 5.2+ KB


In [316]:
# Number of samples per grade label, ordered by grade.
df['Grade'].value_counts().sort_index()

0     21
1     32
2     39
3     36
4     38
5     20
6     49
7     10
8     11
9     47
10    20
Name: Grade, dtype: int64

`nltk_tag_to_wordnet_tag()` is used courtesy of [Gaurav Gupta](https://medium.com/@gaurav5430/using-nltk-for-lemmatizing-sentences-c1bfff963258).

In [323]:
def unique_characters(series):
    """Collect every distinct character occurring in any text of ``series``.

    Parameters
    ----------
    series : iterable of str
        The texts to scan (for example a pandas Series of strings).

    Returns
    -------
    set
        Union of the characters of all texts; empty when ``series`` is empty.
    """
    return set().union(*(set(text) for text in series))

def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK part-of-speech tag into a WordNet POS constant.

    The decision is made on the tag's prefix only:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.

    Parameters
    ----------
    nltk_tag : str
        The tag to translate.

    Returns
    -------
    The matching ``wordnet`` constant, or ``None`` when the tag starts with
    none of the four prefixes.
    """
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            # Looked up only on a match, exactly like the if/elif chain did.
            return getattr(wordnet, attribute)
    return None
    
def retag(tagged_list):
    """Swap the NLTK tag of every ``(token, tag)`` pair for its WordNet tag.

    Parameters
    ----------
    tagged_list : iterable of sequences
        Items whose element 0 is the token and element 1 the NLTK tag.

    Returns
    -------
    list of tuple
        ``(token, wordnet_tag)`` pairs in the original order; the tag is
        whatever ``nltk_tag_to_wordnet_tag`` returns for the NLTK tag.
    """
    converted = []
    for pair in tagged_list:
        converted.append((pair[0], nltk_tag_to_wordnet_tag(pair[1])))
    return converted

def lemma(tagged_tokens):
    """Lemmatize ``(token, wordnet_tag)`` pairs with ``WordNetLemmatizer``.

    Parameters
    ----------
    tagged_tokens : iterable of sequences
        Items whose element 0 is the token and element 1 its WordNet tag,
        or ``None`` when no tag is available.

    Returns
    -------
    list
        One lemma per input pair, in the original order.
    """
    wnl = WordNetLemmatizer()
    return [
        # Without a tag, fall back to the lemmatizer's default part of speech.
        wnl.lemmatize(pair[0]) if pair[1] is None else wnl.lemmatize(pair[0], pair[1])
        for pair in tagged_tokens
    ]

def lower_case(string):
    """Return ``string`` converted to lower case via its ``lower()`` method."""
    lowered = string.lower()
    return lowered

def letters(string):
    """Extract the maximal runs of the lowercase letters a-z from ``string``.

    Everything else (digits, punctuation, whitespace, uppercase letters)
    acts as a separator and is dropped.

    Returns
    -------
    list of str
        The runs in order of appearance; empty when there are none.
    """
    return [match.group(0) for match in re.finditer('[a-z]+',string)]

def clean_tokenize(series):
    """Lower-case each text of ``series`` and split it into letter-only tokens.

    Applies ``lower_case`` and then ``letters`` element-wise.

    Returns
    -------
    The result of the second ``apply``: one token list per input text.
    """
    lowered = series.apply(lower_case)
    tokens = lowered.apply(letters)
    return tokens

def lemmatize(series):
    """Lemmatize every token list in ``series``.

    Each element is passed through ``pos_tag``, then ``retag`` and finally
    ``lemma``, in that order.

    Returns
    -------
    The result of the last ``apply``: one list of lemmas per input element.
    """
    tagged = series.apply(pos_tag)
    wordnet_tagged = tagged.apply(retag)
    return wordnet_tagged.apply(lemma)

def replace_urls(series):
    """Replace URL-like tokens in every text with the placeholder ``*url``.

    A literal ``www.`` is removed first. Afterwards each run of word
    characters (possibly empty) followed by a dot and at least two word
    characters is replaced by ``*url``.

    Parameters
    ----------
    series : pandas.Series of str
        The texts to process.

    Returns
    -------
    pandas.Series
        A new series with the replacements applied.
    """
    without_www = series.str.replace('www.', '', regex=False)
    # Raw string: '\w' and '\.' are not valid Python string escapes, so the
    # non-raw literal triggers an invalid-escape warning on current Python
    # versions. The pattern value itself is unchanged.
    # NOTE(review): the pattern also matches non-URL text such as "3.14" or
    # a sentence break without a space ("fluffy.His").
    return without_www.replace(r'\w*\.\w{2,}', value='*url', regex=True)

def isolate_punctuation(string):
    """Surround every punctuation character in ``string`` with single spaces.

    All characters of ``string.punctuation`` except ``*`` are handled, one
    after another; after each one, double spaces are collapsed once.
    ``*`` is left untouched - presumably so that the ``*url`` placeholder
    produced by ``replace_urls`` stays in one piece.

    Returns
    -------
    str
        The text with the punctuation characters padded by spaces.
    """
    marks = punctuation.replace('*','')
    spaced = string
    for mark in marks:
        padded = spaced.replace(mark, f' {mark} ')
        spaced = padded.replace('  ',' ')
    return spaced

def tag_uppercase(series):
    """Mark capitalised words with a ``+ `` prefix and lower-case them.

    Every match of an uppercase letter A-Z followed by at least one word
    character is replaced by ``'+ '`` plus the lower-cased match. A single
    uppercase letter standing alone (for example ``I``) does not match and
    is left unchanged.

    Parameters
    ----------
    series : pandas.Series of str
        The texts to process.

    Returns
    -------
    pandas.Series
        A new series with the capitalised words tagged.
    """
    def _mark(match):
        # Replacement callback: prefix the lower-cased match with '+ '.
        return f'+ {match.group(0).lower()}'

    # Raw string: '\w' is not a valid Python string escape, so the non-raw
    # literal triggers an invalid-escape warning on current Python versions.
    return series.str.replace(r'[A-Z]\w+', _mark, regex=True)

def remove_weird_chars(string):
    """Return ``string`` with a fixed set of unwanted characters removed.

    The characters dropped are exactly those listed in ``unwanted`` below;
    every other character is kept in its original order.
    """
    unwanted = ('¦', '©', '±', '³', '½', 'Â', 'Ã', 'â', '“', '”', '€')
    kept = []
    for char in string:
        if char in unwanted:
            continue
        kept.append(char)
    return ''.join(kept)

def grammar_text(series):
    """Build the "grammar" view of the texts in ``series``.

    Steps, in order: ``replace_urls`` on the whole series,
    ``remove_weird_chars`` and ``isolate_punctuation`` element-wise, and
    finally ``tag_uppercase`` on the whole series.

    Returns
    -------
    The series returned by ``tag_uppercase``.
    """
    without_urls = replace_urls(series)
    cleaned = without_urls.apply(remove_weird_chars)
    spaced = cleaned.apply(isolate_punctuation)
    return tag_uppercase(spaced)

def lemma_text(series):
    """Tokenize the texts in ``series`` and lemmatize the tokens.

    Runs ``clean_tokenize`` followed by ``lemmatize``.

    Returns
    -------
    The series returned by ``lemmatize``.
    """
    return lemmatize(clean_tokenize(series))

def process_text(df):
    """Add the ``lemmatized`` and ``grammarized`` columns to ``df``.

    ``lemmatized`` holds the lemmas of the ``Text`` column joined by single
    spaces; ``grammarized`` holds the output of ``grammar_text`` for the
    same column.

    Note that ``df`` is modified in place; the same object is returned.
    """
    lemma_lists = lemma_text(df['Text'])
    df['lemmatized'] = lemma_lists.str.join(' ')
    df['grammarized'] = grammar_text(df['Text'])
    return df

In [324]:
# Run the full text pipeline; this adds the `lemmatized` and `grammarized`
# columns to the frame.
df = df.pipe(process_text)
df

Unnamed: 0,Grade,Text,lemmatized,grammarized
0,1,I had just gone to Chobot Space and Science Ce...,i have just go to chobot space and science cen...,I had just gone to + chobot + space and + scie...
1,1,My cat is fluffy. His name is Buzz. He is my f...,my cat be fluffy his name be buzz he be my fav...,+ my cat is fluffy . + his name is + buzz . + ...
2,1,Spring is sweet because we can go boat riding ...,spring be sweet because we can go boat riding ...,+ spring is sweet because we can go boat ridin...
3,1,One day baby Josh came home. He was in a yello...,one day baby josh come home he be in a yellow ...,+ one day baby + josh came home . + he was in ...
4,1,One time I went to Mexico. It was a blast! I m...,one time i go to mexico it be a blast i meet p...,+ one time I went to + mexico . + it was a bla...
...,...,...,...,...
318,10,America in the years leading up to 1918 was as...,america in the year lead up to be a confident ...,+ america in the years leading up to 1918 was ...
319,10,The modern world is full of problems and issue...,the modern world be full of problem and issue ...,+ the modern world is full of problems and iss...
320,10,The violin is arguably the most cherished and ...,the violin be arguably the most cherished and ...,+ the violin is arguably the most cherished an...
321,10,Have you ever wondered how to design complex w...,have you ever wonder how to design complex woo...,+ have you ever wondered how to design complex...


In [325]:
# Persist the processed frame (without the index) to a file named
# `processed_text` in the current working directory.
df.to_csv(path_or_buf='processed_text', index=False)