In [123]:
import pandas as pd
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re

In [161]:
# Load the titled writing samples, keeping only the grade label and the essay text.
# `on_bad_lines='warn'` is the modern spelling of `error_bad_lines=False`
# (deprecated in pandas 1.3, removed in 2.0): malformed rows are dropped and a
# warning is emitted for each one, exactly as the old keyword behaved.
df = pd.read_csv(
    '../data/samples_with_titles.csv',
    skipinitialspace=True,
    sep=',',
    quotechar='"',
    escapechar='\\',
    on_bad_lines='warn',
    usecols=['Grade', 'Text'],
)
df.head()

Unnamed: 0,Grade,Text
0,1,"""How Much I Know About Space I had just gone t..."
1,1,"""My Favorite Pet My cat is fluffy. His name is..."
2,1,"""Sweet Spring Spring is sweet because we can g..."
3,1,"""A Happy Day One day baby Josh came home. He w..."
4,1,"""My Trip to Mexico One time I went to Mexico. ..."


In [125]:
# Load the title-free variant of the samples with the same parser settings.
# `on_bad_lines='warn'` is the modern spelling of `error_bad_lines=False`
# (deprecated in pandas 1.3, removed in 2.0): malformed rows are dropped and a
# warning is emitted for each one, exactly as the old keyword behaved.
df_no_titles = pd.read_csv(
    '../data/samples_no_titles.csv',
    skipinitialspace=True,
    sep=',',
    quotechar='"',
    escapechar='\\',
    on_bad_lines='warn',
    usecols=['Grade', 'Text'],
)
df_no_titles

Unnamed: 0,Grade,Text
0,1,I had just gone to Chobot Space and Science Ce...
1,1,My cat is fluffy. His name is Buzz. He is my f...
2,1,Spring is sweet because we can go boat riding ...
3,1,One day baby Josh came home. He was in a yello...
4,1,One time I went to Mexico. It was a blast! I m...
...,...,...
284,0,I like Make Way for Ducklings the best because...
285,0,My mom is special because she plays with me.
286,11,To whom it may concern: A group of parents ...
287,9,To Teachers and Whom It May Concern: A prop...


In [126]:
# Summarize the frame: row count, column dtypes, non-null counts and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289 entries, 0 to 288
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Grade   289 non-null    int64 
 1   Text    289 non-null    object
dtypes: int64(1), object(1)
memory usage: 4.6+ KB


In [127]:
# Number of samples per grade label, listed in ascending grade order.
df['Grade'].value_counts().sort_index()

0     18
1     30
2     37
3     34
4     36
5     18
6     47
7      8
8      8
9     44
10     2
11     4
12     3
Name: Grade, dtype: int64

The `nltk_tag_to_wordnet_tag()` helper defined below is courtesy of [Gaurav Gupta](https://medium.com/@gaurav5430/using-nltk-for-lemmatizing-sentences-c1bfff963258).

In [128]:
def unique_characters(series):
    """Return the set of distinct characters found across every text in ``series``.

    Each element of ``series`` is iterated item by item (character by character
    for strings). An empty ``series`` yields an empty set.
    """
    return {char for text in series for char in text}

def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching ``wordnet`` POS constant.

    Only the start of the tag is inspected: ``J`` -> ``wordnet.ADJ``,
    ``V`` -> ``wordnet.VERB``, ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.
    Any other tag returns ``None``.
    """
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    if nltk_tag.startswith('V'):
        return wordnet.VERB
    if nltk_tag.startswith('N'):
        return wordnet.NOUN
    if nltk_tag.startswith('R'):
        return wordnet.ADV
    # No WordNet counterpart for this tag.
    return None
    
def retag(tagged_list):
    """Swap each pair's NLTK tag for its WordNet tag, leaving the word untouched.

    Takes an iterable of ``(word, nltk_tag)`` pairs and returns a list of
    ``(word, wordnet_tag)`` tuples, where the tag is whatever
    ``nltk_tag_to_wordnet_tag`` returns for it (possibly ``None``).
    """
    retagged = []
    for pair in tagged_list:
        retagged.append((pair[0], nltk_tag_to_wordnet_tag(pair[1])))
    return retagged

def lemma(tagged_tokens):
    """Lemmatize ``(word, wordnet_tag)`` pairs with a ``WordNetLemmatizer``.

    Pairs whose tag is ``None`` are lemmatized without a ``pos`` argument (the
    lemmatizer's default). All other pairs pass their tag through as ``pos``.
    Returns the lemmas as a list, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(pair[0]) if pair[1] is None
        else lemmatizer.lemmatize(pair[0], pair[1])
        for pair in tagged_tokens
    ]

def lower_case(string):
    """Return ``string`` converted to lower case via its ``lower()`` method."""
    lowered = string.lower()
    return lowered

def letters(string):
    """Return every maximal run of lowercase ASCII letters in ``string``, in order.

    Anything outside ``a-z`` (uppercase letters, digits, punctuation, whitespace)
    acts as a separator and is dropped, so lower-case the input first if
    capitalized words should survive intact.
    """
    return re.findall('[a-z]+', string)

def clean_tokenize(series):
    """Lower-case each text in ``series``, then split it into a-z letter runs.

    Applies ``lower_case`` and then ``letters`` element-wise, so the result is a
    Series whose elements are lists of lowercase tokens.
    """
    lowered = series.apply(lower_case)
    return lowered.apply(letters)

def lemmatize(series):
    """Turn a Series of token lists into a Series of lemma lists.

    Each element goes through three element-wise steps: ``pos_tag`` to attach
    POS tags, ``retag`` to convert them to WordNet tags, and ``lemma`` to
    produce the lemmas.
    """
    tagged = series.apply(pos_tag)
    retagged = tagged.apply(retag)
    return retagged.apply(lemma)

def remove_weird_chars(string):
    """Return ``string`` with every character from a fixed blocklist removed.

    Blocklisted characters are deleted outright (nothing is substituted), so
    text on either side of a removed character ends up adjacent.
    """
    weird_chars = [ '¦', '©', '±', '³', '½', 'Â', 'Ã', 'â', '˜', '“', '”', '€', '[', ']', '_', '*', '$', '%', '&','"']
    kept = []
    for char in string:
        if char in weird_chars:
            continue
        kept.append(char)
    return ''.join(kept)

In [146]:
# Inventory every distinct character that occurs in the raw text column.
unique_characters(df.Text)

{' ',
 '!',
 '"',
 '$',
 '%',
 '&',
 '(',
 ')',
 '*',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '¦',
 '©',
 '±',
 '³',
 '½',
 'Â',
 'Ã',
 'â',
 '˜',
 '“',
 '”',
 '€'}

In [130]:
# Derive the working columns from the raw essay text:
#   * 'Text Without Weirdness' drops the characters blocklisted in remove_weird_chars;
#   * 'Full Tokens' whitespace-splits that cleaned text (case and punctuation kept),
#     reusing the column instead of running remove_weird_chars a second time;
#   * 'Tokens' lower-cases the raw text and keeps only a-z runs;
#   * 'Lemmas' POS-tags and lemmatizes those tokens.
df['Text Without Weirdness'] = df['Text'].apply(remove_weird_chars)
df['Full Tokens'] = df['Text Without Weirdness'].str.split()
df['Tokens'] = clean_tokenize(df['Text'])
df['Lemmas'] = lemmatize(df['Tokens'])
df

Unnamed: 0,Grade,Text,Text Without Weirdness,Full Tokens,Tokens,Lemmas
0,1,"""How Much I Know About Space I had just gone t...",How Much I Know About Space I had just gone to...,"[How, Much, I, Know, About, Space, I, had, jus...","[how, much, i, know, about, space, i, had, jus...","[how, much, i, know, about, space, i, have, ju..."
1,1,"""My Favorite Pet My cat is fluffy. His name is...",My Favorite Pet My cat is fluffy. His name is ...,"[My, Favorite, Pet, My, cat, is, fluffy., His,...","[my, favorite, pet, my, cat, is, fluffy, his, ...","[my, favorite, pet, my, cat, be, fluffy, his, ..."
2,1,"""Sweet Spring Spring is sweet because we can g...",Sweet Spring Spring is sweet because we can go...,"[Sweet, Spring, Spring, is, sweet, because, we...","[sweet, spring, spring, is, sweet, because, we...","[sweet, spring, spring, be, sweet, because, we..."
3,1,"""A Happy Day One day baby Josh came home. He w...",A Happy Day One day baby Josh came home. He wa...,"[A, Happy, Day, One, day, baby, Josh, came, ho...","[a, happy, day, one, day, baby, josh, came, ho...","[a, happy, day, one, day, baby, josh, come, ho..."
4,1,"""My Trip to Mexico One time I went to Mexico. ...",My Trip to Mexico One time I went to Mexico. I...,"[My, Trip, to, Mexico, One, time, I, went, to,...","[my, trip, to, mexico, one, time, i, went, to,...","[my, trip, to, mexico, one, time, i, go, to, m..."
...,...,...,...,...,...,...
284,0,"""Make Way for Ducklings I like Make Way for Du...",Make Way for Ducklings I like Make Way for Duc...,"[Make, Way, for, Ducklings, I, like, Make, Way...","[make, way, for, ducklings, i, like, make, way...","[make, way, for, duckling, i, like, make, way,..."
285,0,"""My mom is special because she plays with me.""",My mom is special because she plays with me.,"[My, mom, is, special, because, she, plays, wi...","[my, mom, is, special, because, she, plays, wi...","[my, mom, be, special, because, she, play, wit..."
286,11,"""To whom it may concern: A group of parents an...",To whom it may concern: A group of parents and...,"[To, whom, it, may, concern:, A, group, of, pa...","[to, whom, it, may, concern, a, group, of, par...","[to, whom, it, may, concern, a, group, of, par..."
287,9,"""To Teachers and Whom It May Concern: A propos...",To Teachers and Whom It May Concern: A proposa...,"[To, Teachers, and, Whom, It, May, Concern:, A...","[to, teachers, and, whom, it, may, concern, a,...","[to, teacher, and, whom, it, may, concern, a, ..."


In [131]:
# Character inventory over the tokens: each token list is glued into one string first.
unique_characters(df.Tokens.map(''.join))

{'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [162]:
# Pull up the full raw text of the row labelled 72 for a closer look.
df.loc[72,'Text']

'"Caught in the Net Hello. My name is Kim. Im an online-aholic. There. Ive said it. I guess Ive been addicted for quite some time now but I have just begun to realize it. My first interaction with the Net began when I was only 15 years old. My dad was the computer coordinator at our school so he wanted to try Internet access at home before installing it at school. We became America Online members in 1993. None of my friends had email back then. My relationship with email started like a Romeo/Juliet conspiracy. I experimented with the Net on the sly at night when the rest of my family was sleeping. Thats when I first created my own screen name. Although I was too young to drive the locked doors of adolescence were suddenly flung open before me. I could be social in the evening rather than hang out with my familyâ€”the typical family that every adolescent wants to escape from. Hanging out in chat rooms became a nightly ritual. I quickly found friends who would meet me there give advice f

In [143]:
# Rows whose raw text contains the literal substring '.com'
# (regex=False keeps the dot from acting as a wildcard).
df.loc[df['Text'].str.contains('.com', regex=False)]

Unnamed: 0,Grade,Text,Text Without Weirdness,Full Tokens,Tokens,Lemmas
72,9,"""Caught in the Net Hello. My name is Kim. Im a...",Caught in the Net Hello. My name is Kim. Im an...,"[Caught, in, the, Net, Hello., My, name, is, K...","[caught, in, the, net, hello, my, name, is, ki...","[caught, in, the, net, hello, my, name, be, ki..."
77,9,"""Internet Plagiarism Not all thieves lurk in d...",Internet Plagiarism Not all thieves lurk in da...,"[Internet, Plagiarism, Not, all, thieves, lurk...","[internet, plagiarism, not, all, thieves, lurk...","[internet, plagiarism, not, all, thief, lurk, ..."
