# Challenge
### Build your own NLP

In [1]:
from nltk.corpus import gutenberg, stopwords
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import spacy, re
import numpy as np
import pandas as pd

# gutenberg.fileids()

In [2]:
# Quick test after noticing odd apostrophe escapement behavior
print(r"'" == r'\'')
test = r"Emma doing just what she liked;\nhighly esteeming Miss Taylor's judgment, but directed chiefly by\nher own."
re.sub("n", '*', test)

False


"Emma doi*g just what she liked;\\*highly esteemi*g Miss Taylor's judgme*t, but directed chiefly by\\*her ow*."

In [91]:
moby = gutenberg.raw('melville-moby_dick.txt')
emma = gutenberg.raw('austen-emma.txt')

def cleanup(frame):
    # Removes the '\r\n' items across the text
    frame = re.sub('\r\n', ' ', frame)
    # The following two remove '--' which can have undesired effects
    frame = re.sub('--', '-', frame)
    frame = re.sub('--', '-', frame)
    # Removes '\n' across the text
    frame = re.sub(r"\\", '', frame)
    frame = re.sub('\n', ' ', frame)
    # The following two remove volume and chapter titles
    frame = re.sub(r'CHAPTER [A-Z]', '', frame)
    frame = re.sub(r'VOLUME [A-Z]', '', frame)
    return frame

def punct(sents, sents2):
    # Remove punctuation function
    for sent in sents:
        # This handles punctuation filtering and lowercase conversion
        sents2.append(re.sub('[^a-zA-Z\' -]+', '', sent).lower())
        
moby2 = cleanup(moby)
emma2 = cleanup(emma)

moby_sents = sent_tokenize(moby2)
emma_sents = sent_tokenize(emma2)
moby_sents[0] = 'etymology'
emma_sents[0] = 'Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to unite some of the best blessings of existence; and had lived nearly twenty-one years in the world with very little to distress or vex her.'

moby_sents2 = []
emma_sents2 = []

punct(moby_sents, moby_sents2)
punct(emma_sents, emma_sents2)

moby_df = pd.DataFrame(moby_sents2, columns=['original sentence'])
moby_df['source'] = 'moby_dick'
emma_df = pd.DataFrame(emma_sents2, columns=['original sentence'])
emma_df['source'] = 'emma'
df = pd.concat([moby_df, emma_df], ignore_index=True)
df.loc[np.random.randint(len(df), size=10)]

Unnamed: 0,original sentence,source
9086,so ahab's proceedings in this matter were not ...,moby_dick
7756,moreover the ship's forge was ordered to be ho...,moby_dick
2532,to be short then a whale is a spouting fish wi...,moby_dick
1530,the seven hundred and seventy-seventh lay capt...,moby_dick
2159,cetus is a constellation in the south,moby_dick
1242,i thought the bumpkin's hour of doom was come,moby_dick
1537,ye insult me man past all natural bearing ye i...,moby_dick
4646,after a stiff pull their harpooneer got fast a...,moby_dick
9474,the ferrule has not stood sir said the carpent...,moby_dick
388,much was i disappointed upon learning that the...,moby_dick


In [92]:
X = df['original sentence']
y = df['source']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

print(df.source.value_counts())
print('\n', df.loc[X_train.index]['source'].value_counts())

moby_dick    9846
emma         7439
Name: source, dtype: int64

 moby_dick    6892
emma         5207
Name: source, dtype: int64


In [93]:
vec = CountVectorizer(ngram_range=(1,2), stop_words='english', max_features=10)
tvec = TfidfVectorizer(ngram_range=(1,2), stop_words='english', max_features=10)
    
bow = vec.fit_transform(X_train)
tfidf = tvec.fit_transform(X_train)

for name in vec.get_feature_names():
    df[name] = 0
    
bow_csr = bow.tocsr()
tf_csr = tfidf.tocsr()

df.head(3)

Unnamed: 0,original sentence,source,did,emma,like,little,man,mr,mrs,said,time,whale
0,etymology,moby_dick,0,0,0,0,0,0,0,0,0,0
1,supplied by a late consumptive usher to a gram...,moby_dick,0,0,0,0,0,0,0,0,0,0
2,he was ever dusting his old lexicons and gramm...,moby_dick,0,0,0,0,0,0,0,0,0,0


In [104]:
# Single indexing and slicing are returning different values
print(X_train[0], '\n\n\n', X_train[:1])
# Using X_train's index should help to iterate
print('\n\n\n', df.loc[X_train.index[0], 'original sentence'])

etymology 


 13013    and though the consequent shock and alarm was ...
Name: original sentence, dtype: object



 and though the consequent shock and alarm was very great and much more durable-indeed i believe it was half an hour before any of us were comfortable again- yet that was too general a sensation for any thing of peculiar anxiety to be observable


In [105]:
for i, j in zip(*bow.nonzero()):
    df.loc[X_train.index[i], vec.get_feature_names()[j]] = bow_csr[i, j]
    
df.head(10)

Unnamed: 0,original sentence,source,did,emma,like,little,man,mr,mrs,said,time,whale
0,etymology,moby_dick,0,0,0,0,0,0,0,0,0,0
1,supplied by a late consumptive usher to a gram...,moby_dick,0,0,0,0,0,0,0,0,0,0
2,he was ever dusting his old lexicons and gramm...,moby_dick,0,0,0,0,0,0,0,0,0,0
3,he loved to dust his old grammars it somehow m...,moby_dick,0,0,0,0,0,0,0,0,0,0
4,while you take in hand to school others and to...,moby_dick,0,0,0,0,0,0,0,0,0,1
