# Challenge
### Build your own NLP

In [10]:
from nltk.corpus import gutenberg, stopwords
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import re
import numpy as np
import pandas as pd

# gutenberg.fileids()

In [2]:
# Quick test after noticing odd apostrophe escapement behavior
print(r"'" == r'\'')
test = r"Emma doing just what she liked;\nhighly esteeming Miss Taylor's judgment, but directed chiefly by\nher own."
re.sub("n", '*', test)

False


"Emma doi*g just what she liked;\\*highly esteemi*g Miss Taylor's judgme*t, but directed chiefly by\\*her ow*."

In [39]:
moby = gutenberg.raw('melville-moby_dick.txt')
emma = gutenberg.raw('austen-emma.txt')

def cleanup(frame, source):
    # Removes the '\r\n' items across the text
    frame = re.sub('\r\n', ' ', frame)
    # The following two remove '--' which can have undesired effects
    frame = re.sub('--', '-', frame)
    frame = re.sub('--', '-', frame)
    # Removes '\n' across the text
    frame = re.sub(r"\\", '', frame)
    frame = re.sub('\n', ' ', frame)
    # The following two remove volume and chapter titles
    frame = re.sub(r'CHAPTER [A-Z]', '', frame)
    frame = re.sub(r'VOLUME [A-Z]', '', frame)
    # Tokenize by sentence with nltk
    frame_sents = sent_tokenize(frame)
    # Create empty list to populate with clean tokens
    frame_sents2 = []
    for sent in frame_sents:
        frame_sents2.append((re.sub('[^a-zA-Z\' -]+', '', sent).lower()))
    df = pd.DataFrame(frame_sents2, columns=['original_sentence'])
    df['source'] = source
    return df

moby_df = cleanup(moby, 'moby')
emma_df = cleanup(emma, 'emma')

def clean_title(moby_, emma_):
    moby_.loc[0]['original_sentence'] = 'etymology'
    emma_.loc[0]['original_sentence'] = 'Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to unite some of the best blessings of existence; and had lived nearly twenty-one years in the world with very little to distress or vex her.'
    return pd.concat([moby_, emma_], ignore_index=True)

df = clean_title(moby_df, emma_df)
# tfidf copy
df2 = df.copy()
# copy to improve
df3 = df.copy()
# Let's take a look across the dataframe
df.loc[np.random.randint(len(df), size=10)]

Unnamed: 0,original_sentence,source
1479,but unlike captain peleg-who cared not a rush ...,moby
17185,i am particularly glad to see and shake hands ...,emma
6424,on more accounts than one a pity it is that th...,moby
13996,miss hawkins perhaps wanted a home and thought...,emma
16396,who had been at pains to give harriet notions ...,emma
1080,he said no more but slowly waving a benedictio...,moby
10953,but ah,emma
12170,jane was quite longing to go to ireland from h...,emma
16982,knightley',emma
5458,the unflinching earnestness with which he decl...,moby


In [17]:
# Using bag of words approach on the entire corpus assumes there are no words unique to one collection
# Here we set vectorizers and fit them
vec = CountVectorizer(ngram_range=(1,2), stop_words='english', max_features=100)
tvec = TfidfVectorizer(ngram_range=(1,2), stop_words='english', max_features=100)
bow = vec.fit_transform(df['original_sentence'])
tfidf = tvec.fit_transform(df['original_sentence'])

# This series of steps 
for name in vec.get_feature_names():
    df[name] = 0
    
for i, j in zip(*bow.nonzero()):
    df.loc[df.index[i], vec.get_feature_names()[j]] = bow[i, j]

X = df[vec.get_feature_names()]
y = df['source']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

# These similar steps replicate the process for tfidf
for name in tvec.get_feature_names():
    df2[name] = 0

for i, j in zip(*tfidf.nonzero()):
    df2.loc[df2.index[i], tvec.get_feature_names()[j]] = tfidf[i, j]
    
X2 = df2[tvec.get_feature_names()]
y2 = df2['source']
    
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, stratify=y2)

In [18]:
clf_log = LogisticRegression()
clf_log.fit(X_train, y_train)
clf_log.score(X_test, y_test)



0.7903972232934825

In [19]:
clf_tree = DecisionTreeClassifier()
clf_tree.fit(X_train, y_train)
clf_tree.score(X_test, y_test)

0.7747782491322792

In [21]:
clf_log2 = LogisticRegression()
clf_log2.fit(X_train2, y_train2)
clf_log2.score(X_test2, y_test2)



0.7948322406478981

In [22]:
clf_tree2 = DecisionTreeClassifier()
clf_tree2.fit(X_train2, y_train2)
clf_tree2.score(X_test2, y_test2)

0.7767065175472426

In [41]:
tvec2 = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                        min_df=3, # only use words that appear at least three times
                        stop_words='english', 
                        lowercase=True, # this shouldn't be necessary, but still
                        use_idf=True, # we definitely want to use inverse document frequencies in our weighting
                        norm=u'l2', # Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                        smooth_idf=True, # Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                        ngram_range=(1,2) # This creates n-grams of 1 and 2
                       )

final = tvec2.fit_transform(df3['original_sentence'])

for name in tvec2.get_feature_names():
    df3[name] = 0

for i, j in zip(*tfidf.nonzero()):
    df2.loc[df2.index[i], tvec.get_feature_names()[j]] = tfidf[i, j]
    
X2 = df2[tvec.get_feature_names()]
y2 = df2['source']
    
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, stratify=y2)

In [42]:
for i, j in zip(*final.nonzero()):
    df3

<17285x11152 sparse matrix of type '<class 'numpy.float64'>'
	with 145752 stored elements in Compressed Sparse Row format>

In [24]:


svd = TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer(copy=False))
lsa.fit(X_train2, y_train2)
lsa.score(X_test2, y_test2)

ValueError: n_components must be < n_features; got 100 >= 100