# Challenge: Build your own NLP Model

For this challenge, you will need to choose a corpus of data from nltk or another source that includes categories you can predict and create an analysis pipeline that includes the following steps:

* Data cleaning / processing / language parsing
* Create features using two different NLP methods: For example, BoW vs tf-idf.
* Use the features to fit supervised learning models for each feature set to predict the category outcomes.
* Assess your models using cross-validation and determine whether one model performed better.
* Pick one of the models and try to increase accuracy by at least 5 percentage points.

Write up your report in a Jupyter notebook. Be sure to explicitly justify the choices you make throughout, and submit it below.

In [57]:
import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn import ensemble
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import nltk
from nltk.corpus import gutenberg, stopwords
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline

In [2]:
# List the file ids of the texts available in NLTK's Gutenberg corpus.
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [3]:
# Load Hamlet and Emma from the Gutenberg corpus via gutenberg.raw().
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
emma = gutenberg.raw('austen-emma.txt')

## Data Cleaning

In [4]:
# Collapse every run of whitespace (newlines included) into a single space.
hamlet = ' '.join(hamlet.split())
emma = ' '.join(emma.split())


In [5]:
# Utility function for standard text cleaning.

def text_cleaner(text):
    """Strip markup noise from a corpus string.

    Steps, in order:

    1. Replace every double dash ``--`` with a space.
    2. Delete square-bracketed spans such as ``[Emma by Jane Austen 1816]``
       (a bracket pair split across a line break is left alone).
    3. Collapse all runs of whitespace into single spaces.
    4. Remove ``Chapter <digits>`` and ``CHAPTER <digits>`` headings.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.  Spaces that surrounded a removed chapter heading
        are kept, and headings numbered with Roman numerals (``CHAPTER I``)
        are not matched by the digit-based patterns.
    """
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = text.replace('--', ' ')

    # Raw string: '\[' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = ' '.join(text.split())

    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+', '', text)
    text = re.sub(r'CHAPTER \d+', '', text)

    # A former newline-delimited heading pattern was dropped here: whitespace
    # has already been collapsed above, so no newline is left to match.
    return text

In [6]:
# Clean the texts.

hamlet = text_cleaner(hamlet)
emma = text_cleaner(emma)

# Keep only the first 1/40th of each cleaned text (by character count) and
# run the cleaner over that slice once more. The cut is made at a character
# offset, so it can fall in the middle of a word.
hamlet = text_cleaner(hamlet[:int(len(hamlet)/40)])
emma = text_cleaner(emma[:int(len(emma)/40)])

In [7]:
# Apply spaCy: parse both texts with the small English pipeline.
# The model is loaded by its package name because the 'en' shortcut link
# is no longer supported in spaCy 3.

nlp = spacy.load('en_core_web_sm')
hamlet_doc = nlp(hamlet)
emma_doc = nlp(emma)

In [8]:
# Group into sentences, pairing each sentence with its author label.

hamlet_sents = [[sent, "Shakespeare"] for sent in hamlet_doc.sents]
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]

# Combine the sentences from the two texts into one data frame.
# Column 0 holds the sentence, column 1 the author label.

sentences = pd.DataFrame(hamlet_sents + emma_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Actus, Primus, .)",Shakespeare
1,"(Scoena, Prima, .)",Shakespeare
2,"(Enter, Barnardo, and, Francisco, two, Centine...",Shakespeare
3,"(Barnardo, .)",Shakespeare
4,"(Who, 's, there, ?)",Shakespeare


# Feature Creation

In [9]:
# Utility function to create a list of the most common words
# (the 2000 most common by default).
def bag_of_words(text, max_words=2000):
    """Return the most frequent lemmas in ``text``, most common first.

    Args:
        text: Iterable of tokens exposing ``lemma_``, ``is_punct`` and
            ``is_stop`` attributes (for example a parsed spaCy document).
        max_words: Maximum number of distinct lemmas to return.

    Returns:
        A list of unique lemmas ordered by descending frequency, with
        punctuation and stop words excluded.
    """
    # Filter out punctuation and stop words.
    lemmas = (token.lemma_
              for token in text
              if not (token.is_punct or token.is_stop))

    # Return the most common words.
    return [lemma for lemma, _ in Counter(lemmas).most_common(max_words)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    """Build a bag-of-words count table, one row per sentence.

    Args:
        sentences: DataFrame whose column ``0`` holds tokenized sentences
            (iterables of tokens exposing ``lemma_``, ``is_punct`` and
            ``is_stop``) and whose column ``1`` holds the source label.
        common_words: Iterable of lemmas to count (a list or a set).

    Returns:
        A DataFrame sharing the index of ``sentences``, with one integer
        count column per word in ``common_words`` followed by the
        ``text_sentence`` and ``text_source`` columns.
    """
    # Freeze the vocabulary into an ordered, duplicate-free list: a set has
    # no defined column order and pandas 2.x rejects sets as indexers.
    vocab = list(dict.fromkeys(common_words))
    column_of = {word: j for j, word in enumerate(vocab)}

    # Count into a plain integer array; updating one DataFrame cell at a
    # time through .loc is far slower and leaves object-typed columns.
    counts = np.zeros((len(sentences), len(vocab)), dtype=int)

    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(sentences[0]):
        for token in sentence:
            # Skip punctuation and stop words.
            if token.is_punct or token.is_stop:
                continue
            j = column_of.get(token.lemma_)
            # Lemmas outside the vocabulary are ignored.
            if j is not None:
                counts[i, j] += 1

        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))

    # Reuse the caller's index so the two assignments below align row by row.
    df = pd.DataFrame(counts, columns=vocab, index=sentences.index)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    return df

# Set up the bags: the most frequent lemmas of each text
# (punctuation and stop words excluded).
hamletwords = bag_of_words(hamlet_doc)
emmawords = bag_of_words(emma_doc)

# Combine bags to create a set of unique words.
common_words = set(hamletwords + emmawords)

In [10]:
# Create our data frame with features. This can take a while to run.
# bow_features prints a progress line every 50 rows.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300


Unnamed: 0,stand,yond,disagreeable,sorrow,married,introduce,laugh,angry,hath,shame,...,Leige,appear'd,bargain,James,look,volume,depressed,unexceptionable,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Actus, Primus, .)",Shakespeare
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Scoena, Prima, .)",Shakespeare
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Enter, Barnardo, and, Francisco, two, Centine...",Shakespeare
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Barnardo, .)",Shakespeare
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Who, 's, there, ?)",Shakespeare


# Trying out BOW

In [11]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()

# Target: the author label. Features: every word-count column.
Y = word_counts['text_source']
# Use the `columns=` keyword: passing the axis positionally to
# DataFrame.drop is not accepted by pandas 2.x.
X = np.array(word_counts.drop(columns=['text_sentence', 'text_source']))

# Hold out 40% of the sentences for testing; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9897959183673469

Test set score: 0.8787878787878788




In [12]:
# Evaluate the random forest with 5-fold cross-validation on the training set.

cross_val_score(rfc, X_train, y_train, cv=5)

array([0.75      , 0.9       , 0.8       , 0.86842105, 0.81578947])

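The random forest's test-set accuracy is about 88% (0.879); its 5-fold cross-validation scores on the training set range from 75% to 90%.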

# Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

# Fit an L2-regularized logistic regression classifier on the same
# bag-of-words train/test split.
lr = LogisticRegression(penalty='l2') # No need to specify l2 as it's the default. But we put it for demonstration.
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(196, 900) (196,)
Training set score: 0.9591836734693877

Test set score: 0.8560606060606061




In [14]:
# Evaluate the logistic regression with 5-fold cross-validation on the training set.

cross_val_score(lr, X_train, y_train, cv=5)



array([0.725     , 0.825     , 0.75      , 0.84210526, 0.81578947])

Logistic regression performed slightly worse than the Random Forest Classifier, with a test-set accuracy of 85.6% versus 87.9%.

# BoW with Gradient Boosting

In [15]:
# Fit a gradient boosting classifier with default hyperparameters on the
# same bag-of-words train/test split.
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.9948979591836735

Test set score: 0.8712121212121212


In [16]:
# Evaluate the gradient boosting model with 5-fold cross-validation on the training set.

cross_val_score(clf, X_train, y_train, cv=5)

array([0.75      , 0.9       , 0.8       , 0.86842105, 0.84210526])

Gradient Boosting scored about the same as Random Forest on the test set (87.1% versus 87.9%). Let's see if we can increase that score by using GridSearchCV to find the best parameters.

# Finding Best Parameters

In [17]:
from sklearn.model_selection import GridSearchCV
# Hyperparameter grid for the gradient boosting classifier:
# 5 * 6 * 6 = 180 candidate combinations.
clf_parameters = {
             'n_estimators':[100, 200, 500, 1000, 2000],
              'max_depth':[2, 4, 6, 8, 10, 12],
              'max_features':[2, 4, 6, 8, 10, 12]
}

# 5-fold cross-validated search over the grid, using all available cores.
clf_grid = GridSearchCV(clf, clf_parameters, cv=5, verbose=1, n_jobs=-1)


# The search only ever sees the training split.
clf_grid.fit(X_train, y_train)

print('Best parameters:')
print(clf_grid.best_params_)
print('Best Score:')
print(clf_grid.best_score_)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  1.7min finished


Best parameters:
{'max_depth': 10, 'max_features': 2, 'n_estimators': 2000}
Best Score:
0.8877551020408163


In [18]:
# Rebuild the classifier with the best parameters reported by the grid
# search and cross-validate it on the training set.
clf = ensemble.GradientBoostingClassifier(n_estimators=2000,
                                         max_depth=10, max_features=2)

cross_val_score(clf, X_train, y_train, cv=5)

array([0.775     , 0.95      , 0.925     , 0.89473684, 0.86842105])

In [19]:
# Scores are consistent across folds. As a further check, cross-validate the
# same configuration on the test split.
# NOTE(review): cross_val_score refits the estimator on folds of the data it
# is given, so this trains and scores within the test split; it is not a
# held-out evaluation of the model fitted on the training split.
cross_val_score(clf, X_test, y_test, cv=5)

array([0.88888889, 0.88888889, 0.88888889, 0.84615385, 0.84      ])

In [20]:
# Fit the tuned classifier on the training split and score it on both the
# training split and the held-out test split.
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.9948979591836735

Test set score: 0.9166666666666666


Using GridSearchCV, I was able to increase the gradient boosting model's test-set accuracy from 87.1% to 91.7%, a gain of about 4.5 percentage points.


# tfidf

In [21]:

# Reload Emma split into paragraphs.
# NOTE: this rebinds `emma`, which previously held the cleaned text string.
emma=gutenberg.paras('austen-emma.txt')
#processing
emma_paras=[]
for paragraph in emma:
    # NOTE(review): only element 0 of each paragraph is kept. Presumably that
    # is the paragraph's first sentence (a list of word tokens), so any later
    # sentences are dropped; confirm against the NLTK paras() documentation.
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Joining the kept tokens into a single string and adding it to the list of strings.
    emma_paras.append(' '.join(para))

print(emma_paras[0:4])

['[ Emma by Jane Austen 1816 ]', 'VOLUME I', 'CHAPTER I', 'Emma Woodhouse , handsome , clever , and rich , with a comfortable home and happy disposition , seemed to unite some of the best blessings of existence ; and had lived nearly twenty - one years in the world with very little to distress or vex her .']


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the raw paragraph strings; the same test_size and random_state are
# reused for the tf-idf matrix below so that row k of either training split
# refers to the same paragraph.
X_train, X_test = train_test_split(emma_paras, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear in at least two paragraphs
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case so capitalized and lower-case forms of a word share one feature
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Scales each paragraph vector to unit length so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
# NOTE(review): the vectorizer is fitted on all paragraphs before the split,
# so the idf weights also reflect the test paragraphs.
emma_paras_tfidf=vectorizer.fit_transform(emma_paras)
print("Number of features: %d" % emma_paras_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(emma_paras_tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer
# versions need get_feature_names_out() instead.
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Only non-zero entries are stored, so each dictionary lists just the retained
#feature words of that paragraph; words removed by stop_words, min_df or
#max_df do not appear at all.
print('Original sentence:', X_train[5])
print('Tf_idf vector:', tfidf_bypara[5])

Number of features: 1948
Original sentence: A very few minutes more , however , completed the present trial .
Tf_idf vector: {'minutes': 0.7127450310382584, 'present': 0.701423210857947}


Number of features: 70
Original sentence: Qu .
Tf_idf vector: {'qu': 1.0}


# Trying out TFIDF

In [186]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
 

In [29]:
# Combined vocabulary: every lemma that appears in either text's bag.
wordSet = set(hamletwords).union(set(emmawords))

In [35]:
# One count dictionary per text (A: Hamlet, B: Emma), keyed by the shared
# vocabulary and initialised to zero.
WordDictA = dict.fromkeys(wordSet, 0)
WordDictB= dict.fromkeys(wordSet, 0)

In [37]:
# NOTE(review): hamletwords and emmawords come from bag_of_words, which
# returns each lemma only once, so every count set here is exactly 1. These
# dictionaries record presence/absence per text, not true term counts.
for word in hamletwords:
    WordDictA[word] += 1

for word in emmawords:
    WordDictB[word]+=1

In [40]:
pd.DataFrame([WordDictA, WordDictB])

Unnamed: 0,Actus,Ambitious,Apparition,Armour,Bar,Barn,Barnardo,Bell,Brazon,Broadway,...,word,work,world,worthy,wright,year,yond,you_.,young,youth
0,1,1,1,1,1,1,1,1,1,0,...,0,1,1,0,1,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,1,1,1,1,0,1,0,1,1,1


In [42]:
def computeTF(wordDict, bow):
    """Compute term frequencies: each word's count divided by ``len(bow)``.

    Args:
        wordDict: Mapping of word -> count.
        bow: Sized collection of words; its length is the denominator.

    Returns:
        A dict mapping every word in ``wordDict`` to its frequency as a
        float.  When ``bow`` is empty every frequency is 0.0 rather than
        raising ZeroDivisionError.
    """
    bowCount = float(len(bow))
    if not bowCount:
        # Nothing was observed, so no word has a meaningful frequency.
        return {word: 0.0 for word in wordDict}
    return {word: count / bowCount for word, count in wordDict.items()}

In [44]:
# Term frequencies per text; the denominator is the length of each text's
# bag (the list returned by bag_of_words).
tfA = computeTF(WordDictA, hamletwords)
tfB = computeTF(WordDictB, emmawords)

In [45]:
def computeIDF(documents):
    """Compute inverse document frequencies over word-count dictionaries.

    Args:
        documents: Sequence of dicts mapping word -> count, one per document.

    Returns:
        A dict mapping every word seen in any document to
        ``log(N / df)``, where ``N`` is the number of documents and ``df``
        the number of documents in which the word's count is positive.
        A word with a positive count in no document gets 0.0 instead of
        raising ZeroDivisionError.  An empty ``documents`` yields ``{}``.
    """
    import math
    N = len(documents)

    # Document frequency per word.  Collecting keys from every document
    # (not only the first) avoids a KeyError when vocabularies differ.
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)

    return {
        word: math.log(N / float(df)) if df else 0.0
        for word, df in docFreq.items()
    }

In [46]:
idfs = computeIDF([WordDictA, WordDictB])

In [47]:
def computeTFIDF(tfBagOfWords, idfs):
    """Multiply each term frequency by the matching inverse document frequency.

    Args:
        tfBagOfWords: Mapping of word -> term frequency.
        idfs: Mapping of word -> inverse document frequency; it must contain
            every word of ``tfBagOfWords``.

    Returns:
        A new dict mapping each word to ``tf * idf``.
    """
    return {term: tf * idfs[term] for term, tf in tfBagOfWords.items()}

In [48]:
# Weight each text's term frequencies by the shared idf values and stack the
# two results into a two-row frame (row 0: Hamlet, row 1: Emma).
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
df = pd.DataFrame([tfidfA, tfidfB])

In [49]:
df

Unnamed: 0,Actus,Ambitious,Apparition,Armour,Bar,Barn,Barnardo,Bell,Brazon,Broadway,...,word,work,world,worthy,wright,year,yond,you_.,young,youth
0,0.002762,0.002762,0.002762,0.002762,0.002762,0.002762,0.002762,0.002762,0.002762,0.0,...,0.0,0.0,0.0,0.0,0.002762,0.0,0.002762,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001012,...,0.001012,0.0,0.0,0.001012,0.0,0.001012,0.0,0.001012,0.001012,0.001012


In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): hamletwords is a list of individual lemmas, so the vectorizer
# treats every lemma as its own one-word document rather than seeing
# sentences or paragraphs.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(hamletwords)

# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer
# versions need get_feature_names_out() instead.
print(vectorizer.get_feature_names())
print(X.shape)

['actus', 'againe', 'ambitious', 'angry', 'answer', 'apparition', 'appear', 'appeare', 'approue', 'armour', 'art', 'assaile', 'auouch', 'away', 'bar', 'barn', 'barnardo', 'beat', 'bed', 'beleefe', 'beleeue', 'bell', 'bid', 'bitter', 'boade', 'brazon', 'breake', 'burn', 'bury', 'cannon', 'carefully', 'cast', 'centinels', 'charge', 'cold', 'combat', 'combate', 'come', 'compact', 'course', 'dane', 'dar', 'day', 'dayly', 'dead', 'denmarke', 'diuide', 'doth', 'downe', 'dreaded', 'eare', 'emulate', 'enter', 'erruption', 'esteem', 'euen', 'exit', 'eye', 'faire', 'fantasie', 'farwel', 'fear', 'figure', 'forfeite', 'forme', 'forraigne', 'fortify', 'fortinbras', 'fra', 'fran', 'francisco', 'friend', 'frown', 'ghost', 'giue', 'go', 'god', 'good', 'goodnight', 'grosse', 'ground', 'guard', 'ha', 'hamlet', 'harrow', 'hast', 'hath', 'haue', 'heare', 'heart', 'heauen', 'heraldrie', 'hold', 'holla', 'honest', 'hor', 'hora', 'horatio', 'houre', 'ice', 'illume', 'image', 'implement', 'impresse', 'informe