In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from nltk.corpus import gutenberg, stopwords
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
%matplotlib inline

---

Choosing which text to build an NLP from

In [3]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [6]:
#Choosing two similarly themed literary works
shakes = gutenberg.raw('shakespeare-macbeth.txt')
milton = gutenberg.raw('milton-paradise.txt')

---

Cleaning and Parsing the Data

In [7]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    text = re.sub(r'CHAPTER \d+', '', text)
    text = re.sub("\\n\\n.*?\\n\\n", '', text)
  
    # Get rid of extra whitespace.
    text = ' '.join(text.split())

    
    return text

In [8]:
# Clean both documents
shakes = text_cleaner(shakes)
milton = text_cleaner(milton)

In [9]:
# run spacy and analyze the documents
nlp = spacy.load('en')

# Clean Caesar first
shakes_doc = nlp(shakes)
milton_doc = nlp(milton)

In [10]:
# Group into sentences.
shakes_sents = [[sent, "Shakes"] for sent in shakes_doc.sents]
milton_sents = [[sent, "Milton"] for sent in milton_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(shakes_sents + milton_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Thunder, and, Lightning, .)",Shakes
1,"(Enter, three, Witches, .)",Shakes
2,"(1, .)",Shakes
3,"(When, shall, we, three, meet, againe, ?)",Shakes
4,"(In, Thunder, ,, Lightning, ,, or, in, Raine, ...",Shakes


In [11]:
len(sentences)

5264

In [13]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
shakeswords = bag_of_words(shakes_doc)
miltonwords = bag_of_words(milton_doc)

# Combine bags to create a set of unique words.
common_words = set(shakeswords + miltonwords)

# How many words we got?
len(common_words)

3511

In [19]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000


Unnamed: 0,likeness,lauish,transform,donal,Strange,Morrow,speech,reflection,Menteth,shrine,...,receiu'd,Liege,Heere,journey,binde,Charme,ouerthrowne,plague,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Thunder, and, Lightning, .)",Shakes
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Enter, three, Witches, .)",Shakes
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(1, .)",Shakes
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(When, shall, we, three, meet, againe, ?)",Shakes
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(In, Thunder, ,, Lightning, ,, or, in, Raine, ...",Shakes


In [21]:
# random forest fitting 
rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))



Training set score: 0.974350854971501

Test set score: 0.869420702754036


In [22]:
# Overfitting has occurred but test set still did okay.
# Lets cross validate.

cross_val_score(rfc, X_train, y_train, cv=5)

array([0.83886256, 0.84335443, 0.87955626, 0.86846276, 0.84627575])

In [23]:
# Gradient Boosting.
clf = ensemble.GradientBoostingClassifier()

# Fit the model
fit_clf = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.8055731475617479

Test set score: 0.7934472934472935


In [24]:
# Scores are more consistent. Overfitting hasn't seem to have occurred.
cross_val_score(clf, X_train, y_train, cv=5)

array([0.75671406, 0.78006329, 0.79397781, 0.78763867, 0.78129952])

In [25]:
# Logistic Regression Model
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))



(3158, 3511) (3158,)
Training set score: 0.9648511716276124

Test set score: 0.9088319088319088


In [26]:
# Looks like overfitting has occurred. I will attempt to fix that later. 
cross_val_score(lr, X_train, y_train, cv=5)



array([0.90363349, 0.90348101, 0.90015848, 0.91600634, 0.8858954 ])

---

Improving Gradient Boosting

In [27]:
from sklearn.model_selection import GridSearchCV
# Use GS-CV in order to find the optimal parameters.
clf_parameters = {
             'n_estimators':[100,200,500,1000],
              'max_depth':[2,4,6,8],
              'max_features':[2,4,6,8]
}

clf_grid = GridSearchCV(clf, clf_parameters, cv=5, verbose=1, n_jobs=-1)

#Fit the logistic regression 
clf_grid.fit(X_train, y_train)

#return best parameters and best score
print('Best parameters:')
print(clf_grid.best_params_)
print('Best Score:')
print(clf_grid.best_score_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  5.2min finished


Best parameters:
{'max_depth': 8, 'max_features': 2, 'n_estimators': 1000}
Best Score:
0.9103863204559848


In [32]:
#Improved the score by increasing iterations and features.
clf = ensemble.GradientBoostingClassifier(n_estimators=1000,
                                         max_depth=8, max_features=4)

cross_val_score(clf, X_train, y_train, cv=5)

array([0.92259084, 0.88607595, 0.91283677, 0.9207607 , 0.90649762])

In [29]:
# Scores are consistent but lets try it on a test set.
cross_val_score(clf, X_test, y_test, cv=5)

array([0.9028436 , 0.87914692, 0.91232227, 0.9047619 , 0.88809524])

---

TF-IDF

In [39]:
# Create the tf-idf function
def document_freq(data, sentences, common_words, doc_names, doc_words):
    
    # initialize df
    df = pd.DataFrame(columns = common_words)
    df.iloc[:, 0] = [0, 0, 0, 0, 0]
    df.loc[:, common_words] = 0
    df.rename(index={0:'df', 1:'cf', 2:'idf', 3:'Shakes', 4:'Milton'}, inplace=True)
    
    for word in common_words:
        # find document frequency & collection frequency
        df.loc['df', word] = data[data[word] > 0][word].count()
        df.loc['cf', word] = data.loc[:, word].sum()
        
        # find idf
        df.loc['idf', word] = np.log2(len(sentences)/df.loc['df', word])
        
    # assign the idf value to the documents
    for word in df.columns:
        for i in range(len(doc_names)):
            if word in doc_words[i]:
                df.loc[doc_names[i], word] = df.loc['idf', word]
        
    return df

In [40]:
# Create arrays to identify and hold my words.
doc_names = ['Shakes', 'Milton']
doc_words = [shakeswords, miltonwords]
tf_idf = document_freq(word_counts, sentences, common_words, doc_names, doc_words)
tf_idf

Unnamed: 0,likeness,lauish,transform,donal,Strange,Morrow,speech,reflection,Menteth,shrine,...,audience,interprete,receiu'd,Liege,Heere,journey,binde,Charme,ouerthrowne,plague
df,5.0,1.0,4.0,2.0,1.0,1.0,15.0,3.0,2.0,5.0,...,8.0,1.0,2.0,3.0,3.0,12.0,1.0,5.0,1.0,6.0
cf,6.0,1.0,4.0,2.0,1.0,1.0,15.0,3.0,2.0,5.0,...,8.0,1.0,2.0,3.0,3.0,12.0,1.0,5.0,1.0,6.0
idf,10.040016,12.361944,10.361944,11.361944,12.361944,12.361944,8.455053,10.776981,11.361944,10.040016,...,9.361944,12.361944,11.361944,10.776981,10.776981,8.776981,12.361944,10.040016,12.361944,9.776981
Shakes,0.0,12.361944,0.0,11.361944,12.361944,12.361944,8.455053,10.776981,11.361944,0.0,...,0.0,12.361944,11.361944,10.776981,10.776981,0.0,12.361944,10.040016,12.361944,9.776981
Milton,10.040016,0.0,10.361944,0.0,0.0,0.0,8.455053,0.0,0.0,10.040016,...,9.361944,0.0,0.0,0.0,0.0,8.776981,0.0,0.0,0.0,9.776981


In [41]:
# Let's make it so that the rows become the columns. 
tf_idf = tf_idf.T
tf_idf.head()

Unnamed: 0,df,cf,idf,Shakes,Milton
likeness,5.0,6.0,10.040016,0.0,10.040016
lauish,1.0,1.0,12.361944,12.361944,0.0
transform,4.0,4.0,10.361944,0.0,10.361944
donal,2.0,2.0,11.361944,11.361944,0.0
Strange,1.0,1.0,12.361944,12.361944,0.0


In [42]:

# Set up a threshold to count and see which word belongs where.
threshold = 5
tf_idf['Shakes_threshold'] = 0
tf_idf['Milton_threshold'] = 0

tf_idf['Shakes_threshold'] = np.where(tf_idf['Shakes'] > threshold, 1, 0)
tf_idf['Milton_threshold'] = np.where(tf_idf['Milton'] > threshold, 1, 0)

tf_idf.head()

Unnamed: 0,df,cf,idf,Shakes,Milton,Shakes_threshold,Milton_threshold
likeness,5.0,6.0,10.040016,0.0,10.040016,0,1
lauish,1.0,1.0,12.361944,12.361944,0.0,1,0
transform,4.0,4.0,10.361944,0.0,10.361944,0,1
donal,2.0,2.0,11.361944,11.361944,0.0,1,0
Strange,1.0,1.0,12.361944,12.361944,0.0,1,0


In [43]:

# Set up a way to determine which word goes into which group.
# default with both
tf_idf['source'] = 'both'

# Create a method
def determine_who(df):
    # Create a loop that iterates through each row and determines where it goes.
    for i in range(len(df)):
        # make a counter
        flag = 0
        source = 'Both'
        
        if (df.iloc[i, 5] == 1):
            flag = 1
            source = 'Shakes'
           
        if (df.iloc[i, 6] == 1):
            if (flag == 1):
                continue
            flag = 1
            source = 'Milton'
            
        df.iloc[i, 7] = source
        
    return df

In [44]:
# Was testing the output and accidentally created another row. 
tf_idf_test = determine_who(tf_idf)

tf_idf_test.head(30)

Unnamed: 0,df,cf,idf,Shakes,Milton,Shakes_threshold,Milton_threshold,source
likeness,5.0,6.0,10.040016,0.0,10.040016,0,1,Milton
lauish,1.0,1.0,12.361944,12.361944,0.0,1,0,Shakes
transform,4.0,4.0,10.361944,0.0,10.361944,0,1,Milton
donal,2.0,2.0,11.361944,11.361944,0.0,1,0,Shakes
Strange,1.0,1.0,12.361944,12.361944,0.0,1,0,Shakes
Morrow,1.0,1.0,12.361944,12.361944,0.0,1,0,Shakes
speech,15.0,15.0,8.455053,8.455053,8.455053,1,1,both
reflection,3.0,3.0,10.776981,10.776981,0.0,1,0,Shakes
Menteth,2.0,2.0,11.361944,11.361944,0.0,1,0,Shakes
shrine,5.0,5.0,10.040016,0.0,10.040016,0,1,Milton


In [47]:

# Finally time to test the models! 
# Drop everything except for the tf-idf values
rfc = ensemble.RandomForestClassifier()
Y2 = tf_idf_test['source']
X2 = tf_idf_test.drop(['source', 'Shakes_threshold',
                'Milton_threshold' ,'Shakes', 'Milton'], axis=1)

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, 
                                                    Y2,
                                                    test_size=0.3)
train = rfc.fit(X2_train, y2_train)

print('Training set score:', rfc.score(X2_train, y2_train))
print('\nTest set score:', rfc.score(X2_test, y2_test))



Training set score: 0.820919820919821

Test set score: 0.7912713472485768


In [48]:
# Cross validate the rfc model. 
cross_val_score(rfc, X2_train, y2_train, cv=5)

array([0.76923077, 0.75609756, 0.79837067, 0.79387755, 0.79387755])

In [49]:
# logistic regression fitting
lr = LogisticRegression()
train = lr.fit(X2_train, y2_train)
print(X2_train.shape, y2_train.shape)
print('Training set score:', lr.score(X2_train, y2_train))
print('\nTest set score:', lr.score(X2_test, y2_test))



(2457, 3) (2457,)
Training set score: 0.7431827431827431

Test set score: 0.7504743833017078


In [50]:
cross_val_score(lr, X2_train, y2_train, cv=5)



array([0.72672065, 0.73577236, 0.75560081, 0.75102041, 0.74489796])

In [51]:
clf2 = ensemble.GradientBoostingClassifier()

# Gradient Boosting Model.
train = clf2.fit(X2_train, y2_train)

print('Training set score:', clf2.score(X2_train, y2_train))
print('\nTest set score:', clf2.score(X2_test, y2_test))

Training set score: 0.8156288156288156

Test set score: 0.7922201138519924


In [52]:
cross_val_score(clf2, X2_train, y2_train, cv=5)

array([0.77530364, 0.76626016, 0.80855397, 0.80612245, 0.79183673])

In [53]:
# Lets improve the GB model since it will be faster than the random forest. 
# Use GS-CV in order to find the optimal parameters.
clf_parameters = {
             'n_estimators':[100,200,500,1000],
              'max_depth':[2,4,6,8],
             'max_features':['auto']
}

clf_grid = GridSearchCV(clf, clf_parameters, cv=5, verbose=1, n_jobs=-1)

#Fit the logistic regression 
clf_grid.fit(X2_train, y2_train)

#return best parameters and best score
print('Best parameters:')
print(clf_grid.best_params_)
print('Best Score:')
print(clf_grid.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   47.9s finished


Best parameters:
{'max_depth': 4, 'max_features': 'auto', 'n_estimators': 100}
Best Score:
0.7891737891737892
