In [98]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy
from collections import Counter

from sklearn import ensemble
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [2]:
from nltk.corpus import gutenberg, stopwords

In [42]:
# Utility function to clean text.
def text_cleaner(text, pattern1, pattern2):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(pattern1,'',text)
    text = re.sub(pattern2,'',text)
    
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    
    text = text.lower()
    
    return text

In [43]:
caesar = gutenberg.raw('shakespeare-caesar.txt')
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
macbeth = gutenberg.raw('shakespeare-macbeth.txt')

caesar = text_cleaner(caesar, r'Actus .*', r'Scoena .*')
hamlet = text_cleaner(hamlet, r'Actus .*', r'Scoena .*')
macbeth = text_cleaner(macbeth, r'Actus .*', r'Scoena .*')

In [24]:
nlp = spacy.load('en')

In [45]:
caesar_doc = nlp(caesar)

In [46]:
hamlet_doc = nlp(hamlet)

In [47]:
macbeth_doc = nlp(macbeth)

In [37]:
# Utility function to calculate how frequently words appear in the text.
def word_frequencies(text, include_stop=False):
    
    # Build a list of words.
    # Strip out punctuation and, optionally, stop words.
    words = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            words.append(token.text)
            
    # Build and return a Counter object containing word counts.
    return Counter(words)

In [48]:
caesar_freq = word_frequencies(caesar_doc, include_stop=False).most_common(10)
print('Caesar:', caesar_freq)

Caesar: [('caesar', 189), ('brutus', 161), ('bru', 153), ('haue', 147), ('shall', 125), ('thou', 115), ('cassi', 107), ('cassius', 85), ('antony', 75), ('come', 74)]


In [49]:
hamlet_freq = word_frequencies(hamlet_doc, include_stop=False).most_common(10)
print('Hamlet:', hamlet_freq)

Hamlet: [('ham', 337), ('lord', 211), ('haue', 175), ('king', 172), ("'s", 122), ('shall', 107), ('thou', 104), ('let', 104), ('come', 104), ('hamlet', 99)]


In [50]:
macbeth_freq = word_frequencies(macbeth_doc, include_stop=False).most_common(10)
print('Hamlet:', macbeth_freq)

Hamlet: [('macb', 137), ("'s", 131), ('haue', 122), ('thou', 87), ('enter', 81), ('shall', 68), ('macbeth', 61), ('thee', 61), ('vpon', 60), ('macd', 58)]


In [51]:
# Group into sentences.
caesar_sents = [[sent, "Caesar"] for sent in caesar_doc.sents]
hamlet_sents = [[sent, "Hamlet"] for sent in hamlet_doc.sents]
macbeth_sents = [[sent, "Macbeth"] for sent in macbeth_doc.sents]

In [53]:
# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(caesar_sents + macbeth_sents + macbeth_sents)
sentences.head()

Unnamed: 0,0,1
0,"(enter, flauius, ,, murellus, ,, and, certaine...",Caesar
1,"(flauius, .)",Caesar
2,"(hence, :, home, you, idle, creatures, ,, get,...",Caesar
3,"(what, ,, know, you, not, (, being, mechanical...",Caesar
4,"(you, ought, not, walke, vpon, a, labouring, d...",Caesar


In [54]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]

In [57]:
bow_caesar = bag_of_words(caesar_doc)
bow_hamlet = bag_of_words(hamlet_doc)
bow_macbeth = bag_of_words(macbeth_doc)

In [58]:
common_words = set(bow_caesar + bow_hamlet + bow_macbeth)

In [60]:
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

In [61]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500


Unnamed: 0,vnrighteous,taper,cheere,gaming,stab'd,flagon,prophetique,fly,illusion,backward,...,hew,physick,flye,infect,end,rapier,aboord,bellow,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(enter, flauius, ,, murellus, ,, and, certaine...",Caesar
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(flauius, .)",Caesar
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(hence, :, home, you, idle, creatures, ,, get,...",Caesar
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(what, ,, know, you, not, (, being, mechanical...",Caesar
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(you, ought, not, walke, vpon, a, labouring, d...",Caesar


### With Random Forest... the cross validation test scores range in the 0.85-0.88 range.

In [63]:
rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.980749398419

Test set score: 0.864948453608


In [65]:
scores = cross_val_score(rfc, X, Y, cv=5)
scores    

array([ 0.87641607,  0.85979381,  0.87216495,  0.8875129 ,  0.85758514])

### Also in a similiar range for Logistic Regression- this time 0.84-0.88.

In [84]:
lr = LogisticRegression(solver='newton-cg')
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(2909, 4012) (2909,)
Training set score: 0.941216913029

Test set score: 0.867525773196


In [85]:
scores2 = cross_val_score(lr, X, Y, cv=5)
scores2

array([ 0.86096807,  0.84329897,  0.88041237,  0.875129  ,  0.84623323])

### The lowest scores so far for Gradient Boosting Classifier...

In [69]:
clf = ensemble.GradientBoostingClassifier()

train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.830525953936

Test set score: 0.812886597938


In [70]:
scores3 = cross_val_score(clf, X, Y, cv=5)
scores3 

array([ 0.81359423,  0.78041237,  0.83298969,  0.82765738,  0.78637771])

### Support Vector Machine did the best overall...

In [74]:
svm = SVC(kernel = 'linear')
svm.fit(X_train, y_train)

print('Training set score:', svm.score(X_train, y_train))
print('\nTest set score:', svm.score(X_test, y_test))

Training set score: 0.961155036095

Test set score: 0.865979381443


In [75]:
scores4 = cross_val_score(svm, X, Y, cv=5)
scores4 

array([ 0.87435633,  0.86597938,  0.88453608,  0.875129  ,  0.86377709])

### So Support Vector Machine did best overall. 

In [86]:
print("Random Forest: ", np.mean(scores))
print("Logistic Regression: ", np.mean(scores2))
print("Gradient Boosting Classifier: ", np.mean(scores3))
print("Support Vector Machine: ", np.mean(scores4))

Random Forest:  0.870694573603
Logistic Regression:  0.861208328692
Gradient Boosting Classifier:  0.808206276465
Support Vector Machine:  0.872755577269


In [91]:
#reading in the data, this time in the form of paragraphs
caesar_p = gutenberg.paras('shakespeare-caesar.txt')
hamlet_p = gutenberg.paras('shakespeare-hamlet.txt')
macbeth_p = gutenberg.paras('shakespeare-macbeth.txt')

shakespear_p = caesar_p + hamlet_p + macbeth_p
#processing
shakespear_paras=[]
for paragraph in shakespear_p:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    shakespear_paras.append(' '.join(para))

print(shakespear_paras[0:4])

['[ The Tragedie of Julius Caesar by William Shakespeare 1599 ]', 'Actus Primus .', 'Enter Flauius , Murellus , and certaine Commoners ouer the Stage .', 'Flauius .']


In [97]:
X_train, X_test = train_test_split(shakespear_paras, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
shakespear__paras_tfidf=vectorizer.fit_transform(shakespear_paras)
print("Number of features: %d" % shakespear__paras_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(shakespear__paras_tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
og_sentence_1 = X_train[5]
tf_idf_vector1 = tfidf_bypara[5]

print('Original sentence: ', og_sentence_1)
print('Tf_idf vector: ', tf_idf_vector1)

Number of features: 330
Original sentence:  Guild .
Tf_idf vector:  {'guild': 1.0}


In [110]:
X_train_tfidf_csr.get_shape()

(1423, 330)

In [111]:
len(shakespear_paras)

2372

In [102]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(35,40):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 93.1159739067
Component 35:
Osr .     0.838335
Osr .     0.838335
Osr .     0.838335
Osr .     0.838335
Osr .     0.838335
Osr .     0.838335
Osr .     0.838335
Osr .     0.838335
Barn .    0.467084
Barn .    0.467084
Name: 35, dtype: float64
Component 36:
Por .    0.767166
Por .    0.767166
Por .    0.767166
Por .    0.767166
Por .    0.767166
Por .    0.767166
Por .    0.767166
Por .    0.767166
Osr .    0.007660
Osr .    0.007660
Name: 36, dtype: float64
Component 37:
Scena Sexta .      0.939481
Scena Septima .    0.938871
Scena Quarta .     0.910838
Scena Secunda .    0.865348
Scena Secunda .    0.865348
Scena Secunda .    0.865348
Scena Secunda .    0.865348
Scena Secunda .    0.865348
Scena Tertia .     0.800506
Scena Tertia       0.800506
Name: 37, dtype: float64
Component 38:
Thunder .                                                                 0.997595
Thunder .                                                                 0.9

In [103]:
paras_by_component

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,128,129
Exeunt .,8.929495e-16,3.479567e-16,-3.103309e-15,1.487366e-15,1.000000e+00,2.719247e-14,-1.277165e-15,1.544830e-15,7.533312e-16,3.630066e-16,...,1.012242e-12,8.621326e-13,-7.876929e-13,6.507878e-13,-2.186713e-13,-1.466414e-13,2.785624e-13,5.716160e-13,4.793390e-13,3.116004e-14
"Enter Horatio , with an Attendant .",-5.026364e-12,-5.205305e-11,4.370653e-10,1.456299e-02,3.802207e-09,-1.178602e-08,-2.201796e-08,4.088215e-09,-8.384306e-09,-3.056583e-08,...,-3.061699e-02,5.006733e-02,-1.316553e-01,2.427657e-01,-3.370142e-02,2.478235e-02,4.920367e-02,1.135875e-02,1.249401e-01,-3.620789e-01
Exit Cinna .,-4.839970e-13,5.127248e-12,1.529943e-12,2.531993e-03,3.277219e-10,-2.932106e-10,-7.346362e-10,9.078140e-10,-1.399579e-09,-5.196153e-09,...,1.106443e-03,1.013774e-02,5.402763e-03,5.485327e-03,-1.499077e-03,-1.001063e-02,-2.633145e-02,-1.289026e-02,6.857092e-04,3.356570e-03
Exeunt .,2.400817e-16,1.147213e-15,8.585683e-16,3.000257e-15,1.000000e+00,2.044854e-14,-1.306904e-15,3.040102e-15,5.563456e-16,6.045393e-16,...,1.012480e-12,8.624034e-13,-7.872077e-13,6.510909e-13,-2.190938e-13,-1.462800e-13,2.786712e-13,5.718040e-13,4.792292e-13,3.087709e-14
Luc .,-4.622801e-19,1.134004e-15,-1.246750e-15,-5.009813e-15,-5.191615e-14,1.235475e-14,3.460359e-14,-1.427846e-13,1.514300e-14,7.036984e-13,...,7.728383e-09,1.326757e-08,1.076105e-08,7.787145e-09,-8.359648e-09,5.693005e-11,-4.008360e-10,1.015528e-09,1.357683e-08,-1.982521e-08
Guild .,9.861222e-17,1.466872e-15,1.652165e-14,1.504032e-14,2.809281e-15,-5.523672e-14,-3.229042e-13,5.793700e-13,-5.594969e-14,-1.037823e-12,...,2.666795e-08,-3.800521e-08,-1.071452e-07,-1.813448e-08,8.144070e-08,-3.457410e-08,-7.549005e-09,8.943792e-09,-4.577618e-08,1.095982e-07
Macb .,-4.198454e-15,2.869985e-14,1.000000e+00,7.131972e-14,-8.162780e-16,6.712969e-15,6.656716e-15,-5.567048e-15,5.394500e-16,-9.827843e-16,...,-2.746092e-14,-3.698214e-14,9.782633e-15,1.390045e-14,-4.256348e-14,6.254003e-14,1.328141e-14,2.800828e-14,-2.684503e-14,-6.850405e-14
Guil .,1.955625e-15,1.388273e-13,-1.778063e-13,4.798112e-13,-1.003434e-12,7.100971e-12,8.905416e-12,1.672219e-11,-2.719237e-11,-7.419883e-11,...,2.853589e-06,-6.120499e-06,-9.737617e-06,-2.708530e-06,3.691216e-06,3.682623e-07,-1.787110e-06,6.074424e-06,-4.716529e-06,2.838375e-06
"I goe , and it is done : the Bell inuites me .",-2.593717e-13,-3.417505e-11,-4.183707e-10,1.808245e-06,1.050068e-09,-5.677010e-10,2.556087e-09,-5.294442e-09,-1.454858e-08,-3.534084e-08,...,-3.391945e-02,5.308512e-02,7.956329e-02,-1.144931e-01,-3.837067e-02,-1.926084e-02,-2.828922e-02,-4.877281e-02,9.906610e-02,1.897513e-02
Hor .,-1.926744e-16,-1.779907e-15,1.403594e-15,1.147640e-15,2.412387e-15,2.189743e-15,1.000000e+00,6.786147e-15,4.257194e-15,4.579339e-16,...,4.526344e-12,4.313232e-12,-2.535288e-12,1.156095e-16,8.078954e-13,-8.946316e-13,-9.087842e-14,-2.152945e-13,1.907964e-12,6.531017e-12
