In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk
nltk.download('gutenberg')
!python -m spacy download en

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package gutenberg to /Users/Isaac/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!

[93m    Linking successful[0m
    /anaconda3/lib/python3.6/site-packages/en_core_web_sm -->
    /anaconda3/lib/python3.6/site-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [166]:
# Load the data.
blake_poems = gutenberg.raw('blake-poems.txt')
whitman_leaves = gutenberg.raw('whitman-leaves.txt')

In [167]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [168]:
blake_poems = text_cleaner(blake_poems)
whitman_leaves = text_cleaner(whitman_leaves)

In [169]:
blake_poems[0:1000]

'SONGS OF INNOCENCE AND OF EXPERIENCE and THE BOOK of THEL SONGS OF INNOCENCE INTRODUCTION Piping down the valleys wild, Piping songs of pleasant glee, On a cloud I saw a child, And he laughing said to me: "Pipe a song about a Lamb!" So I piped with merry cheer. "Piper, pipe that song again;" So I piped: he wept to hear. "Drop thy pipe, thy happy pipe; Sing thy songs of happy cheer:!" So I sang the same again, While he wept with joy to hear. "Piper, sit thee down and write In a book, that all may read." So he vanish\'d from my sight; And I pluck\'d a hollow reed, And I made a rural pen, And I stain\'d the water clear, And I wrote my happy songs Every child may joy to hear. THE SHEPHERD How sweet is the Shepherd\'s sweet lot! From the morn to the evening he stays; He shall follow his sheep all the day, And his tongue shall be filled with praise. For he hears the lambs\' innocent call, And he hears the ewes\' tender reply; He is watching while they are in peace, For they know when their 

In [170]:
whitman_leaves[0:1000]

"Come, said my soul, Such verses for my Body let us write, (for we are one,) That should I after return, Or, long, long hence, in other spheres, There to some group of mates the chants resuming, (Tallying Earth's soil, trees, winds, tumultuous waves,) Ever with pleas'd smile I may keep on, Ever and ever yet the verses owning as, first, I here and now Signing for Soul and Body, set to them my name, Walt Whitman } One's-Self I Sing One's-self I sing, a simple separate person, Yet utter the word Democratic, the word En-Masse. Of physiology from top to toe I sing, Not physiognomy alone nor brain alone is worthy for the Muse, I say the Form complete is worthier far, The Female equally with the Male I sing. Of Life immense in passion, pulse, and power, Cheerful, for freest action form'd under the laws divine, The Modern Man I sing. } As I Ponder'd in Silence As I ponder'd in silence, Returning upon my poems, considering, lingering long, A Phantom arose before me with distrustful aspect, Terr

In [171]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
blake_poems_doc = nlp(blake_poems)
whitman_leaves_doc = nlp(whitman_leaves)

In [182]:
# Group into sentences.
# Separatng Blake poem titles from poem text

blake_lines = [[line, "Blake"] for line in blake_poems_doc.sents if not line.text.isupper()]
blake_titles = [[title, "Blake"] for title in blake_poems_doc.sents if title.text.isupper()]
whitman_sents = [[sent, "Whitman"] for sent in whitman_leaves_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(blake_lines + blake_titles + whitman_sents)
sentences.head()

Unnamed: 0,0,1
0,"(SONGS, OF, INNOCENCE, AND, OF, EXPERIENCE, an...",Blake
1,"(INTRODUCTION, Piping, down, the, valleys, wil...",Blake
2,"(So, I, piped, with, merry, cheer, ., "")",Blake
3,"(Piper, ,, pipe, that, song, again)",Blake
4,"(;, "", So, I, piped, :, he, wept, to, hear, ., "")",Blake


In [183]:
sentences[0][sentences[1] == 'Blake'][-35:-25]

463    (Why, a, tender, curb, upon, the, youthful, bu...
464    (Why, a, little, curtain, of, flesh, on, the, ...
465    (The, Virgin, started, from, her, seat, ,, &, ...
466                                      (THE, SHEPHERD)
467                                (THE, ECHOING, GREEN)
468                           (THE, CHIMNEY, -, SWEEPER)
469                                     (LAUGHING, SONG)
470                                      (DIVINE, IMAGE)
471                                              (NIGHT)
472                                             (SPRING)
Name: 0, dtype: object

In [205]:
# Utility function to create a list of the 2000 most common words. Augmented to exclude titles - 
# if not line.text.isupper()

def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence'][::100]):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
blakewords = bag_of_words(blake_poems_doc)
whitmanwords = bag_of_words(whitman_leaves_doc)

# Combine bags to create a set of unique words.
common_words = set(blakewords + whitmanwords)

In [209]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)

Processing row 0
Processing row 50


In [211]:
# Creating sentiment column with TextBlob
from textblob import TextBlob

word_counts['text_sentence_sentiment_polarity'] = word_counts['text_sentence'].apply(str).apply(lambda x: TextBlob(x).sentiment.polarity)
word_counts['text_sentence_sentiment_subjectivity'] = word_counts['text_sentence'].apply(str).apply(lambda x: TextBlob(x).sentiment.subjectivity)

In [212]:
word_counts['text_sentence_avg_word_length'] = word_counts['text_sentence'].apply(str).apply(lambda x: x.split())
word_counts['text_sentence_avg_word_length'] = word_counts['text_sentence_avg_word_length'].apply(lambda words: sum(len(word) for word in words) / len(words))

In [213]:
word_counts.head()

Unnamed: 0,personality,poor,stony,sell,victory,door,delight,substance,ned,bar,...,flash,whatev,manacle,while,build,text_sentence,text_source,text_sentence_sentiment_polarity,text_sentence_sentiment_subjectivity,text_sentence_avg_word_length
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,"(SONGS, OF, INNOCENCE, AND, OF, EXPERIENCE, an...",Blake,0.0,0.0,4.5
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,"(INTRODUCTION, Piping, down, the, valleys, wil...",Blake,0.287037,0.551852,4.2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,"(So, I, piped, with, merry, cheer, ., "")",Blake,0.0,0.0,3.428571
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,"(Piper, ,, pipe, that, song, again)",Blake,0.0,0.0,4.6
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,"(;, "", So, I, piped, :, he, wept, to, hear, ., "")",Blake,0.0,0.0,2.777778


In [215]:
# Adding a feature encoding titles for blake poems

len_blake_lines = len(blake_lines)
len_total_blake = len(blake_lines + blake_titles)

word_counts['titles'] = 0
word_counts['titles'].loc[len_blake_lines:len_total_blake] = 1

word_counts.iloc[len_blake_lines-1:len_blake_lines+1]

Unnamed: 0,personality,poor,stony,sell,victory,door,delight,substance,ned,bar,...,whatev,manacle,while,build,text_sentence,text_source,text_sentence_sentiment_polarity,text_sentence_sentiment_subjectivity,text_sentence_avg_word_length,titles
465,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(The, Virgin, started, from, her, seat, ,, &, ...",Blake,0.0,0.0,4.095238,0
466,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(THE, SHEPHERD)",Blake,0.0,0.0,5.5,1


In [229]:
# Encoding Blake
word_counts.text_source = word_counts.text_source.eq('Blake').mul(1)
word_counts.head()

Unnamed: 0,personality,poor,stony,sell,victory,door,delight,substance,ned,bar,...,whatev,manacle,while,build,text_sentence,text_source,text_sentence_sentiment_polarity,text_sentence_sentiment_subjectivity,text_sentence_avg_word_length,titles
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(SONGS, OF, INNOCENCE, AND, OF, EXPERIENCE, an...",1,0.0,0.0,4.5,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(INTRODUCTION, Piping, down, the, valleys, wil...",1,0.287037,0.551852,4.2,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(So, I, piped, with, merry, cheer, ., "")",1,0.0,0.0,3.428571,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(Piper, ,, pipe, that, song, again)",1,0.0,0.0,4.6,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(;, "", So, I, piped, :, he, wept, to, hear, ., "")",1,0.0,0.0,2.777778,0


# BoW with Logistic Regression

In [230]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))



(3724, 2584) (3724,)
Training set score: 0.9264232008592911

Test set score: 0.9335481272654047


In [239]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

pred = lr.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[2297    1]
 [ 164   21]]
              precision    recall  f1-score   support

           0       0.93      1.00      0.97      2298
           1       0.95      0.11      0.20       185

   micro avg       0.93      0.93      0.93      2483
   macro avg       0.94      0.56      0.58      2483
weighted avg       0.93      0.93      0.91      2483



# SVM

In [232]:
from sklearn.svm import LinearSVC

svm = LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
                intercept_scaling=1, loss='squared_hinge', max_iter=1000,
                multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
                verbose=0)

# svm = SVC()
train = svm.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', svm.score(X_train, y_train))
print('\nTest set score:', svm.score(X_test, y_test))



(3724, 2584) (3724,)
Training set score: 0.9296455424274973

Test set score: 0.93717277486911


In [240]:
pred = svm.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[2297    1]
 [ 155   30]]
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      2298
           1       0.97      0.16      0.28       185

   micro avg       0.94      0.94      0.94      2483
   macro avg       0.95      0.58      0.62      2483
weighted avg       0.94      0.94      0.92      2483



# Gradient Boosting

In [234]:
from sklearn import ensemble

clf = ensemble.GradientBoostingClassifier()

train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.9299140708915145

Test set score: 0.9387837293596456


In [241]:
pred = clf.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[2296    2]
 [ 150   35]]
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      2298
           1       0.95      0.19      0.32       185

   micro avg       0.94      0.94      0.94      2483
   macro avg       0.94      0.59      0.64      2483
weighted avg       0.94      0.94      0.92      2483



# Explanation

It appears that the Gradient Boosting model worked the best. Unfortunately I am getting obviously low recall score and f1-score. A system with high precision but low recall is returning very few (positive) results, but most of its predicted labels are correct when compared to the training labels. Recall measures effectiveness of labelling true positives.

The implication here is that there is a class imbalance (there are simply fewer words for Blake text). The evidence is below (5709 positives vs 498). But the model is good.

In [254]:
word_counts.groupby('text_source').count()

Unnamed: 0_level_0,personality,poor,stony,sell,victory,door,delight,substance,ned,bar,...,flash,whatev,manacle,while,build,text_sentence,text_sentence_sentiment_polarity,text_sentence_sentiment_subjectivity,text_sentence_avg_word_length,titles
text_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5709,5709,5709,5709,5709,5709,5709,5709,5709,5709,...,5709,5709,5709,5709,5709,5709,5709,5709,5709,5709
1,498,498,498,498,498,498,498,498,498,498,...,498,498,498,498,498,498,498,498,498,498
