In [27]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn import ensemble

from sklearn.svm import SVC


In [2]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [3]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [4]:
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [5]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [6]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000


Unnamed: 0,character,fearful,rein,crouch,comfortable,message,scholar,crawl,turkey,rally,...,complaisance,pencil,occupation,o'clock,dodo,arrow,merit,wide,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, shall, be, late, !, ')",Carroll


In [7]:
len(common_words)

3059

### I don't know why, but for some reason my scores are slightly lower than what's in the curriculum and I cut and pasted the whole thing! I did change the test size from 0.4 to 0.2 though and that brought it closer. And from what I can see, the X_train, y_train shapes are different.

In [8]:
rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.988719153937

Test set score: 0.906015037594


In [9]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(4255, 3059) (4255,)
Training set score: 0.956756756757

Test set score: 0.916353383459


In [10]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.884606345476

Test set score: 0.871240601504


### Now with Emma! This one too, despite basically copying and pasting the work, also has a much lower score from 70% to 65% accuracy... :-/

In [11]:
# Clean the Emma data.
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma)
print(emma[:100])

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


In [14]:
# Parse our cleaned data.
emma_doc = nlp(emma)

In [15]:
# Group into sentences.
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]

# Emma is quite long, let's cut it down to the same length as Alice.
emma_sents = emma_sents[0:len(alice_sents)]

In [18]:
emma_sents

[[Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to unite some of the best blessings of existence; and had lived nearly twenty-one years in the world with very little to distress or vex her.,
  'Austen'],
 [She was the youngest of the two daughters of a most affectionate, indulgent father; and had, in consequence of her sister's marriage, been mistress of his house from a very early period.,
  'Austen'],
 [Her mother had died too long ago for her to have more than an indistinct remembrance of her caresses; and her place had been supplied by an excellent woman as governess, who had fallen little short of a mother in affection.,
  'Austen'],
 [Sixteen years had Miss Taylor been in Mr. Woodhouse's family, less as a governess than a friend, very fond of both daughters, but particularly of Emma.,
  'Austen'],
 [Between _them, 'Austen'],
 [_, 'Austen'],
 [it was more the intimacy of sisters., 'Austen'],
 [Even before Miss Taylor had ceased t

In [16]:
# Build a new Bag of Words data frame for Emma word counts.
# We'll use the same common words from Alice and Persuasion.
emma_sentences = pd.DataFrame(emma_sents)
emma_bow = bow_features(emma_sentences, common_words)

print('done')

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
done


In [20]:
emma_sentences.head()

Unnamed: 0,0,1
0,"(Emma, Woodhouse, ,, handsome, ,, clever, ,, a...",Austen
1,"(She, was, the, youngest, of, the, two, daught...",Austen
2,"(Her, mother, had, died, too, long, ago, for, ...",Austen
3,"(Sixteen, years, had, Miss, Taylor, been, in, ...",Austen
4,"(Between, _, them)",Austen


In [21]:
emma_bow

Unnamed: 0,character,fearful,rein,crouch,comfortable,message,scholar,crawl,turkey,rally,...,complaisance,pencil,occupation,o'clock,dodo,arrow,merit,wide,text_sentence,text_source
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Emma, Woodhouse, ,, handsome, ,, clever, ,, a...",Austen
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(She, was, the, youngest, of, the, two, daught...",Austen
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Her, mother, had, died, too, long, ago, for, ...",Austen
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Sixteen, years, had, Miss, Taylor, been, in, ...",Austen
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Between, _, them)",Austen
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(_),Austen
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(it, was, more, the, intimacy, of, sisters, .)",Austen
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Even, before, Miss, Taylor, had, ceased, to, ...",Austen
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, real, evils, ,, indeed, ,, of, Emma, 's,...",Austen
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, danger, ,, however, ,, was, at, present,...",Austen


In [17]:
# Now we can model it!
# Let's use logistic regression again.

# Combine the Emma sentence data with the Alice data from the test set.
X_Emma_test = np.concatenate((
    X_train[y_train[y_train=='Carroll'].index],
    emma_bow.drop(['text_sentence','text_source'], 1)
), axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_bow.shape[0])])

# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.652564956696


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1576,94
Carroll,949,383


### First, I'm going to try the support vector machine for classification... It worked about as well as the logistic regression model.

In [28]:
svm = SVC(kernel = 'linear')
svm.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [32]:
print('Training set score:', svm.score(X_train, y_train))
print('\nTest set score:', svm.score(X_test, y_test))

Training set score: 0.971797884841

Test set score: 0.916353383459


### Now I'm going to add two features to see if that changes anything: Sentence length and the number of unique words per sentence.

In [36]:
word_counts.shape

(5319, 3061)

In [67]:
example_sentence = word_counts.iloc[0,3059]
example_sentence

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'

In [43]:
# Look at some metrics around this sentence.
example_words = [token for token in example_sentence if not token.is_punct]
unique_words = set([token.text for token in example_words])

In [46]:
len(example_words)

57

In [47]:
len(unique_words)

41

In [58]:
sentence_lengths = []
unique_words_length = []
for i in range(word_counts.shape[0]):
    example_sentence = word_counts.iloc[i,3059]
    example_words = [token for token in example_sentence if not token.is_punct]
    length = len(example_words)
    sentence_lengths.append(length)
    unique_length = len(set([token.text for token in example_words]))
    unique_words_length.append(unique_length)

In [51]:
word_counts['sentence_lengths'] = sentence_lengths

In [61]:
word_counts['unique_words_length'] = unique_words_length

In [63]:
word_counts.head()

Unnamed: 0,character,fearful,rein,crouch,comfortable,message,scholar,crawl,turkey,rally,...,occupation,o'clock,dodo,arrow,merit,wide,text_sentence,text_source,sentence_lengths,unique_words_length
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,57,41
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll,56,46
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll,29,25
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(Oh, dear, !)",Carroll,2,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"(I, shall, be, late, !, ')",Carroll,4,4


### So, I added my two new features. Now going to try it with logistic regression. It was slightly better with a jump from 0.957 to 0.963 in the training test and a jump from 0.916 to 0.917 in the test set.

In [64]:
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

In [65]:
lr2 = LogisticRegression()
train = lr2.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr2.score(X_train, y_train))
print('\nTest set score:', lr2.score(X_test, y_test))

(4255, 3061) (4255,)
Training set score: 0.963337250294

Test set score: 0.917293233083


### So weird, that the test set scores are EXACTLY alike with the logistic regression. It did do better in the training set score though. And it did perform slightly better from the previous SVM score going from 0.971 to 0.976 in the train set and 0.916 to 0.917 in the test set...

In [66]:
svm2 = SVC(kernel = 'linear')
svm2.fit(X_train, y_train)

print('Training set score:', svm2.score(X_train, y_train))
print('\nTest set score:', svm2.score(X_test, y_test))

Training set score: 0.976263219741

Test set score: 0.917293233083


### Now let's try parts of speech!!!

In [68]:
example_sentence = word_counts.iloc[0,3059]
example_sentence

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'

In [75]:
# View the part of speech for some tokens in our sentence.
parts_of_speech = []
for token in example_sentence:
    parts_of_speech.append(token.pos_)

In [79]:
len(set(parts_of_speech))


11

In [82]:
parts_of_speech

['PROPN',
 'VERB',
 'VERB',
 'PART',
 'VERB',
 'ADV',
 'ADJ',
 'ADP',
 'VERB',
 'ADP',
 'ADJ',
 'NOUN',
 'ADP',
 'DET',
 'NOUN',
 'PUNCT',
 'CCONJ',
 'ADP',
 'VERB',
 'NOUN',
 'PART',
 'VERB',
 'PUNCT',
 'ADV',
 'CCONJ',
 'ADV',
 'PRON',
 'VERB',
 'VERB',
 'ADP',
 'DET',
 'NOUN',
 'ADJ',
 'NOUN',
 'VERB',
 'VERB',
 'PUNCT',
 'CCONJ',
 'PRON',
 'VERB',
 'DET',
 'NOUN',
 'CCONJ',
 'NOUN',
 'ADP',
 'PRON',
 'PUNCT',
 'PUNCT',
 'CCONJ',
 'NOUN',
 'VERB',
 'DET',
 'NOUN',
 'ADP',
 'DET',
 'NOUN',
 'PUNCT',
 'PUNCT',
 'VERB',
 'PROPN',
 'PUNCT',
 'ADP',
 'NOUN',
 'CCONJ',
 'NOUN',
 'PUNCT',
 'PUNCT']

In [81]:
from collections import Counter

print(Counter(parts_of_speech).keys()) # equals to list(set(words))
print(Counter(parts_of_speech).values()) # counts the elements' frequency

dict_keys(['VERB', 'ADP', 'PRON', 'PROPN', 'PART', 'ADV', 'DET', 'ADJ', 'NOUN', 'CCONJ', 'PUNCT'])
dict_values([13, 8, 3, 2, 2, 3, 5, 3, 12, 6, 10])


In [88]:
keys = list(Counter(parts_of_speech).keys())
values = list(Counter(parts_of_speech).values())
for i in range(len(Counter(parts_of_speech).keys())):
    print('{}={}'.format(keys[i],values[i]))

VERB=13
ADP=8
PRON=3
PROPN=2
PART=2
ADV=3
DET=5
ADJ=3
NOUN=12
CCONJ=6
PUNCT=10


In [None]:
for i in range(word_counts.shape[0]):
    example_sentence = word_counts.iloc[i,3059]
    parts_of_speech = []
    for token in example_sentence:
        parts_of_speech.append(token.pos_)
    keys = list(Counter(parts_of_speech).keys())
    values = list(Counter(parts_of_speech).values())
    for j in len(keys):
        word_counts.at[i, keys]

In [90]:
df.at['C', 'x'] = 10

5319

In [95]:
import nltk
nltk.download('punkt')
from nltk import pos_tag, word_tokenize
tokens = nltk.word_tokenize(example_sentence)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vincentgomez/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


TypeError: expected string or bytes-like object

In [98]:
example_sentence

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'

In [100]:
type(example_sentence)

spacy.tokens.span.Span