In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk

In [2]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [3]:
alice = gutenberg.raw('carroll-alice.txt')
persuasion = gutenberg.raw('austen-persuasion.txt')
print(type(alice))
alice[0:1000]

<class 'str'>


"[Alice's Adventures in Wonderland by Lewis Carroll 1865]\n\nCHAPTER I. Down the Rabbit-Hole\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into the\nbook her sister was reading, but it had no pictures or conversations in\nit, 'and what is the use of a book,' thought Alice 'without pictures or\nconversation?'\n\nSo she was considering in her own mind (as well as she could, for the\nhot day made her feel very sleepy and stupid), whether the pleasure\nof making a daisy-chain would be worth the trouble of getting up and\npicking the daisies, when suddenly a White Rabbit with pink eyes ran\nclose by her.\n\nThere was nothing so VERY remarkable in that; nor did Alice think it so\nVERY much out of the way to hear the Rabbit say to itself, 'Oh dear!\nOh dear! I shall be late!' (when she thought it over afterwards, it\noccurred to her that she ought to have wondered at this, but at the time\nit all seeme

In [4]:
def data_cleaning(txt):
    pattern = "[\[].*?[\]]"
    text = re.sub(pattern,'',txt)
    text = re.sub(r'CHAPTER *.', '', text)
    text = re.sub(r'Chapter \d+', '', text)
    text = ' '.join(text.split())
    return text

In [5]:
alice_cleaned = data_cleaning(alice[:int(len(alice)/10)])
persuasion_cleaned = data_cleaning(persuasion[:int(len(persuasion)/10)])
print(alice_cleaned[:500])
print(persuasion_cleaned[:500])

. Down the Rabbit-Hole Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?' So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be wort
Sir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who, for his own amusement, never took up any book but the Baronetage; there he found occupation for an idle hour, and consolation in a distressed one; there his faculties were roused into admiration and respect, by contemplating the limited remnant of the earliest patents; there any unwelcome sensations, arising from domestic affairs changed naturally into pity and contempt as he turned over the almost endless creations of the la

In [6]:
#data parsing
nlp = spacy.load('en_core_web_sm')
alice_doc = nlp(alice_cleaned)
persuasion_doc = nlp(persuasion_cleaned)
print(type(alice_doc),type(persuasion_doc))

<class 'spacy.tokens.doc.Doc'> <class 'spacy.tokens.doc.Doc'>


In [7]:
#sentences
alice_sents = list(alice_doc.sents)
persuasion_sents = list(persuasion_doc.sents)
print(alice_sents[1],'\n',persuasion_sents[1])

Down the Rabbit-Hole Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?' 
 This was the page at which the favourite volume always opened: "ELLIOT OF KELLYNCH HALL.


In [8]:
#Stop words
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
#lemma
alice_doc[1:100].lemma_

"down the Rabbit - Hole Alice be begin to get very tired of sit by -PRON- sister on the bank , and of have nothing to do : once or twice -PRON- have peep into the book -PRON- sister be read , but -PRON- have no picture or conversation in -PRON- , ' and what be the use of a book , ' think Alice ' without picture or conversation ? ' so -PRON- be consider in -PRON- own mind ( as well as -PRON- could , for the hot day make -PRON- feel very sleepy and stupid )"

In [10]:
#word frequencies with stopwords
from collections import Counter

def word_frequencies(txt):
    words = []
    for token in txt:
        if not token.is_punct and not token.is_stop:
            words.append(token.text)
    return Counter(words)            

In [11]:
alice_frequencies = word_frequencies(alice_doc).most_common(30)
print(alice_frequencies)

[('Alice', 34), ('little', 19), ('way', 16), ('like', 13), ('think', 11), ('time', 11), ('went', 11), ('thought', 10), ('Rabbit', 9), ('said', 9), ('Oh', 8), ('feet', 8), ('wonder', 8), ('door', 8), ('found', 7), ('hall', 7), ('key', 7), ('shall', 6), ('moment', 6), ('going', 6), ('things', 6), ('eat', 6), ('came', 6), ('garden', 6), ('use', 5), ('dear', 5), ('looked', 5), ('large', 5), ('look', 5), ('tried', 5)]


In [12]:
persuasion_frequencies = word_frequencies(persuasion_doc).most_common(30)
print(persuasion_frequencies)

[('Sir', 68), ('Walter', 67), ('Elliot', 33), ('Lady', 32), ('Mr', 31), ('Anne', 29), ('Shepherd', 28), ('Russell', 26), ('father', 25), ('Elizabeth', 24), ('Kellynch', 23), ('man', 20), ('house', 20), ('good', 17), ('little', 16), ('Hall', 15), ('great', 15), ('Admiral', 15), ('family', 14), ('years', 14), ('soon', 12), ('gentleman', 12), ('Mrs', 11), ('time', 11), ('seen', 11), ('tenant', 11), ('found', 10), ('think', 10), ('having', 10), ('consequence', 10)]


In [13]:
type(alice_frequencies)

list

In [14]:
print(alice_frequencies[0])
alice_common = [val[0] for val in alice_frequencies]
print(alice_common)

('Alice', 34)
['Alice', 'little', 'way', 'like', 'think', 'time', 'went', 'thought', 'Rabbit', 'said', 'Oh', 'feet', 'wonder', 'door', 'found', 'hall', 'key', 'shall', 'moment', 'going', 'things', 'eat', 'came', 'garden', 'use', 'dear', 'looked', 'large', 'look', 'tried']


In [15]:
type(set(alice_common))

set

In [16]:
#lemma frequencies with stopwords
def lemma_frequencies(txt):
    words = []
    #prefixes = []
    for token in txt:
        if not token.is_punct and not token.is_stop:
            words.append(token.lemma_)
            #prefixes.append(token.prefix_)
    return Counter(words)

In [17]:
alice_lemma_frequencies = lemma_frequencies(alice_doc).most_common(30)
print(alice_lemma_frequencies)

[('Alice', 34), ('think', 19), ('little', 19), ('go', 17), ('way', 16), ('like', 13), ('time', 11), ('find', 11), ('fall', 11), ('come', 11), ('say', 11), ('wonder', 10), ('look', 10), ('door', 10), ('Rabbit', 9), ('begin', 9), ('get', 9), ('eat', 9), ('oh', 8), ('foot', 8), ('large', 8), ('try', 8), ('thing', 8), ('dear', 7), ('shall', 7), ('good', 7), ('hall', 7), ('key', 7), ('feel', 6), ('moment', 6)]


In [18]:
#Counting the number of words and number of unique words in a sentence
example_sentence = alice_sents[2]
example_words = [token.text for token in example_sentence if not token.is_punct]
unique_words = set([token for token in example_words])
print(example_words,'\n\n\n',unique_words)

['So', 'she', 'was', 'considering', 'in', 'her', 'own', 'mind', 'as', 'well', 'as', 'she', 'could', 'for', 'the', 'hot', 'day', 'made', 'her', 'feel', 'very', 'sleepy', 'and', 'stupid', 'whether', 'the', 'pleasure', 'of', 'making', 'a', 'daisy', 'chain', 'would', 'be', 'worth', 'the', 'trouble', 'of', 'getting', 'up', 'and', 'picking', 'the', 'daisies', 'when', 'suddenly', 'a', 'White', 'Rabbit', 'with', 'pink', 'eyes', 'ran', 'close', 'by', 'her'] 


 {'made', 'suddenly', 'chain', 'making', 'trouble', 'the', 'well', 'when', 'ran', 'her', 'picking', 'close', 'by', 'worth', 'own', 'and', 'eyes', 'getting', 'in', 'was', 'for', 'whether', 'day', 'stupid', 'daisies', 'as', 'mind', 'Rabbit', 'of', 'be', 'So', 'White', 'would', 'with', 'she', 'daisy', 'pink', 'pleasure', 'could', 'very', 'feel', 'up', 'considering', 'hot', 'sleepy', 'a'}


In [19]:
# Parts of speech and dependencies in a sentence
for token in example_sentence[:10]:
    print(token.orth_, token.dep_, token.pos_, token.head.orth_)
    #print(token.text, token.pos_)

So advmod ADV considering
she nsubj PRON considering
was aux VERB considering
considering ROOT VERB considering
in prep ADP considering
her poss DET mind
own amod ADJ mind
mind pobj NOUN in
( punct PUNCT mind
as advmod ADV as


In [20]:
entities = list(alice_doc.ents)[0:10]
print(entities)

[Alice, the hot day, Alice, Rabbit, Rabbit, Alice, Alice, Alice, First, one]


In [21]:
for t in entities:
    print(t.label_, t.orth_)

PERSON Alice
DATE the hot day
PERSON Alice
PERSON Rabbit
PERSON Rabbit
PERSON Alice
PERSON Alice
PERSON Alice
ORDINAL First
CARDINAL one


In [22]:
#People in Alice novel
person_alice = [entity.text for entity in list(alice_doc.ents) if entity.label_=='PERSON']
print(set(person_alice))

{'Alice', 'Dinah', 'Down', "Ma'am", 'Rabbit', 'ALICE', 'FOOT', 'Latitude', 'Longitude'}


In [23]:
person_persuasion = [entity.text for entity in list(persuasion_doc.ents) if entity.label_=='PERSON']
print(set(person_persuasion))

{'Elizabeth', 'Basil Morley', 'Wentworth', "Lady Russell's", 'William Walter Elliot', 'Walter Elliot', 'Mr--', "Elizabeth Elliot's", 'James Stevenson', "Mrs Croft's", 'Anne', 'Thirteen', "Mrs Clay's", 'Basil', 'Lady Russell', 'Baldwin', 'Charles', 'Mr Elliot', 'Somerset', 'Charles Musgrove', 'Trafalgar', 'Mr Wentworth', "Mr Shepherd's", 'Strafford', 'Tattersall', 'Lady Elliot', 'Mr Shepherd', 'Shepherd', "Lady Elliot's", 'St Domingo', 'Mrs Charles Musgrove', 'Mrs Croft', 'Baronetage', 'Anne Elliot', 'Charles II', 'Elizabeths', 'Elliot', 'Frederick Wentworth', 'Croft', 'Walter', 'Mrs Clay', 'John Shepherd', 'Mary', 'Walter--'}


In [24]:
alice_sents_for_df = [[sent,'alice'] for sent in alice_sents]
persuasion_sents_for_df = [[sent,'persuasion'] for sent in persuasion_sents]
sentences = pd.DataFrame(alice_sents_for_df + persuasion_sents_for_df)
sentences

Unnamed: 0,0,1
0,(.),alice
1,"(Down, the, Rabbit, -, Hole, Alice, was, begin...",alice
2,"(So, she, was, considering, in, her, own, mind...",alice
3,"(There, was, nothing, so, VERY, remarkable, in...",alice
4,"(Oh, dear, !)",alice
5,"(Oh, dear, !)",alice
6,"(I, shall, be, late, !, ')",alice
7,"((, when, she, thought, it, over, afterwards, ...",alice
8,"(A, WATCH, OUT, OF, ITS, WAISTCOAT, -, POCKET,...",alice
9,"(,, Alice, started, to, her, feet, ,, for, it,...",alice


In [25]:
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df_new = pd.DataFrame(columns=common_words)
    df_new['text_sentence'] = sentences[0]
    df_new['text_source'] = sentences[1]
    df_new.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df_new['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df_new.loc[i, word] += 1
    return df_new

# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [26]:
word_counts = bow_features(sentences, common_words)
word_counts.head()

Unnamed: 0,odd,privilege,gentleness,enjoyment,finish,grove,ago,inch,knock,betime,...,household,hair,possibly,stop,war,explanation,simple,infatuation,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(.),alice
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Down, the, Rabbit, -, Hole, Alice, was, begin...",alice
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",alice
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",alice
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",alice


In [27]:
word_counts.columns

Index(['odd', 'privilege', 'gentleness', 'enjoyment', 'finish', 'grove', 'ago',
       'inch', 'knock', 'betime',
       ...
       'household', 'hair', 'possibly', 'stop', 'war', 'explanation', 'simple',
       'infatuation', 'text_sentence', 'text_source'],
      dtype='object', length=1635)

In [28]:
word_counts.query('Anne == 1')

Unnamed: 0,odd,privilege,gentleness,enjoyment,finish,grove,ago,inch,knock,betime,...,household,hair,possibly,stop,war,explanation,simple,infatuation,text_sentence,text_source
136,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Anne, ,, born, August, 9, ,, 1787, ;, a, stil...",persuasion
157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Mary, had, acquired, a, little, artificial, i...",persuasion
158,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(she, was, only, Anne, .)",persuasion
160,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Lady, Russell, loved, them, all, ;, but, it, ...",persuasion
161,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(A, few, years, before, ,, Anne, Elliot, had, ...",persuasion
167,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Anne, haggard, ,, Mary, coarse, ,, every, fac...",persuasion
202,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Does, it, occur, to, you, that, there, is, an...",persuasion
217,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Herself, the, widow, of, only, a, knight, ,, ...",persuasion
221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(:, she, consulted, Anne, ,, who, never, seeme...",persuasion
223,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Every, emendation, of, Anne, 's, had, been, o...",persuasion


In [29]:
word_counts.describe()

Unnamed: 0,odd,privilege,gentleness,enjoyment,finish,grove,ago,inch,knock,betime,...,household,hair,possibly,stop,war,explanation,simple,infatuation,text_sentence,text_source
count,454,454,454,454,454,454,454,454,454,454,...,454,454,454,454,454,454,454,454,454,454
unique,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,454,2
top,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(A, larger, society, would, improve, them, .)",persuasion
freq,451,452,453,451,452,452,453,451,451,453,...,453,453,453,452,453,452,453,453,1,322


In [30]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)
y_pred_rn = rfc.predict(X_test)
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))



Training set score: 0.9779411764705882

Test set score: 0.8461538461538461


In [31]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=object)

In [32]:
#Logistic regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2') # No need to specify l2 as it's the default. But we put it for demonstration.
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
y_pred = lr.predict(X_test)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))



(272, 1633) (272,)
Training set score: 0.9742647058823529

Test set score: 0.8956043956043956


In [33]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)
y_pred_gb = clf.predict(X_test)
print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.9632352941176471

Test set score: 0.8351648351648352


In [34]:
from sklearn.svm import SVC
svm = SVC(kernel = 'linear')
train = svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print('Training set score:', svm.score(X_train, y_train))
print('\nTest set score:', svm.score(X_test, y_test))

Training set score: 0.9852941176470589

Test set score: 0.8681318681318682


In [35]:
from sklearn.metrics import classification_report
target_names = ['alice', 'persuasion']
print(classification_report(y_test, y_pred_rn, target_names=target_names))
print(classification_report(y_test, y_pred, target_names=target_names))
print(classification_report(y_test, y_pred_gb, target_names=target_names))
print(classification_report(y_test, y_pred_svm, target_names=target_names))

              precision    recall  f1-score   support

       alice       0.90      0.53      0.67        53
  persuasion       0.83      0.98      0.90       129

    accuracy                           0.85       182
   macro avg       0.87      0.75      0.78       182
weighted avg       0.85      0.85      0.83       182

              precision    recall  f1-score   support

       alice       0.95      0.68      0.79        53
  persuasion       0.88      0.98      0.93       129

    accuracy                           0.90       182
   macro avg       0.91      0.83      0.86       182
weighted avg       0.90      0.90      0.89       182

              precision    recall  f1-score   support

       alice       0.96      0.45      0.62        53
  persuasion       0.82      0.99      0.90       129

    accuracy                           0.84       182
   macro avg       0.89      0.72      0.76       182
weighted avg       0.86      0.84      0.81       182

              preci

In [36]:
from sklearn.metrics import confusion_matrix
print("Confusion matrix for random forest \n",format(confusion_matrix(y_test, y_pred_rn)))
print("Confusion matrix for logistic regression \n",format(confusion_matrix(y_test, y_pred)))
print("Confusion matrix for Gradient Boost \n",format(confusion_matrix(y_test, y_pred_gb)))
print("Confusion matrix for SVM \n",format(confusion_matrix(y_test, y_pred_svm)))

Confusion matrix for random forest 
 [[ 28  25]
 [  3 126]]
Confusion matrix for logistic regression 
 [[ 36  17]
 [  2 127]]
Confusion matrix for Gradient Boost 
 [[ 24  29]
 [  1 128]]
Confusion matrix for SVM 
 [[ 38  15]
 [  9 120]]


In [37]:
#Adding new features to increase accuracy
word_counts
common_names = set(person_persuasion + person_alice)

for i in common_names:
    word_counts[i] = 0

In [41]:
word_counts.head()

Unnamed: 0,odd,privilege,gentleness,enjoyment,finish,grove,ago,inch,knock,betime,...,Lady Elliot's,St Domingo,Mrs Charles Musgrove,Mrs Croft,Anne Elliot,Charles II,Down,Frederick Wentworth,Mrs Clay,John Shepherd
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
for i, sentence in enumerate(word_counts['text_sentence']):
    words = []
    words = [token.text for token in sentence if token.text in common_names]
    for word in words:
        word_counts.loc[i, word] += 1

[]
['Down', 'Rabbit', 'Alice', 'Alice']
['Rabbit']
['Alice', 'Rabbit']
[]
[]
[]
['Rabbit']
[]
['Alice']
['Alice']
['Alice']
[]
[]
[]
[]
['Alice']
[]
[]
[]
['Down']
[]
[]
[]
[]
['Alice']
['Latitude', 'Longitude']
[]
['Alice', 'Latitude', 'Longitude']
[]
[]
[]
[]
[]
[]
["Ma'am"]
[]
[]
[]
[]
['Down']
['Alice']
[]
['Dinah']
[]
['Dinah']
[]
[]
[]
['Alice']
[]
[]
['Dinah', 'Dinah']
[]
[]
[]
['Alice', 'Rabbit']
['Alice']
['Rabbit']
[]
['Alice']
['Alice']
[]
[]
[]
['Alice']
['Alice']
[]
[]
['Alice']
['Alice']
['Alice']
[]
['Alice']
[]
['Alice']
[]
[]
[]
['Alice']
[]
[]
['Alice']
[]
['Alice']
[]
['Alice']
[]
[]
['Alice']
[]
[]
['Alice']
[]
[]
[]
['Alice']
[]
[]
[]
[]
[]
[]
['Alice']
[]
[]
[]
[]
['ALICE', 'FOOT']
['ALICE']
[]
[]
['Alice']
[]
['Alice']
[]
[]
[]
[]
['Rabbit']
[]
[]
['Alice', 'Rabbit']
['Rabbit']
['Alice']
[]
[]
[]
[]
[]
[]
[]
['Walter', 'Elliot', 'Baronetage']
[]
[]
['Walter', 'Elliot', 'Elizabeth', 'Elizabeth']
['Anne', 'Mary']
[]
['Walter', 'Mary', 'Charles', 'Charles', 'Somerse

In [50]:
word_counts['Dinah']
word_counts.query('Dinah==1')

Unnamed: 0,odd,privilege,gentleness,enjoyment,finish,grove,ago,inch,knock,betime,...,Lady Elliot's,St Domingo,Mrs Charles Musgrove,Mrs Croft,Anne Elliot,Charles II,Down,Frederick Wentworth,Mrs Clay,John Shepherd
43,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
Y_updated = word_counts['text_source']
X_updated = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train_updated, X_test_updated, y_train_updated, y_test_updated = train_test_split(X_updated, 
                                                    Y_updated,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train_updated, y_train_updated)
y_pred_rn_updated = rfc.predict(X_test_updated)
print('Training set score:', rfc.score(X_train_updated, y_train_updated))
print('\nTest set score:', rfc.score(X_test_updated, y_test_updated))

Training set score: 0.9816176470588235

Test set score: 0.8571428571428571


In [57]:
train = lr.fit(X_train_updated, y_train_updated)
print(X_train_updated.shape, y_train_updated.shape)
y_pred_updated = lr.predict(X_test_updated)
print('Training set score:', lr.score(X_train_updated, y_train_updated))
print('\nTest set score:', lr.score(X_test_updated, y_test_updated))



(272, 1661) (272,)
Training set score: 0.9742647058823529

Test set score: 0.8956043956043956


In [61]:
train = clf.fit(X_train_updated, y_train_updated)
y_pred_gb_updated = clf.predict(X_test_updated)
print('Training set score:', clf.score(X_train_updated, y_train_updated))
print('\nTest set score:', clf.score(X_test_updated, y_test_updated))

Training set score: 0.9705882352941176

Test set score: 0.8076923076923077


In [63]:
train = svm.fit(X_train_updated, y_train_updated)
y_pred_svm_updated = svm.predict(X_test_updated)
print('Training set score:', svm.score(X_train_updated, y_train_updated))
print('\nTest set score:', svm.score(X_test_updated, y_test_updated))

Training set score: 0.9889705882352942

Test set score: 0.8736263736263736


In [64]:
from sklearn.metrics import confusion_matrix
print("Confusion matrix for random forest \n",format(confusion_matrix(y_test_updated, y_pred_rn_updated)))
print("Confusion matrix for logistic regression \n",format(confusion_matrix(y_test_updated, y_pred_updated)))
print("Confusion matrix for Gradient Boost \n",format(confusion_matrix(y_test_updated, y_pred_gb_updated)))
print("Confusion matrix for SVM \n",format(confusion_matrix(y_test_updated, y_pred_svm_updated)))

Confusion matrix for random forest 
 [[ 32  21]
 [  5 124]]
Confusion matrix for logistic regression 
 [[ 36  17]
 [  2 127]]
Confusion matrix for Gradient Boost 
 [[ 23  30]
 [  5 124]]
Confusion matrix for SVM 
 [[ 39  14]
 [  9 120]]
