# NLP Analysis: Bag of Words vs TF-IDF

## Data Preprocessing

In [1]:
%%time

import math
import nltk
import spacy
import re
import warnings

from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from matplotlib.mlab import PCA as mlabPCA
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import neighbors
from sklearn.utils import resample
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier

from datetime import datetime
from dateutil.parser import parse
from nltk.stem.porter import PorterStemmer
from nltk.corpus import gutenberg, stopwords



# Display preferences.
%matplotlib inline
# Render floats in DataFrame output with three decimal places.
pd.options.display.float_format = '{:.3f}'.format

# Silence warnings (not errors) that are raised from sklearn modules.
warnings.filterwarnings(
    action="ignore",
    module="sklearn"  
    )

# Use seaborn's plain white plot background.
sns.set_style('white')

# Fetch the NLTK Gutenberg corpus that the texts below are read from.
nltk.download('gutenberg')
!python -m spacy download en


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
CPU times: user 1.12 s, sys: 232 ms, total: 1.36 s
Wall time: 3.86 s


In [2]:
%%time

# Grab and process the raw data.
# List every text bundled with the corpus so the file ids used below can be checked.
print(gutenberg.fileids())

# Raw text of the three works compared in this study, one per author.
brown, stories, parents = (
    gutenberg.raw(file_id)
    for file_id in (
        'chesterton-brown.txt',
        'bryant-stories.txt',
        'edgeworth-parents.txt',
    )
)

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
CPU times: user 5.35 ms, sys: 1.87 ms, total: 7.22 ms
Wall time: 11.2 ms


In [3]:
%%time

## Define Reusable Functions

## Text Cleaning Function
def text_cleaner(text):
    """Strip corpus noise from a raw text and normalize its whitespace.

    Args:
        text: Raw text as a single string.

    Returns:
        The text with every ``--`` replaced by a space, every
        square-bracketed span removed, and all runs of whitespace
        (including newlines) collapsed to single spaces.
    """
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--', ' ', text)
    # Raw string: the old "[\[].*?[\]]" literal relied on invalid escape
    # sequences.  '.' does not match a newline, so only brackets that open
    # and close on the same line are removed (same as before).
    text = re.sub(r'\[.*?\]', '', text)
    return ' '.join(text.split())
  


CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.78 µs


In [4]:
%%time

## Clean text data and reduce to same number of characters
# Every excerpt is one tenth of the length of `stories`; the same length is
# used for all three works so each contributes equally sized slices.  Only
# the first four slices of each work are kept.
chunk_len = len(stories) // 10


def _cleaned_chunk(raw_text, position):
    """Return the cleaned slice number `position` (0-based) of `raw_text`.

    Each slice is `chunk_len` characters long before cleaning.
    """
    start = position * chunk_len
    return text_cleaner(raw_text[start:start + chunk_len])


brown_1, brown_2, brown_3, brown_4 = (_cleaned_chunk(brown, i) for i in range(4))
parents_1, parents_2, parents_3, parents_4 = (_cleaned_chunk(parents, i) for i in range(4))
stories_1, stories_2, stories_3, stories_4 = (_cleaned_chunk(stories, i) for i in range(4))

CPU times: user 5.12 ms, sys: 0 ns, total: 5.12 ms
Wall time: 5.08 ms


In [5]:
%%time

## Parse the cleaned novels
# Load the small English pipeline by its package name.  The 'en' shortcut
# link no longer exists in spaCy 3, whereas 'en_core_web_sm' is the name the
# download step above reports as loadable.
nlp = spacy.load('en_core_web_sm')

# Run the full pipeline over each cleaned excerpt (order: Chesterton,
# Edgeworth, Bryant) so sentence boundaries are available downstream.
brown_1_doc, brown_2_doc, brown_3_doc, brown_4_doc = (
    nlp(excerpt) for excerpt in (brown_1, brown_2, brown_3, brown_4)
)
parents_1_doc, parents_2_doc, parents_3_doc, parents_4_doc = (
    nlp(excerpt) for excerpt in (parents_1, parents_2, parents_3, parents_4)
)
stories_1_doc, stories_2_doc, stories_3_doc, stories_4_doc = (
    nlp(excerpt) for excerpt in (stories_1, stories_2, stories_3, stories_4)
)

CPU times: user 10.2 s, sys: 345 ms, total: 10.6 s
Wall time: 10.6 s


In [6]:
%%time

## Group into sentences
def _tag_sentences(doc, author):
    """Return one ``[sentence, author]`` pair per sentence found in ``doc``."""
    return [[span, author] for span in doc.sents]


# Chesterton excerpts
brown_1_sents = _tag_sentences(brown_1_doc, "chesterton")
brown_2_sents = _tag_sentences(brown_2_doc, "chesterton")
brown_3_sents = _tag_sentences(brown_3_doc, "chesterton")
brown_4_sents = _tag_sentences(brown_4_doc, "chesterton")

# Edgeworth excerpts
parents_1_sents = _tag_sentences(parents_1_doc, "edgeworth")
parents_2_sents = _tag_sentences(parents_2_doc, "edgeworth")
parents_3_sents = _tag_sentences(parents_3_doc, "edgeworth")
parents_4_sents = _tag_sentences(parents_4_doc, "edgeworth")

# Bryant excerpts
stories_1_sents = _tag_sentences(stories_1_doc, "bryant")
stories_2_sents = _tag_sentences(stories_2_doc, "bryant")
stories_3_sents = _tag_sentences(stories_3_doc, "bryant")
stories_4_sents = _tag_sentences(stories_4_doc, "bryant")
 

CPU times: user 7.71 ms, sys: 1.98 ms, total: 9.7 ms
Wall time: 9.84 ms


In [7]:
%%time

## Process Text into Sentence Pair Groups

def _pair_up_sentences(labelled_sents, author):
    """Build a frame with one record per two consecutive sentences.

    Args:
        labelled_sents: List of ``[sentence, author]`` rows.
        author: Label written to column 1 of every resulting record.

    Returns:
        DataFrame indexed by ``grp`` with the concatenated sentence text in
        column 0 and ``author`` in column 1.
    """
    frame = pd.DataFrame(labelled_sents).applymap(str)
    # Trailing space keeps the sentences apart once they are concatenated.
    frame[0] = frame[0] + ' '
    # Consecutive rows 0-1, 2-3, ... share a group id.
    frame['grp'] = frame.index // 2
    # Summing string columns concatenates them within each group.
    frame = frame.groupby('grp').sum()
    # The sum also concatenated the label with itself; restore it.
    frame[1] = author
    return frame


brown_all_sents = _pair_up_sentences(
    brown_1_sents + brown_2_sents + brown_3_sents + brown_4_sents,
    'chesterton')

parents_all_sents = _pair_up_sentences(
    parents_1_sents + parents_2_sents + parents_3_sents + parents_4_sents,
    'edgeworth')

stories_all_sents = _pair_up_sentences(
    stories_1_sents + stories_2_sents + stories_3_sents + stories_4_sents,
    'bryant')

# One combined frame: text in column 0, author label in column 1.
sentences = pd.concat(
    [brown_all_sents,
    parents_all_sents,
    stories_all_sents])

print(sentences.head())

                                                     0           1
grp                                                               
0                          I. The Absence of Mr Glass   chesterton
1    THE consulting-rooms of Dr Orion Hood, the emi...  chesterton
2    It must not be supposed that Dr Hood's apartme...  chesterton
3    Luxury was there: there stood upon a special t...  chesterton
4    Poetry was there: the left-hand corner of the ...  chesterton
CPU times: user 714 ms, sys: 8.18 ms, total: 722 ms
Wall time: 720 ms


Texts from three authors were acquired from NLTK's Gutenberg corpus and stored in a DataFrame as sentences.
The texts were organized into sentences to preserve the contextual information of individual tokens.
Sentences were stored as pairs to allow for more information to be stored in each record.
Each sentence pair was labeled with the corresponding author's name.


## Preparing The Data For Modeling

In [8]:
%%time

## Vectorizing Text Data Using Bag Of Words

porter_stemmer = PorterStemmer()


def stemming_tokenizer(str_input):
    """Split ``str_input`` into lower-cased, Porter-stemmed tokens.

    Every character other than an ASCII letter, digit or hyphen is treated
    as a separator before the text is split on whitespace.
    """
    separated = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    return [porter_stemmer.stem(token) for token in separated.lower().split()]

# NOTE(review): the tokenizer stems before stop words are filtered, so
# 'english' stop words whose stem differs from the word itself may slip
# through -- confirm this is acceptable.
count_vectorizer = CountVectorizer(stop_words='english', tokenizer=stemming_tokenizer, max_features=1000)
X = count_vectorizer.fit_transform(sentences[0])

# Column labels come from the fitted vocabulary (term -> column index),
# ordered by column index.  get_feature_names() was removed in
# scikit-learn 1.2, while vocabulary_ is available in every version.
bow_terms = sorted(count_vectorizer.vocabulary_, key=count_vectorizer.vocabulary_.get)
df_bow = pd.DataFrame(X.toarray(), columns=bow_terms)



CPU times: user 1.35 s, sys: 2.75 ms, total: 1.36 s
Wall time: 1.36 s


In [9]:
%%time

## Vectorizing Text Data Using TfIdf

# Same tokenizer, stop-word list and vocabulary cap as the bag-of-words
# features, but with TF-IDF weighting.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer, max_features=1000, use_idf=True)
X = tfidf_vectorizer.fit_transform(sentences[0])

# Column labels come from the fitted vocabulary (term -> column index),
# ordered by column index.  get_feature_names() was removed in
# scikit-learn 1.2, while vocabulary_ is available in every version.
tfidf_terms = sorted(tfidf_vectorizer.vocabulary_, key=tfidf_vectorizer.vocabulary_.get)
df_tfidf = pd.DataFrame(X.toarray(), columns=tfidf_terms)

CPU times: user 1.36 s, sys: 3.25 ms, total: 1.36 s
Wall time: 1.36 s


The sentence pairs were vectorized with both bag of words and TF-IDF prior to modeling, creating two feature sets. Several modeling techniques were applied to each feature set and the results were compared.


## Modeling the Data using Bag of Words



In [10]:
%%time

## Establish variables based on original features to be used for modeling

# Features are the bag-of-words counts; the target is the author label.
x, y = df_bow, sentences[1]

# Hold out 20% of the records for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=20,
)

CPU times: user 12.9 ms, sys: 992 µs, total: 13.8 ms
Wall time: 13.4 ms


### Naive Bayes

In [11]:
%%time

## train and fit model

# Bernoulli naive Bayes with default settings, trained on the training split.
bnb = BernoulliNB().fit(
    X=x_train,
    y=y_train,
)

CPU times: user 39.2 ms, sys: 5.92 ms, total: 45.2 ms
Wall time: 39 ms


In [12]:
%%time

## Model Evaluation

# Accuracy of the fitted model on the held-out split.
test_accuracy = bnb.score(x_test, y_test)
print("accuracy score:\n" + str(test_accuracy) + '\n')

# NOTE(review): this re-fits clones of the model on folds of the held-out
# split only; consider cross-validating on the training data instead.
cv_scores = cross_val_score(bnb, x_test, y_test, cv=5)
print("cross validation:\n" + str(cv_scores) + '\n')

# Predict once and reuse the result for both reports.
y_pred = bnb.predict(x_test)
print("confusion matrix:\n" + str(confusion_matrix(y_test, y_pred)) + '\n')

print(classification_report(y_test, y_pred))

accuracy score:
0.8637532133676092

cross validation:
[0.74683544 0.71794872 0.75641026 0.72727273 0.81818182]

confusion matrix:
[[128  10   8]
 [  8 119   3]
 [ 11  13  89]]

              precision    recall  f1-score   support

      bryant       0.87      0.88      0.87       146
  chesterton       0.84      0.92      0.88       130
   edgeworth       0.89      0.79      0.84       113

    accuracy                           0.86       389
   macro avg       0.87      0.86      0.86       389
weighted avg       0.87      0.86      0.86       389

CPU times: user 159 ms, sys: 83.1 ms, total: 242 ms
Wall time: 135 ms


### K Nearest Neighbors 

In [13]:
%%time

## train and fit model

# 10-nearest-neighbours classifier trained on the training split.
knn = neighbors.KNeighborsClassifier(n_neighbors=10).fit(
    X=x_train,
    y=y_train,
)

CPU times: user 90.5 ms, sys: 50.1 ms, total: 141 ms
Wall time: 81.9 ms


In [14]:
%%time

## Model Evaluation

# Accuracy of the fitted model on the held-out split.
test_accuracy = knn.score(x_test, y_test)
print("accuracy score:\n" + str(test_accuracy) + '\n')

# NOTE(review): this re-fits clones of the model on folds of the held-out
# split only; consider cross-validating on the training data instead.
cv_scores = cross_val_score(knn, x_test, y_test, cv=5)
print("cross validation:\n" + str(cv_scores) + '\n')

# Predict once and reuse the result for both reports.
y_pred = knn.predict(x_test)
print("confusion matrix:\n" + str(confusion_matrix(y_test, y_pred)) + '\n')

print(classification_report(y_test, y_pred))

accuracy score:
0.5372750642673522

cross validation:
[0.43037975 0.5        0.48717949 0.44155844 0.37662338]

confusion matrix:
[[103  13  30]
 [ 44  54  32]
 [ 44  17  52]]

              precision    recall  f1-score   support

      bryant       0.54      0.71      0.61       146
  chesterton       0.64      0.42      0.50       130
   edgeworth       0.46      0.46      0.46       113

    accuracy                           0.54       389
   macro avg       0.55      0.53      0.52       389
weighted avg       0.55      0.54      0.53       389

CPU times: user 5.06 s, sys: 9.11 ms, total: 5.06 s
Wall time: 5.07 s


### Decision Tree

In [15]:
%%time

## train and fit model

# Entropy-based tree that considers 6 randomly chosen features per split and
# grows at most 25 levels deep.  The feature sampling is random, so the seed
# is fixed (matching the train/test split) to make the fit reproducible.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=6,
    max_depth=25,
    random_state=20,
    ).fit(x_train, y_train)

CPU times: user 18 ms, sys: 756 µs, total: 18.8 ms
Wall time: 19.3 ms


In [16]:
%%time

## Model Evaluation

# Accuracy of the fitted model on the held-out split.
test_accuracy = decision_tree.score(x_test, y_test)
print("accuracy score:\n" + str(test_accuracy) + '\n')

# NOTE(review): this re-fits clones of the model on folds of the held-out
# split only; consider cross-validating on the training data instead.
cv_scores = cross_val_score(decision_tree, x_test, y_test, cv=5)
print("cross validation:\n" + str(cv_scores) + '\n')

# Predict once and reuse the result for both reports.
y_pred = decision_tree.predict(x_test)
print("confusion matrix:\n" + str(confusion_matrix(y_test, y_pred)) + '\n')

print(classification_report(y_test, y_pred))

accuracy score:
0.4781491002570694

cross validation:
[0.40506329 0.43589744 0.38461538 0.48051948 0.41558442]

confusion matrix:
[[136   3   7]
 [ 89  37   4]
 [ 94   6  13]]

              precision    recall  f1-score   support

      bryant       0.43      0.93      0.58       146
  chesterton       0.80      0.28      0.42       130
   edgeworth       0.54      0.12      0.19       113

    accuracy                           0.48       389
   macro avg       0.59      0.44      0.40       389
weighted avg       0.59      0.48      0.42       389

CPU times: user 63.7 ms, sys: 3.01 ms, total: 66.7 ms
Wall time: 67.1 ms


### Random Forest

In [17]:
%%time

## Fit and Train Model

# Entropy-based forest; each split considers 15 randomly chosen features and
# trees may grow up to 100 levels deep.  Bootstrapping and feature sampling
# are random, so the seed is fixed (matching the train/test split) to make
# the fit reproducible.
rfc = ensemble.RandomForestClassifier(
    criterion='entropy',
    max_features=15,
    max_depth=100,
    random_state=20,
    ).fit(x_train, y_train)

CPU times: user 91.2 ms, sys: 1 ms, total: 92.2 ms
Wall time: 93.7 ms


In [18]:
%%time

## Model Evaluation

# Accuracy of the fitted model on the held-out split.
test_accuracy = rfc.score(x_test, y_test)
print("accuracy score:\n" + str(test_accuracy) + '\n')

# NOTE(review): this re-fits clones of the model on folds of the held-out
# split only; consider cross-validating on the training data instead.
cv_scores = cross_val_score(rfc, x_test, y_test, cv=5)
print("cross validation:\n" + str(cv_scores) + '\n')

# Predict once and reuse the result for both reports.
y_pred = rfc.predict(x_test)
print("confusion matrix:\n" + str(confusion_matrix(y_test, y_pred)) + '\n')

print(classification_report(y_test, y_pred))

accuracy score:
0.7609254498714653

cross validation:
[0.58227848 0.58974359 0.64102564 0.67532468 0.66233766]

confusion matrix:
[[125   8  13]
 [ 26  96   8]
 [ 28  10  75]]

              precision    recall  f1-score   support

      bryant       0.70      0.86      0.77       146
  chesterton       0.84      0.74      0.79       130
   edgeworth       0.78      0.66      0.72       113

    accuracy                           0.76       389
   macro avg       0.77      0.75      0.76       389
weighted avg       0.77      0.76      0.76       389

CPU times: user 193 ms, sys: 5.08 ms, total: 198 ms
Wall time: 197 ms


### Logistic Regression 

In [19]:
%%time

## train and fit model

# Logistic regression without an intercept term, trained on the training split.
lr = LogisticRegression(fit_intercept=False).fit(
    X=x_train,
    y=y_train,
)

CPU times: user 40.2 ms, sys: 1.96 ms, total: 42.2 ms
Wall time: 43.2 ms


In [20]:
%%time

## Model Evaluation

# Accuracy of the fitted model on the held-out split.
test_accuracy = lr.score(x_test, y_test)
print("accuracy score:\n" + str(test_accuracy) + '\n')

# NOTE(review): this re-fits clones of the model on folds of the held-out
# split only; consider cross-validating on the training data instead.
cv_scores = cross_val_score(lr, x_test, y_test, cv=5)
print("cross validation:\n" + str(cv_scores) + '\n')

# Predict once and reuse the result for both reports.
y_pred = lr.predict(x_test)
print("confusion matrix:\n" + str(confusion_matrix(y_test, y_pred)) + '\n')

print(classification_report(y_test, y_pred))

accuracy score:
0.8508997429305912

cross validation:
[0.7721519  0.73076923 0.70512821 0.72727273 0.75324675]

confusion matrix:
[[131   6   9]
 [ 12 113   5]
 [ 16  10  87]]

              precision    recall  f1-score   support

      bryant       0.82      0.90      0.86       146
  chesterton       0.88      0.87      0.87       130
   edgeworth       0.86      0.77      0.81       113

    accuracy                           0.85       389
   macro avg       0.85      0.85      0.85       389
weighted avg       0.85      0.85      0.85       389

CPU times: user 132 ms, sys: 84.2 ms, total: 216 ms
Wall time: 122 ms


### Neural Network

In [21]:
%%time

## train and fit model

# Single hidden layer of 100 units.  Weight initialisation and batch
# shuffling are random, so the seed is fixed (matching the train/test split)
# to make the fit reproducible.
mlp = MLPClassifier(hidden_layer_sizes=(100,), random_state=20).fit(x_train, y_train)

CPU times: user 19.8 s, sys: 9.47 s, total: 29.3 s
Wall time: 14.8 s


In [22]:
%%time

## Model Evaluation

# Accuracy of the fitted model on the held-out split.
test_accuracy = mlp.score(x_test, y_test)
print("accuracy score:\n" + str(test_accuracy) + '\n')

# NOTE(review): this re-fits clones of the model on folds of the held-out
# split only; consider cross-validating on the training data instead.
cv_scores = cross_val_score(mlp, x_test, y_test, cv=5)
print("cross validation:\n" + str(cv_scores) + '\n')

# Predict once and reuse the result for both reports.
y_pred = mlp.predict(x_test)
print("confusion matrix:\n" + str(confusion_matrix(y_test, y_pred)) + '\n')

print(classification_report(y_test, y_pred))

accuracy score:
0.8251928020565553

cross validation:
[0.74683544 0.76923077 0.71794872 0.72727273 0.79220779]

confusion matrix:
[[123   8  15]
 [  9 113   8]
 [ 17  11  85]]

              precision    recall  f1-score   support

      bryant       0.83      0.84      0.83       146
  chesterton       0.86      0.87      0.86       130
   edgeworth       0.79      0.75      0.77       113

    accuracy                           0.83       389
   macro avg       0.82      0.82      0.82       389
weighted avg       0.82      0.83      0.82       389

CPU times: user 24 s, sys: 10.9 s, total: 34.9 s
Wall time: 17.6 s


## Modeling the Data using TF-IDF



In [23]:
%%time

## Establish variables based on original features to be used for modeling

# Features are the TF-IDF weights; the target is the author label.
x, y = df_tfidf, sentences[1]

# Same 80/20 split and seed as for the bag-of-words features.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=20,
)

CPU times: user 15.3 ms, sys: 6.45 ms, total: 21.8 ms
Wall time: 16.8 ms


### Naive Bayes

In [24]:
%%time

## train and fit model

# NOTE(review): BernoulliNB binarizes its input (binarize=0.0 by default), so
# the TF-IDF weights are reduced to term presence/absence -- consistent with
# this model's scores matching the bag-of-words run exactly.
bnb = BernoulliNB().fit(
    X=x_train,
    y=y_train,
)

CPU times: user 39.3 ms, sys: 21.8 ms, total: 61.1 ms
Wall time: 35.3 ms


In [25]:
%%time

## Model Evaluation

# Accuracy of the fitted model on the held-out split.
test_accuracy = bnb.score(x_test, y_test)
print("accuracy score:\n" + str(test_accuracy) + '\n')

# NOTE(review): this re-fits clones of the model on folds of the held-out
# split only; consider cross-validating on the training data instead.
cv_scores = cross_val_score(bnb, x_test, y_test, cv=5)
print("cross validation:\n" + str(cv_scores) + '\n')

# Predict once and reuse the result for both reports.
y_pred = bnb.predict(x_test)
print("confusion matrix:\n" + str(confusion_matrix(y_test, y_pred)) + '\n')

print(classification_report(y_test, y_pred))

accuracy score:
0.8637532133676092

cross validation:
[0.74683544 0.71794872 0.75641026 0.72727273 0.81818182]

confusion matrix:
[[128  10   8]
 [  8 119   3]
 [ 11  13  89]]

              precision    recall  f1-score   support

      bryant       0.87      0.88      0.87       146
  chesterton       0.84      0.92      0.88       130
   edgeworth       0.89      0.79      0.84       113

    accuracy                           0.86       389
   macro avg       0.87      0.86      0.86       389
weighted avg       0.87      0.86      0.86       389

CPU times: user 143 ms, sys: 86 ms, total: 229 ms
Wall time: 128 ms


### K Nearest Neighbors 

In [26]:
%%time

## train and fit model

# 10-nearest-neighbours classifier trained on the TF-IDF training split.
knn = neighbors.KNeighborsClassifier(n_neighbors=10).fit(
    X=x_train,
    y=y_train,
)

CPU times: user 94.4 ms, sys: 52.6 ms, total: 147 ms
Wall time: 81.8 ms


In [27]:
%%time

## Model Evaluation

# Accuracy of the fitted model on the held-out split.
test_accuracy = knn.score(x_test, y_test)
print("accuracy score:\n" + str(test_accuracy) + '\n')

# NOTE(review): this re-fits clones of the model on folds of the held-out
# split only; consider cross-validating on the training data instead.
cv_scores = cross_val_score(knn, x_test, y_test, cv=5)
print("cross validation:\n" + str(cv_scores) + '\n')

# Predict once and reuse the result for both reports.
y_pred = knn.predict(x_test)
print("confusion matrix:\n" + str(confusion_matrix(y_test, y_pred)) + '\n')

print(classification_report(y_test, y_pred))

accuracy score:
0.442159383033419

cross validation:
[0.73417722 0.70512821 0.55128205 0.37662338 0.64935065]

confusion matrix:
[[40 33 73]
 [ 1 57 72]
 [ 1 37 75]]

              precision    recall  f1-score   support

      bryant       0.95      0.27      0.43       146
  chesterton       0.45      0.44      0.44       130
   edgeworth       0.34      0.66      0.45       113

    accuracy                           0.44       389
   macro avg       0.58      0.46      0.44       389
weighted avg       0.61      0.44      0.44       389

CPU times: user 4.98 s, sys: 6.94 ms, total: 4.99 s
Wall time: 4.99 s


### Decision Tree

In [28]:
%%time

## train and fit model

# Entropy-based tree that considers 6 randomly chosen features per split and
# grows at most 25 levels deep.  The feature sampling is random, so the seed
# is fixed (matching the train/test split) to make the fit reproducible.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=6,
    max_depth=25,
    random_state=20,
    ).fit(x_train, y_train)

CPU times: user 17.6 ms, sys: 451 µs, total: 18 ms
Wall time: 20.8 ms


In [29]:
%%time

## Model Evaluation

# Accuracy of the fitted model on the held-out split.
test_accuracy = decision_tree.score(x_test, y_test)
print("accuracy score:\n" + str(test_accuracy) + '\n')

# NOTE(review): this re-fits clones of the model on folds of the held-out
# split only; consider cross-validating on the training data instead.
cv_scores = cross_val_score(decision_tree, x_test, y_test, cv=5)
print("cross validation:\n" + str(cv_scores) + '\n')

# Predict once and reuse the result for both reports.
y_pred = decision_tree.predict(x_test)
print("confusion matrix:\n" + str(confusion_matrix(y_test, y_pred)) + '\n')

print(classification_report(y_test, y_pred))

accuracy score:
0.442159383033419

cross validation:
[0.39240506 0.44871795 0.41025641 0.54545455 0.4025974 ]

confusion matrix:
[[132   4  10]
 [107  21   2]
 [ 94   0  19]]

              precision    recall  f1-score   support

      bryant       0.40      0.90      0.55       146
  chesterton       0.84      0.16      0.27       130
   edgeworth       0.61      0.17      0.26       113

    accuracy                           0.44       389
   macro avg       0.62      0.41      0.36       389
weighted avg       0.61      0.44      0.37       389

CPU times: user 63 ms, sys: 2.52 ms, total: 65.5 ms
Wall time: 68.9 ms


### Random Forest

In [30]:
%%time

## Fit and Train Model

# Entropy-based forest; each split considers 15 randomly chosen features and
# trees may grow up to 100 levels deep.  Bootstrapping and feature sampling
# are random, so the seed is fixed (matching the train/test split) to make
# the fit reproducible.
rfc = ensemble.RandomForestClassifier(
    criterion='entropy',
    max_features=15,
    max_depth=100,
    random_state=20,
    ).fit(x_train, y_train)

CPU times: user 102 ms, sys: 1.04 ms, total: 103 ms
Wall time: 103 ms


In [31]:
%%time

## Model Evaluation

# Accuracy of the fitted model on the held-out split.
test_accuracy = rfc.score(x_test, y_test)
print("accuracy score:\n" + str(test_accuracy) + '\n')

# NOTE(review): this re-fits clones of the model on folds of the held-out
# split only; consider cross-validating on the training data instead.
cv_scores = cross_val_score(rfc, x_test, y_test, cv=5)
print("cross validation:\n" + str(cv_scores) + '\n')

# Predict once and reuse the result for both reports.
y_pred = rfc.predict(x_test)
print("confusion matrix:\n" + str(confusion_matrix(y_test, y_pred)) + '\n')

print(classification_report(y_test, y_pred))

accuracy score:
0.8097686375321337

cross validation:
[0.60759494 0.57692308 0.62820513 0.54545455 0.71428571]

confusion matrix:
[[131   7   8]
 [ 20 106   4]
 [ 22  13  78]]

              precision    recall  f1-score   support

      bryant       0.76      0.90      0.82       146
  chesterton       0.84      0.82      0.83       130
   edgeworth       0.87      0.69      0.77       113

    accuracy                           0.81       389
   macro avg       0.82      0.80      0.81       389
weighted avg       0.82      0.81      0.81       389

CPU times: user 181 ms, sys: 2.38 ms, total: 184 ms
Wall time: 181 ms


### Logistic Regression 

In [32]:
%%time

## train and fit model

# Logistic regression without an intercept term, trained on the TF-IDF
# training split.
lr = LogisticRegression(fit_intercept=False).fit(
    X=x_train,
    y=y_train,
)

CPU times: user 25.7 ms, sys: 0 ns, total: 25.7 ms
Wall time: 24.9 ms


In [33]:
%%time

## Model Evaluation

# Accuracy of the fitted model on the held-out split.
test_accuracy = lr.score(x_test, y_test)
print("accuracy score:\n" + str(test_accuracy) + '\n')

# NOTE(review): this re-fits clones of the model on folds of the held-out
# split only; consider cross-validating on the training data instead.
cv_scores = cross_val_score(lr, x_test, y_test, cv=5)
print("cross validation:\n" + str(cv_scores) + '\n')

# Predict once and reuse the result for both reports.
y_pred = lr.predict(x_test)
print("confusion matrix:\n" + str(confusion_matrix(y_test, y_pred)) + '\n')

print(classification_report(y_test, y_pred))

accuracy score:
0.8586118251928021

cross validation:
[0.75949367 0.73076923 0.73076923 0.68831169 0.77922078]

confusion matrix:
[[132   6   8]
 [ 12 113   5]
 [ 17   7  89]]

              precision    recall  f1-score   support

      bryant       0.82      0.90      0.86       146
  chesterton       0.90      0.87      0.88       130
   edgeworth       0.87      0.79      0.83       113

    accuracy                           0.86       389
   macro avg       0.86      0.85      0.86       389
weighted avg       0.86      0.86      0.86       389

CPU times: user 112 ms, sys: 56.4 ms, total: 169 ms
Wall time: 106 ms


### Neural Network

In [34]:
%%time

## train and fit model

# Single hidden layer of 100 units.  Weight initialisation and batch
# shuffling are random, so the seed is fixed (matching the train/test split)
# to make the fit reproducible.
mlp = MLPClassifier(hidden_layer_sizes=(100,), random_state=20).fit(x_train, y_train)

CPU times: user 20.2 s, sys: 8.18 s, total: 28.4 s
Wall time: 14.3 s


In [35]:
%%time

## Model Evaluation

# Accuracy of the fitted model on the held-out split.
test_accuracy = mlp.score(x_test, y_test)
print("accuracy score:\n" + str(test_accuracy) + '\n')

# NOTE(review): this re-fits clones of the model on folds of the held-out
# split only; consider cross-validating on the training data instead.
cv_scores = cross_val_score(mlp, x_test, y_test, cv=5)
print("cross validation:\n" + str(cv_scores) + '\n')

# Predict once and reuse the result for both reports.
y_pred = mlp.predict(x_test)
print("confusion matrix:\n" + str(confusion_matrix(y_test, y_pred)) + '\n')

print(classification_report(y_test, y_pred))

accuracy score:
0.8251928020565553

cross validation:
[0.78481013 0.80769231 0.73076923 0.75324675 0.83116883]

confusion matrix:
[[125   6  15]
 [  9 109  12]
 [ 17   9  87]]

              precision    recall  f1-score   support

      bryant       0.83      0.86      0.84       146
  chesterton       0.88      0.84      0.86       130
   edgeworth       0.76      0.77      0.77       113

    accuracy                           0.83       389
   macro avg       0.82      0.82      0.82       389
weighted avg       0.83      0.83      0.83       389

CPU times: user 21.9 s, sys: 9.19 s, total: 31.1 s
Wall time: 15.7 s


## Revising TF-IDF Based Logistic Regression Model

In [36]:
%%time

## Process Text into Groups of Four

def _merge_row_pairs(pair_frame, author):
    """Concatenate every two consecutive records of ``pair_frame`` into one.

    The input frame is left unmodified, so this cell can be re-run without
    regrouping data that has already been regrouped.

    Args:
        pair_frame: Frame with text in column 0 and the label in column 1.
        author: Label written to column 1 of every resulting record.

    Returns:
        DataFrame indexed by ``grp2`` with the concatenated text in column 0
        and ``author`` in column 1.
    """
    merged = (
        pair_frame
        .assign(grp2=pair_frame.index // 2)
        .groupby('grp2')
        .sum()
    )
    # The sum also concatenated the label with itself; restore it.
    merged[1] = author
    return merged


# Each input record already holds two sentences, so merging consecutive
# records yields four sentences per record.
brown_quad_sents = _merge_row_pairs(brown_all_sents, 'chesterton')
parents_quad_sents = _merge_row_pairs(parents_all_sents, 'edgeworth')
stories_quad_sents = _merge_row_pairs(stories_all_sents, 'bryant')

sentences = pd.concat(
    [brown_quad_sents,
    parents_quad_sents,
    stories_quad_sents])

print(sentences.head())

                                                      0           1
grp2                                                               
0     I. The Absence of Mr Glass THE consulting-room...  chesterton
1     It must not be supposed that Dr Hood's apartme...  chesterton
2     Poetry was there: the left-hand corner of the ...  chesterton
3     And if this strict scientific intangibility st...  chesterton
4     Fate, being in a funny mood, pushed the door o...  chesterton
CPU times: user 344 ms, sys: 66.7 ms, total: 411 ms
Wall time: 329 ms


In [39]:
%%time

## Vectorizing Text Data Using TfIdf

# Re-fit the TF-IDF features on the regrouped records, with the same
# tokenizer, stop-word list and vocabulary cap as before.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer, max_features=1000, use_idf=True)
X = tfidf_vectorizer.fit_transform(sentences[0])

# Column labels come from the fitted vocabulary (term -> column index),
# ordered by column index.  get_feature_names() was removed in
# scikit-learn 1.2, while vocabulary_ is available in every version.
tfidf_terms = sorted(tfidf_vectorizer.vocabulary_, key=tfidf_vectorizer.vocabulary_.get)
df_tfidf = pd.DataFrame(X.toarray(), columns=tfidf_terms)

CPU times: user 1.35 s, sys: 2.08 ms, total: 1.35 s
Wall time: 1.36 s


In [40]:
%%time

## Establish variables based on original features to be used for modeling

# Features are the re-fitted TF-IDF weights; the target is the author label.
x, y = df_tfidf, sentences[1]

# Same 80/20 split and seed as in the earlier experiments.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=20,
)

CPU times: user 8.02 ms, sys: 1.03 ms, total: 9.05 ms
Wall time: 8.06 ms


In [47]:
%%time

## train and fit model

# Logistic regression without an intercept term, trained on the regrouped
# TF-IDF training split.
lr = LogisticRegression(fit_intercept=False).fit(
    X=x_train,
    y=y_train,
)

CPU times: user 15.7 ms, sys: 771 µs, total: 16.5 ms
Wall time: 24.5 ms


In [48]:
%%time

## Model Evaluation

# Accuracy of the fitted model on the held-out split.
test_accuracy = lr.score(x_test, y_test)
print("accuracy score:\n" + str(test_accuracy) + '\n')

# NOTE(review): this re-fits clones of the model on folds of the held-out
# split only; consider cross-validating on the training data instead.
cv_scores = cross_val_score(lr, x_test, y_test, cv=5)
print("cross validation:\n" + str(cv_scores) + '\n')

# Predict once and reuse the result for both reports.
y_pred = lr.predict(x_test)
print("confusion matrix:\n" + str(confusion_matrix(y_test, y_pred)) + '\n')

print(classification_report(y_test, y_pred))

accuracy score:
0.9487179487179487

cross validation:
[0.825      0.82051282 0.82051282 0.87179487 0.84210526]

confusion matrix:
[[63  1  1]
 [ 3 56  0]
 [ 3  2 66]]

              precision    recall  f1-score   support

      bryant       0.91      0.97      0.94        65
  chesterton       0.95      0.95      0.95        59
   edgeworth       0.99      0.93      0.96        71

    accuracy                           0.95       195
   macro avg       0.95      0.95      0.95       195
weighted avg       0.95      0.95      0.95       195

CPU times: user 76.6 ms, sys: 55.3 ms, total: 132 ms
Wall time: 79.6 ms


The number of sentences per record was increased to four to improve the accuracy of one of the best-performing models in this study: the Logistic Regression model with TF-IDF features. Test accuracy rose from about 0.86 to about 0.95, an increase of roughly 9 percentage points.

## Conclusion

By comparing different model types and types of NLP methods, it was ascertained that TF-IDF paired with Logistic regression yielded the best results.
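In the recorded run, TF-IDF gave higher test accuracy than bag of words for the random forest and logistic regression models, matched it for naive Bayes and the neural network, and was lower for K nearest neighbors and the decision tree.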
This study has given added insight into NLP practices that yield higher accuracy and give more information about analyzed texts.
