# Complement Naive Bayes and Prediction
With the TF-IDF data ready, we can move into the model making and prediction. In the final version, this will be a part of the pipeline and will run training and test sets separately.

In [1]:
random_seed = 42

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import clear_output
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import ComplementNB
from sklearn import metrics

import pickle

In [2]:
# get pickled TF-IDF data and vectorizer
path_to_file = "../Data/pickles/TFIDF.dat"
with open (path_to_file, "rb") as f:
    tfidf_data, tfidf_vectorizer = pickle.load(f)

In [3]:
# get dataframe with summaries and genres
path_to_data = "../Data/cleaned_summaries_and_genres.csv"
df = pd.read_csv(path_to_data, index_col=0)
df

Unnamed: 0,summary,genre
0,old major old boar manor farm call animal farm...,Children's literature
1,old major old boar manor farm call animal farm...,Speculative fiction
2,old major old boar manor farm call animal farm...,Fiction
3,alex teenager live nearfuture england lead gan...,Science Fiction
4,alex teenager live nearfuture england lead gan...,Speculative fiction
...,...,...
26536,series follow character nick stone exmilitary ...,Fiction
26537,series follow character nick stone exmilitary ...,Suspense
26538,reader first meet rapp covert operation iran d...,Thriller
26539,reader first meet rapp covert operation iran d...,Fiction


In [4]:
# encode genres into labels and make X and y
le = LabelEncoder()

X = tfidf_data

y_names = df['genre']
y = le.fit_transform(y_names)
y

array([ 2, 15,  6, ..., 18,  6, 15])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
print("Array Shapes\nX_train: {}  y_train: {}\nX_test:  {}   y_test:  {}"\
      .format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))

Array Shapes
X_train: (18578, 116360)  y_train: (18578,)
X_test:  (7963, 116360)   y_test:  (7963,)


## Complement Naive Bayes
The complement naive bayes analyzer has historically good performance with textual analysis.

In [6]:
clf = ComplementNB()
clf.fit(X_train, y_train)

ComplementNB()

In [7]:
train_preds = clf.predict(X_train)
print(metrics.classification_report(y_train, train_preds, target_names=le.classes_))

                        precision    recall  f1-score   support

       Adventure novel       0.86      0.23      0.37       236
     Alternate history       0.64      0.18      0.28       148
 Children's literature       0.65      0.39      0.48      1456
         Crime Fiction       0.68      0.39      0.49       517
     Detective fiction       0.77      0.26      0.39       238
               Fantasy       0.51      0.53      0.52      1723
               Fiction       0.36      0.72      0.48      3344
    Historical fiction       0.79      0.15      0.26       275
      Historical novel       0.73      0.48      0.58       450
                Horror       0.59      0.15      0.25       369
               Mystery       0.52      0.35      0.42       960
           Non-fiction       0.99      0.62      0.76       157
                 Novel       0.81      0.49      0.61      1712
         Romance novel       0.71      0.14      0.23       300
       Science Fiction       0.55      

The accuracy within the training set isn't bad. There are some obvious issues, but the numbers are almost all above the 50% success mark. How about with the test data, though?

In [8]:
preds = clf.predict(X_test)
print("Training score: {}".format(clf.score(X_train, y_train)))
print("Testing score:  {}".format(clf.score(X_test, y_test)))

Training score: 0.4894498869630746
Testing score:  0.13562727615220393


In [9]:
print(metrics.classification_report(y_test, preds, target_names=le.classes_))

                        precision    recall  f1-score   support

       Adventure novel       0.07      0.02      0.03        94
     Alternate history       0.06      0.01      0.02        78
 Children's literature       0.14      0.07      0.10       666
         Crime Fiction       0.12      0.05      0.07       236
     Detective fiction       0.07      0.03      0.04       103
               Fantasy       0.17      0.15      0.16       690
               Fiction       0.14      0.32      0.19      1403
    Historical fiction       0.00      0.00      0.00       113
      Historical novel       0.18      0.08      0.11       204
                Horror       0.02      0.01      0.01       142
               Mystery       0.09      0.06      0.07       436
           Non-fiction       0.22      0.03      0.05        73
                 Novel       0.15      0.07      0.09       751
         Romance novel       0.04      0.01      0.01       135
       Science Fiction       0.24      

In [10]:
print(metrics.confusion_matrix(y_test, preds))

[[  2   1  11   0   0   3  34   0   4   0   1   0   5   0   8  23   0   0
    0   2]
 [  0   1   0   0   0   3  16   0   3   0   1   0   1   0  22  31   0   0
    0   0]
 [  3   0  49   1   4  80 328   0   6   6  13   1   9   1  27 121   2   0
    1  14]
 [  0   0   0  12   1   0 172   0   2   0  30   0   5   1   1   5   0   3
    4   0]
 [  0   0   4   3   3   0  41   0   0   0  46   0   2   0   1   1   0   1
    0   1]
 [  1   1  49   0   0 102 132   0   0   3   6   0   2   1  43 337   0   1
    4   8]
 [  6   2 113  23   2  86 453   9  23   8  69   2 153  10 113 292   6  18
   10   5]
 [  0   0   3   1   0   1  82   0  11   0   0   0   7   1   0   5   0   0
    0   2]
 [  6   0   9   3   0   4 121   5  16   1   6   2  10   1   0  15   1   0
    0   4]
 [  0   0   1   0   0  11  40   1   0   1   2   0   3   0   5  74   0   1
    0   3]
 [  0   0  12  37  25   6 256   1   2   2  24   0   9   4   7  33   0  16
    1   1]
 [  0   0   1   0   0   0  58   0   0   0   0   2   4   0   7   1

Well, those numbers look terrible. My first thought on the poor scores is that the mixture of genres for a text might be throwing off the results. For example, Animal Farm has the genres Children's literature, Speculative fiction, and Fiction. Since all three of these are options for the text, the predictor might have picked one that didn't match at the expected point. Let's dig into the numbers and see if that interpretation holds up.

In [11]:
all_preds = clf.predict(X)
df['preds'] = le.inverse_transform(all_preds)
df.head()

Unnamed: 0,summary,genre,preds
0,old major old boar manor farm call animal farm...,Children's literature,Fiction
1,old major old boar manor farm call animal farm...,Speculative fiction,Fiction
2,old major old boar manor farm call animal farm...,Fiction,Fiction
3,alex teenager live nearfuture england lead gan...,Science Fiction,Fiction
4,alex teenager live nearfuture england lead gan...,Speculative fiction,Fiction


In [12]:
df.genre.value_counts()

Fiction                   4747
Speculative fiction       4314
Science Fiction           2870
Novel                     2463
Fantasy                   2413
Children's literature     2122
Mystery                   1396
Young adult literature     825
Suspense                   765
Crime Fiction              753
Historical novel           654
Thriller                   568
Horror                     511
Romance novel              435
Historical fiction         388
Detective fiction          341
Adventure novel            330
Non-fiction                230
Alternate history          226
Spy fiction                190
Name: genre, dtype: int64

In [13]:
df.preds.value_counts()

Fiction                   9969
Speculative fiction       5541
Science Fiction           3003
Fantasy                   2422
Novel                     1375
Children's literature     1215
Mystery                    896
Crime Fiction              395
Historical novel           381
Young adult literature     312
Thriller                   164
Horror                     139
Detective fiction          122
Spy fiction                108
Non-fiction                107
Adventure novel             91
Romance novel               85
Suspense                    82
Historical fiction          74
Alternate history           60
Name: preds, dtype: int64

It looks like the AI is guessing Fiction way too often (9969 times in the full set as compared to the 4747 actual Fiction tags).

In [14]:
df[df['preds'] != df['genre']]

Unnamed: 0,summary,genre,preds
0,old major old boar manor farm call animal farm...,Children's literature,Fiction
1,old major old boar manor farm call animal farm...,Speculative fiction,Fiction
3,alex teenager live nearfuture england lead gan...,Science Fiction,Fiction
4,alex teenager live nearfuture england lead gan...,Speculative fiction,Fiction
6,text plague divide five part town oran thousan...,Fiction,Novel
...,...,...,...
26530,heaven leigh casteel fourteen year old girl li...,Young adult literature,Fiction
26532,event heaven first book casteel series heaven ...,Young adult literature,Fiction
26535,series follow character nick stone exmilitary ...,Thriller,Fiction
26537,series follow character nick stone exmilitary ...,Suspense,Fiction


In [15]:
df[df['preds'] == df['genre']]

Unnamed: 0,summary,genre,preds
2,old major old boar manor farm call animal farm...,Fiction,Fiction
5,alex teenager live nearfuture england lead gan...,Fiction,Fiction
7,text plague divide five part town oran thousan...,Novel,Novel
8,novel posit space around milky way divide conc...,Science Fiction,Science Fiction
14,ged young boy gont one large islands north arc...,Speculative fiction,Speculative fiction
...,...,...,...
26533,novel annie stonewall daughter heaven casteel ...,Young adult literature,Young adult literature
26534,story start former government agent frank comp...,Science Fiction,Science Fiction
26536,series follow character nick stone exmilitary ...,Fiction,Fiction
26539,reader first meet rapp covert operation iran d...,Fiction,Fiction


While there could be issues with the underlying data or with our hyperparameters, let's see if another classifier might perform better than the Complement Naive Bayes. The Multinomial Naive Bayes is another which is regularly used in NLP, though it is usually outperformed by the CNB.

In [16]:
from sklearn.naive_bayes import MultinomialNB

clf2 = MultinomialNB(alpha=1.0)

clf2.fit(X_train, y_train)
mnb_train = clf2.predict(X_train)
mnb_test = clf2.predict(X_test)
print("Training score: {}".format(clf2.score(X_train, y_train)))
print("Testing score:  {}".format(clf2.score(X_test, y_test)))

Training score: 0.2718268920228227
Testing score:  0.18598518146427226


In [17]:
print(metrics.classification_report(y_train, mnb_train, target_names=le.classes_))

                        precision    recall  f1-score   support

       Adventure novel       0.00      0.00      0.00       236
     Alternate history       0.00      0.00      0.00       148
 Children's literature       0.75      0.00      0.00      1456
         Crime Fiction       0.00      0.00      0.00       517
     Detective fiction       0.00      0.00      0.00       238
               Fantasy       0.68      0.02      0.03      1723
               Fiction       0.23      0.86      0.36      3344
    Historical fiction       0.00      0.00      0.00       275
      Historical novel       0.00      0.00      0.00       450
                Horror       0.00      0.00      0.00       369
               Mystery       0.00      0.00      0.00       960
           Non-fiction       0.00      0.00      0.00       157
                 Novel       1.00      0.00      0.00      1712
         Romance novel       0.00      0.00      0.00       300
       Science Fiction       0.85      

  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
print(metrics.classification_report(y_test, mnb_test, target_names=le.classes_))

                        precision    recall  f1-score   support

       Adventure novel       0.00      0.00      0.00        94
     Alternate history       0.00      0.00      0.00        78
 Children's literature       0.00      0.00      0.00       666
         Crime Fiction       0.00      0.00      0.00       236
     Detective fiction       0.00      0.00      0.00       103
               Fantasy       0.20      0.00      0.01       690
               Fiction       0.18      0.73      0.29      1403
    Historical fiction       0.00      0.00      0.00       113
      Historical novel       0.00      0.00      0.00       204
                Horror       0.00      0.00      0.00       142
               Mystery       0.00      0.00      0.00       436
           Non-fiction       0.00      0.00      0.00        73
                 Novel       0.00      0.00      0.00       751
         Romance novel       0.00      0.00      0.00       135
       Science Fiction       0.10      

In [19]:
path_to_save = "../Data/pickles/trained_cnb.dat"
with open (path_to_save, "w+b") as f:
    pickle.dump(clf, f)