In [5]:
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics
from sklearn import naive_bayes
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
from labfuncs import read_books, train_model, classify
import pandas

In [6]:
# Read books for training models.

In [7]:
print('----------------------------------------------------------------')
print()
print('Train corpa')
print()
trainbooks = read_books('books')
print()
print('----------------------------------------------------------------')
print()
print('Test corpa')
print()
testbooks = read_books('testbooks')
print()
print('----------------------------------------------------------------')

----------------------------------------------------------------

Train corpa

Author: Lewis Carroll
        Book: The Hunting of the Snark - An Agony in Eight Fits
        Book: Sylvie and Bruno
        Book: The Game of Logic
        Book: Alice_s Adventures in Wonderland
        Book: Phantasmagoria and Other Poems
        Book: Through the Looking-Glass
        Book: Symbolic Logic
Author: Mark Twain
        Book: The Mysterious Stranger, and Other Stories
        Book: The Prince and the Pauper
        Book: Eve_s Diary, Complete
        Book: Adventures of Huckleberry Finn
        Book: Roughing It
Author: Jane Austen
        Book: Emma
        Book: Sense and Sensibility
        Book: Mansfield Park
        Book: Persuasion
        Book: Northanger Abbey
        Book: Lady Susan
Author: Arthur Conan Doyle
        Book: The Valley of Fear
        Book: The Return of Sherlock Holmes
        Book: The Lost World
        Book: The Memoirs of Sherlock Holmes
        Book: Tales of Te

In [8]:
# Make (sentence, author) labels.

In [9]:
labels, sentences = [], []
for author, works in trainbooks.items():
    for title in works.keys():
        for s in works[title]:
            labels.append(author)
            sentences.append(s)
train = pandas.DataFrame()
train['text'] = sentences
train['label'] = labels

In [10]:
# Partition data into train/validate subsets.

In [11]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['text'], train['label'])
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)


In [12]:
# Extract features from texts: word counts.

In [13]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(train['text'])
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)

In [14]:
# Extract features from texts: word-level TF/IDF (Term Frequency and Inverse Document Frequency).

In [15]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
tfidf_vect.fit(train['text'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)


In [16]:
# Extract features form text: character-level ngrams TF/IDF.

In [17]:
tfidf_vect_char = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3))
tfidf_vect_char.fit(train['text'])
xtrain_tfidf_char =  tfidf_vect_char.transform(train_x)
xvalid_tfidf_char =  tfidf_vect_char.transform(valid_x)



In [18]:
# Naive Bayes classification based on word countings.

In [19]:
count_word_classifier = naive_bayes.MultinomialNB()
print("Naive Bayes, Count Vectors: ", train_model(count_word_classifier, xtrain_count, train_y, xvalid_count, valid_y))

Naive Bayes, Count Vectors:  0.7679919663778034


In [20]:
# Naive Bayes classification based on word-level TF/IDF.

In [21]:
tfidf_word_classifier = naive_bayes.MultinomialNB()
print("Naive Bayes, Word-level TF/IDF: ", train_model(tfidf_word_classifier, xtrain_tfidf, train_y, xvalid_tfidf, valid_y))

Naive Bayes, Word-level TF/IDF:  0.7387585078290624


In [22]:
# Linear classifier based on word-level TF/IDFs.

In [23]:
tfidf_linear_classifier = linear_model.LogisticRegression()
print("Linear, word-level TF/IDF: ", train_model(tfidf_linear_classifier, xtrain_tfidf, train_y, xvalid_tfidf, valid_y))

Linear, word-level TF/IDF:  0.7736824487670622


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [24]:
# Linear classifier based on char-level ngrams TF/IDFs.

In [25]:
tfidf_linear_char_classifier = linear_model.LogisticRegression()
print("Linear, word-level TF/IDF: ", train_model(tfidf_linear_char_classifier, xtrain_tfidf_char, train_y, xvalid_tfidf_char, valid_y))

Linear, word-level TF/IDF:  0.720682857886711


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [22]:
for title, sentences in testbooks.items():
    print()
    print('----------------------------------------------------------------')
    print(f'\nTest title:  "{title}"')
    probs = classify(sentences,
                     [count_word_classifier, tfidf_word_classifier,
                      tfidf_linear_classifier, tfidf_linear_char_classifier],
                     [count_vect, tfidf_vect, tfidf_vect, tfidf_vect_char],
                     method='intersect')
    for prob, label in probs:
        print(f'{prob * 100: 8.2f}%  -  {encoder.inverse_transform([label])[0]}')


----------------------------------------------------------------

Test title:  "Alice's Adventures Under Ground"
   57.65%  -  Lewis Carroll
   22.89%  -  Jane Austen
   12.45%  -  Arthur Conan Doyle
    7.01%  -  Mark Twain

----------------------------------------------------------------

Test title:  "Pride and Prejudice"
   96.83%  -  Jane Austen
    3.03%  -  Arthur Conan Doyle
    0.10%  -  Mark Twain
    0.04%  -  Lewis Carroll

----------------------------------------------------------------

Test title:  "The Hound of the Baskervilles "
   90.96%  -  Arthur Conan Doyle
    8.57%  -  Jane Austen
    0.46%  -  Mark Twain
    0.01%  -  Lewis Carroll

----------------------------------------------------------------

Test title:  "The Adventures of Tom Sawyer"
   77.12%  -  Mark Twain
   14.43%  -  Arthur Conan Doyle
    7.92%  -  Jane Austen
    0.54%  -  Lewis Carroll
