In [1]:
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics
from sklearn import naive_bayes
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
from labfuncs import read_books, train_model, classify
import pandas

ModuleNotFoundError: No module named 'sklearn'

In [2]:
! pip3 install sklearn

Collecting sklearn
  Downloading https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Collecting scikit-learn (from sklearn)
  Downloading https://files.pythonhosted.org/packages/d1/48/e9fa9e252abcd1447eff6f9257636af31758a6e46fd5ce5d3c879f6907cb/scikit_learn-0.22.1-cp36-cp36m-manylinux1_x86_64.whl (7.0MB)
[K    100% |████████████████████████████████| 7.1MB 91kB/s  eta 0:00:01    62% |████████████████████▏           | 4.4MB 11.3MB/s eta 0:00:01
[?25hCollecting joblib>=0.11 (from scikit-learn->sklearn)
  Downloading https://files.pythonhosted.org/packages/28/5c/cf6a2b65a321c4a209efcdf64c2689efae2cb62661f8f6f4bb28547cf1bf/joblib-0.14.1-py2.py3-none-any.whl (294kB)
[K    100% |████████████████████████████████| 296kB 1.6MB/s eta 0:00:01
[?25hCollecting numpy>=1.11.0 (from scikit-learn->sklearn)
  Downloading https://files.pythonhosted.org/packages/62/20/4d43e141b5bc426ba38274933ef8e76e85c7adea2c321ecf9ebf7421cedf/n

In [2]:
# Read books for training models.

In [3]:
# Load the training and test corpora, framing each listing with a separator line.
# read_books comes from labfuncs; judging by the recorded output it prints the
# authors/books it finds while loading — TODO confirm against labfuncs.
SEPARATOR = '----------------------------------------------------------------'

print(SEPARATOR)
print('\nTrain corpa\n')
trainbooks = read_books('books')
print(f'\n{SEPARATOR}\n')
print('Test corpa\n')
testbooks = read_books('testbooks')
print(f'\n{SEPARATOR}')

----------------------------------------------------------------

Train corpa

Author: Arthur Conan Doyle
        Book: The Lost World
        Book: Tales of Terror and Mystery
        Book: The Valley of Fear
        Book: The Return of Sherlock Holmes
        Book: The Adventures of Sherlock Holmes
        Book: The Memoirs of Sherlock Holmes
Author: Jane Austen
        Book: Sense and Sensibility
        Book: Northanger Abbey
        Book: Emma
        Book: Persuasion
        Book: Lady Susan
        Book: Mansfield Park
Author: Mark Twain
        Book: Eve's Diary, Complete
        Book: Adventures of Huckleberry Finn
        Book: Roughing It
        Book: The Mysterious Stranger, and Other Stories
        Book: The Prince and the Pauper
Author: Lewis Carroll
        Book: Through the Looking-Glass
        Book: Symbolic Logic
        Book: Phantasmagoria and Other Poems
        Book: The Hunting of the Snark - An Agony in Eight Fits
        Book: Alice's Adventures in Wonderlan

In [4]:
# Make (sentence, author) labels.

In [5]:
# Flatten the nested {author: {title: [sentence, ...]}} mapping into
# (sentence, author) pairs, one per sentence of every training book.
labelled_sentences = [
    (sentence, author)
    for author, works in trainbooks.items()
    for book_sentences in works.values()
    for sentence in book_sentences
]
sentences = [sentence for sentence, _ in labelled_sentences]
labels = [author for _, author in labelled_sentences]

# One row per sentence: the raw text and the author who wrote it.
train = pandas.DataFrame({'text': sentences, 'label': labels})

In [6]:
# Partition data into train/validate subsets.

In [7]:
# Split the labelled sentences into a training part and a validation part.
# A fixed random_state makes the split (and every score below) reproducible
# across "Restart & Run All".
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    train['text'], train['label'], random_state=42)

# Fit the author -> integer mapping exactly once, on the full label column, and
# reuse it for both subsets. Re-fitting on the validation labels could assign
# different integers to the same author if a class were missing from that subset.
encoder = preprocessing.LabelEncoder()
encoder.fit(train['label'])
train_y = encoder.transform(train_y)
valid_y = encoder.transform(valid_y)


In [8]:
# Extract features from texts: word counts.

In [9]:
# Bag-of-words features. The vocabulary is learned from every sentence in
# `train` (i.e. both the training and the validation subset); each subset is
# then turned into its own count matrix.
count_vect = CountVectorizer(token_pattern=r'\w{1,}', analyzer='word')
count_vect.fit(train['text'])
xtrain_count, xvalid_count = (
    count_vect.transform(subset) for subset in (train_x, valid_x)
)

In [10]:
# Extract features from texts: word-level TF/IDF (Term Frequency and Inverse Document Frequency).

In [11]:
# Word-level TF/IDF features. The vectorizer is fitted on every sentence in
# `train` (both subsets), then applied to each subset separately.
tfidf_vect = TfidfVectorizer(token_pattern=r'\w{1,}', analyzer='word')
tfidf_vect.fit(train['text'])
xtrain_tfidf, xvalid_tfidf = (
    tfidf_vect.transform(subset) for subset in (train_x, valid_x)
)


In [12]:
# Extract features from texts: character-level n-gram TF/IDF.

In [13]:
# Character-level TF/IDF features over 2- and 3-character n-grams.
# No token_pattern here: scikit-learn only uses that parameter when
# analyzer='word', so passing it with analyzer='char' had no effect.
tfidf_vect_char = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))
tfidf_vect_char.fit(train['text'])
xtrain_tfidf_char = tfidf_vect_char.transform(train_x)
xvalid_tfidf_char = tfidf_vect_char.transform(valid_x)

In [14]:
# Naive Bayes classification based on word counts.

In [15]:
# Multinomial Naive Bayes on the raw word-count features.
# train_model is a labfuncs helper; its return value is printed as the score
# (presumably accuracy on the validation subset — TODO confirm in labfuncs).
count_word_classifier = naive_bayes.MultinomialNB()
count_word_score = train_model(
    count_word_classifier,
    xtrain_count, train_y,
    xvalid_count, valid_y,
)
print("Naive Bayes, Count Vectors: ", count_word_score)

Naive Bayes, Count Vectors:  0.76732249786142


In [16]:
# Naive Bayes classification based on word-level TF/IDF.

In [17]:
# Multinomial Naive Bayes on the word-level TF/IDF features.
# train_model is a labfuncs helper; its return value is printed as the score
# (presumably accuracy on the validation subset — TODO confirm in labfuncs).
tfidf_word_classifier = naive_bayes.MultinomialNB()
tfidf_word_score = train_model(
    tfidf_word_classifier,
    xtrain_tfidf, train_y,
    xvalid_tfidf, valid_y,
)
print("Naive Bayes, Word-level TF/IDF: ", tfidf_word_score)

Naive Bayes, Word-level TF/IDF:  0.7359318629821103


In [18]:
# Linear classifier based on word-level TF/IDFs.

In [19]:
# Logistic regression on the word-level TF/IDF features.
# train_model is a labfuncs helper; its return value is printed as the score
# (presumably accuracy on the validation subset — TODO confirm in labfuncs).
tfidf_linear_classifier = linear_model.LogisticRegression()
tfidf_linear_score = train_model(
    tfidf_linear_classifier,
    xtrain_tfidf, train_y,
    xvalid_tfidf, valid_y,
)
print("Linear, word-level TF/IDF: ", tfidf_linear_score)



Linear, word-level TF/IDF:  0.7674712686428385


In [20]:
# Linear classifier based on char-level ngrams TF/IDFs.

In [21]:
# Logistic regression on the character n-gram TF/IDF features.
# The printed label now says "char-level": it previously repeated the
# "word-level" label of the preceding cell although the inputs are
# xtrain_tfidf_char / xvalid_tfidf_char.
tfidf_linear_char_classifier = linear_model.LogisticRegression()
print("Linear, char-level TF/IDF: ", train_model(tfidf_linear_char_classifier, xtrain_tfidf_char, train_y, xvalid_tfidf_char, valid_y))

Linear, word-level TF/IDF:  0.7181909472979506


In [22]:
# Classifiers and vectorizers are listed in matching order: each classifier sits
# at the same position as the vectorizer that produced its training features.
# Both lists are loop-invariant, so they are built once up front.
test_classifiers = [count_word_classifier, tfidf_word_classifier,
                    tfidf_linear_classifier, tfidf_linear_char_classifier]
test_vectorizers = [count_vect, tfidf_vect, tfidf_vect, tfidf_vect_char]

# Classify every held-out book and print the per-author result.
# The loop variable is named book_sentences so that it does not overwrite the
# global `sentences` list built for the training DataFrame.
for title, book_sentences in testbooks.items():
    print()
    print('----------------------------------------------------------------')
    print(f'\nTest title:  "{title}"')
    # classify is a labfuncs helper; it yields (probability, encoded label)
    # pairs — how method='intersect' combines the models is defined there.
    probs = classify(book_sentences,
                     test_classifiers,
                     test_vectorizers,
                     method='intersect')
    for prob, label in probs:
        # Map the integer label back to the author's name for display.
        print(f'{prob * 100: 8.2f}%  -  {encoder.inverse_transform([label])[0]}')


----------------------------------------------------------------

Test title:  "Alice's Adventures Under Ground"
   57.65%  -  Lewis Carroll
   22.89%  -  Jane Austen
   12.45%  -  Arthur Conan Doyle
    7.01%  -  Mark Twain

----------------------------------------------------------------

Test title:  "Pride and Prejudice"
   96.83%  -  Jane Austen
    3.03%  -  Arthur Conan Doyle
    0.10%  -  Mark Twain
    0.04%  -  Lewis Carroll

----------------------------------------------------------------

Test title:  "The Hound of the Baskervilles "
   90.96%  -  Arthur Conan Doyle
    8.57%  -  Jane Austen
    0.46%  -  Mark Twain
    0.01%  -  Lewis Carroll

----------------------------------------------------------------

Test title:  "The Adventures of Tom Sawyer"
   77.12%  -  Mark Twain
   14.43%  -  Arthur Conan Doyle
    7.92%  -  Jane Austen
    0.54%  -  Lewis Carroll
