With our cleaned corpora ready, we can now begin building our model. First, we vectorize the words with the Bag of Words approach in preparation for Term Frequency–Inverse Document Frequency (TF-IDF) analysis.

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
from datetime import datetime

from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Load the bookshelf data from disk, using the first CSV column as the
# DataFrame index.
path_to_data = "Data/bookshelf_data.csv"
text_details = pd.read_csv(path_to_data, index_col=0)
# Preview the first rows to sanity-check the load.
text_details.head()

Unnamed: 0,Title,Author,Link,ID,Bookshelf,Text
0,The Extermination of the American Bison,William T. Hornaday,http://www.gutenberg.org/ebooks/17748,17748,Animal,[Illustration: (Inscription) Mr. Theodore Roos...
1,Deadfalls and Snares,A. R. Harding,http://www.gutenberg.org/ebooks/34110,34110,Animal,DEADFALLS AND SNARES [Frontispiece: A GOOD DEA...
2,Artistic Anatomy of Animals,Édouard Cuyer,http://www.gutenberg.org/ebooks/38315,38315,Animal,+---------------------------------------------...
3,"Birds, Illustrated","Color Photography, Vol. 1, No. 1 Various",http://www.gutenberg.org/ebooks/30221,30221,Animal,FROM: THE PRESIDENT OF THE NATIONAL TEACHERS' ...
4,On Snake-Poison: Its Action and Its Antidote,A. Mueller,http://www.gutenberg.org/ebooks/32947,32947,Animal,[Illustration] ON SNAKE-POISON. ITS ACTION AND...


In [30]:
# we only need the Bookshelf column to serve as the target for our Text series, so let's extract that as y
y = text_details.Bookshelf

# now we encode our labels into integers so that our model can work with them;
# the fitted encoder `le` is kept so the integer codes can be mapped back to label names
le = LabelEncoder()
y = le.fit_transform(y)  # y now holds one integer class code per row of text_details

In [32]:
# Show the distinct Bookshelf labels learned by the encoder; a label's
# position in this array is the integer code it was assigned in y.
le.classes_

array(['6', 'Adventure', 'Africa', 'American', 'Anarchism', 'Animal',
       'Animals-Domestic', 'Animals-Wild', 'Animals-Wild-Birds',
       'Animals-Wild-Insects', 'Animals-Wild-Mammals',
       'Animals-Wild-Reptiles', 'Animals-Wild-Trapping', 'Anthropology',
       'Archaeology', 'Architecture', 'Argentina', 'Art', 'Astronomy',
       'Atheism', 'Australia', "Bahá'í", 'Bibliomania', 'Biographies',
       'Biology', 'Boer', 'Botany', 'British', 'Buddhism', 'Bulgaria',
       'CIA', 'Camping', 'Canada', 'Canon', 'Chemistry', "Child's",
       "Children's", 'Christianity', 'Christmas', 'Classical',
       'Cookbooks', 'Crafts', 'Crime', 'Current', 'Czech', 'Detective',
       'Ecology', 'Education', 'Egypt', 'Engineering', 'English',
       'Erotic', 'FR', 'Fantasy', 'Folklore', 'France', 'General',
       'Geology', 'German', 'Germany', 'Gothic', 'Greece', 'Harvard',
       'Hinduism', 'Historical', 'Horror', 'Humor', 'IT', 'India',
       'Islam', 'Italy', 'Journal', 'Judaism', 'Lan

In [4]:
# Load the cleaned texts as a Series with one cleaned document per row.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
# pandas 2.0, so read the single-column frame and squeeze it explicitly.
path_to_data = "Data/cleaned_texts.csv"
texts = pd.read_csv(path_to_data, index_col=0).squeeze("columns")
# Preview the first rows to sanity-check the load.
texts.head()

0    illustration inscription mr theodore roosevelt...
1    deadfalls snare frontispiece good deadfall dea...
2    transcriber note transcription use etext texts...
3    president national teacher association state n...
4    illustration snakepoison action antidote muell...
Name: 0, dtype: object

In [5]:
# Number of cleaned documents that were loaded.
len(texts)

2355

In [34]:
# Split every cleaned document into a list of word tokens with NLTK.
tokenized_texts = texts.apply(word_tokenize)
print(len(tokenized_texts))
# Show the tokens of the first document as a sanity check.
print(tokenized_texts[0])

# tokenized_texts will also serve as our X, so let's assign that now
X = tokenized_texts  # same Series object under a second name, not a copy

2355
['illustration', 'inscription', 'mr', 'theodore', 'roosevelt', 'author', 'hunt', 'trip', 'ranchman', 'compliment', 'author', 'wt', 'hornaday', 'smithsonian', 'institution', 'united', 'state', 'national', 'museum', 'extermination', 'american', 'bison', 'william', 'hornaday', 'superintendent', 'national', 'zoological', 'park', 'report', 'national', 'museum', '188687', 'page', '369548', 'plate', 'ixxii', 'washington', 'government', 'printing', 'office', '1889', 'illustration', 'group', 'american', 'bison', 'national', 'museum', 'collect', 'mount', 'w', 'hornaday', 'content', 'prefatory', 'note', 'part', 'ithe', 'life', 'history', 'bison', 'discovery', 'specie', 'ii', 'geographical', 'distribution', 'iii', 'abundance', 'iv', 'character', 'specie', '1', 'buffalo', 'rank', 'amongst', 'ruminant', '2', 'change', 'form', 'captivity', '3', 'mount', 'specimen', 'museum', '4', 'calf', '5', 'yearling', '6', 'spike', 'bull', '7', 'adult', 'bull', '8', 'cow', 'third', 'year', '9', 'adult', 'cow'

In [9]:
# count vectorization function
def count_vectorize(tokenized_text):
    """Count how often each token occurs in one tokenized document.

    Parameters
    ----------
    tokenized_text : iterable of hashable tokens (typically a list of str).

    Returns
    -------
    dict
        Maps each distinct token to its number of occurrences; keys are
        inserted in sorted order.
    """
    vocabulary = sorted(set(tokenized_text))
    tallies = dict.fromkeys(vocabulary, 0)

    for token in tokenized_text:
        tallies[token] = tallies[token] + 1

    return tallies

In [11]:
# %%timeit
# testing our count function
# Spot-check on the first document; the notebook displays the resulting
# {token: count} dict.
count_vectorize(tokenized_texts[0])

{'0': 2,
 '1': 57,
 '10': 39,
 '100': 16,
 '1000': 6,
 '10000': 3,
 '100000': 3,
 '101105': 1,
 '101110': 2,
 '1012': 2,
 '1017600': 1,
 '104': 1,
 '104105': 1,
 '105': 1,
 '1062750': 1,
 '107': 4,
 '10793350': 1,
 '108': 2,
 '1089000': 1,
 '1090': 1,
 '1091': 1,
 '10d': 1,
 '10gallon': 1,
 '10mile': 1,
 '10th': 1,
 '11': 12,
 '110': 3,
 '110420': 1,
 '111': 1,
 '1112': 2,
 '112': 8,
 '1135300': 2,
 '114': 2,
 '114000': 1,
 '115': 3,
 '116': 1,
 '1160': 1,
 '117': 1,
 '118950': 1,
 '119': 1,
 '12': 22,
 '120': 6,
 '1200': 4,
 '12000': 1,
 '12000000': 1,
 '121': 2,
 '1210': 4,
 '1213': 2,
 '1217': 1,
 '122': 1,
 '1222': 1,
 '1233070': 1,
 '12371241': 1,
 '124': 1,
 '1240': 1,
 '1246': 1,
 '125': 3,
 '126867': 1,
 '127': 1,
 '1280': 1,
 '129': 1,
 '129130': 1,
 '12inch': 1,
 '12mile': 1,
 '12th': 1,
 '13': 11,
 '130000': 1,
 '132': 2,
 '132057': 1,
 '134': 2,
 '135': 1,
 '1378359': 1,
 '139144': 1,
 '13th': 1,
 '14': 9,
 '1400': 2,
 '14000': 1,
 '144': 1,
 '144147': 1,
 '14520': 1,
 '146

The first iteration revealed that we left numbers in the text corpora. These numeric tokens can safely be dropped, but that will have to wait for a later version due to the current time constraints.

It otherwise appears to be working as expected, so let's bag some words!

In [12]:
# Bag every document in fixed-size chunks so progress can be reported as we go.
# The first document is bagged on its own; the loop covers positions 1..end.
first_bag = count_vectorize(tokenized_texts[0])
increment = 10
total_texts = len(tokenized_texts)
start = datetime.now()

# Collect the per-chunk results in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies the whole accumulated Series on
# every iteration, which makes the loop quadratic in the number of documents.
bag_chunks = [pd.Series(data=[first_bag])]
for i in range(1, total_texts, increment):
    # Clamp the upper bound so the final (possibly short) chunk is reported correctly.
    stop = min(i + increment, total_texts)
    print("Processing {}-{} out of {}...".format(i, stop, total_texts))
    bag_set = tokenized_texts[i:stop].apply(count_vectorize)
    bag_chunks.append(bag_set)
    print("Elapsed time: {} seconds.".format((datetime.now() - start).total_seconds()))
    # Show the chunk that was just processed.
    print("Last {}:\n{}".format(len(bag_set), bag_set))

bagged_text = pd.concat(bag_chunks)
print("Complete.")

Processing 1-11 out of 2355...
Elapsed time: 0.149318 seconds.
Last 10:
1     {'0': 3, '020': 1, '075': 1, '1': 68, '10': 13...
2     {'0': 5, '0c': 8, '0i': 2, '0·22': 1, '0·30': ...
3     {'1': 2, '10th': 1, '12×18': 1, '15': 2, '150'...
4     {'0': 1, '002': 1, '004': 1, '007013': 1, '1':...
5     {'0': 1, '1': 24, '10': 14, '100': 5, '11': 5,...
6     {'1': 104, '10': 64, '100': 4, '101': 2, '1012...
7     {'0': 2, '1': 30, '10': 8, '100': 2, '1000': 1...
8     {'1': 191, '10': 48, '1000': 1, '101': 2, '103...
9     {'0': 1, '01': 3, '0112': 1, '02': 1, '03': 1,...
10    {'00': 27, '07': 2, '074': 1, '08': 1, '1': 18...
dtype: object
Processing 11-21 out of 2355...
Elapsed time: 7.004172 seconds.
Last 10:
11    {'1': 1, '10': 2, '100': 3, '101': 3, '102': 2...
12    {'1': 35, '10': 21, '100': 2, '101': 3, '102':...
13    {'0': 2, '1': 15, '10': 22, '100': 1, '1000': ...
14    {'1': 32, '10': 17, '100': 1, '100the': 1, '10...
15    {'003': 1, '004': 1, '005': 1, '007': 1, '008'...
1

Last 10:
171    {'0': 5, '07': 1, '1': 54, '10': 29, '100': 4,...
172    {'1': 138, '10': 21, '100': 9, '1000': 3, '100...
173    {'1': 11, '10': 6, '100': 5, '101': 8, '102': ...
174    {'127': 1, '1899': 1, '1901': 1, '21': 1, '303...
176    {'0': 1, '006': 1, '006028': 1, '006035': 1, '...
177    {'0': 1, '02´': 2, '0608': 1, '07´': 1, '087':...
178    {'00': 1, '01': 2, '02': 7, '03': 2, '04': 1, ...
179    {'0': 3, '005sigmam': 2, '0155': 1, '076': 1, ...
180    {'0': 3, '005': 1, '0058': 1, '0065': 1, '009'...
181    {'015': 1, '016': 1, '023': 1, '034': 1, '036'...
dtype: object
Processing 151-161 out of 2355...
Elapsed time: 26.458667 seconds.
Last 10:
182    {'0': 9, '00': 15, '01': 2, '01000': 2, '02': ...
183    {'03': 1, '04': 1, '05': 1, '054': 1, '054070'...
184    {'1': 13, '10': 5, '104215233': 1, '112': 1, '...
185    {'1': 106, '10': 13, '100': 2, '1013': 1, '103...
186    {'1': 4, '102594': 2, '11': 4, '110': 1, '1143...
187    {'1': 14, '10': 3, '105': 1, '11': 4, '

Elapsed time: 27.984347 seconds.
Last 10:
323    {'12mo': 2, '12th': 3, '15': 1, '19': 1, '1916...
324    {'1817': 1, '186000000': 1, 'ability': 2, 'abl...
325    {'14821513': 1, '1484': 1, '1490': 1, '1801': ...
327    {'1': 1, '1908': 2, '2': 2, 'aback': 1, 'aband...
328    {'538': 1, '624': 1, 'abandon': 1, 'abbot': 12...
329    {'1': 5, '100': 1, '1000': 1, '101': 1, '103':...
330    {'0020421001': 1, '1': 6, '10': 2, '10022': 1,...
331    {'1': 4, '10': 5, '100': 9, '101': 9, '102': 3...
332    {'1': 16, '10': 7, '100': 6, '101': 6, '101afa...
333    {'1': 2, '10': 2, '103': 1, '11': 2, '12': 1, ...
dtype: object
Processing 301-311 out of 2355...
Elapsed time: 28.058654 seconds.
Last 10:
334    {'12': 1, '16': 2, '1998': 5, '20': 5, '219': ...
335    {'1': 6, '10': 4, '100': 1, '1000': 2, '103': ...
336    {'accept': 1, 'acquaint': 1, 'across': 4, 'ada...
337    {'1621': 1, '1905': 2, 'abandon': 3, 'abash': ...
338    {'1': 1, '127': 1, '130a': 2, '130b': 2, '130c...
339    {'1': 

Elapsed time: 29.212727 seconds.
Last 10:
463    {'1': 14, '10': 6, '1000': 1, '11': 5, '11th':...
464    {'0': 430, '0d': 1, '0¹³': 1, '0¼': 4, '0½': 1...
465    {'014': 1, '1': 2, '10': 3, '100': 3, '1000': ...
466    {'1': 12, '10': 7, '100': 1, '10044': 1, '101'...
467    {'10': 1, '16': 1, '1604': 1, '1614': 1, '1615...
468    {'1478': 1, '1486': 1, '1499': 1, '1503': 1, '...
469    {'1': 2, '10': 5, '11': 2, '12': 5, '13': 2, '...
470    {'1': 2, '10': 2, '10th': 1, '11': 2, '12': 2,...
471    {'0525470182': 1, '1': 53, '10': 33, '100': 14...
472    {'1': 1, '10': 2, '100': 1, '101': 2, '102': 1...
dtype: object
Processing 431-441 out of 2355...
Elapsed time: 29.359252 seconds.
Last 10:
473    {'1': 59, '10': 2, '100': 1, '101': 2, '1010':...
474    {'1797': 1, '1828': 1, '1831': 1, '1856': 1, '...
475    {'1': 8, '10': 3, '100': 4, '100000': 1, '1004...
478    {'aba': 1, 'abantes': 3, 'abarbarea': 1, 'abat...
479    {'abandon': 1, 'able': 1, 'absence': 1, 'absen...
480    {'191'

Elapsed time: 31.234191 seconds.
Last 10:
612    {'1': 1, '10': 1, '103': 2, '104': 1, '11': 2,...
613    {'19': 1, '1912': 1, 'ab': 14, 'abals': 1, 'ab...
614    {'1876': 2, '4th': 2, '640': 2, 'aa': 3, 'aban...
615    {'0': 3, '012': 1, '012103': 1, '014': 1, '1':...
616    {'0': 1, '0f': 1, '0rso': 1, '1': 188, '10': 1...
617    {'1': 56, '10': 31, '100': 4, '1000': 1, '102'...
618    {'1': 181, '10': 1, '1085': 1, '1116': 1, '112...
619    {'abandon': 2, 'abandonest': 1, 'abasement': 1...
620    {'1': 5, '10': 7, '100': 4, '1000': 2, '10000'...
621    {'1300': 1, '18071882': 1, 'abandon': 4, 'abat...
dtype: object
Processing 561-571 out of 2355...
Elapsed time: 31.403117 seconds.
Last 10:
622    {'1': 5, '10': 7, '100': 3, '101': 1, '102': 3...
623    {'1': 9, '10': 6, '100': 7, '1000': 2, '101': ...
624    {'1': 4, '10': 1, '105': 1, '108': 1, '109': 1...
625    {'022': 1, '0333s': 1, '0375d': 1, '05s': 1, '...
626    {'1': 30, '10': 22, '100': 6, '10000': 3, '101...
628    {'1': 

Elapsed time: 33.285645 seconds.
Last 10:
749    {'1': 3, '10': 2, '100': 1, '101': 1, '102': 1...
750    {'02': 1, '1': 8, '10': 3, '100': 2, '1000': 2...
751    {'0': 1, '00': 11, '1': 137, '10': 26, '100': ...
752    {'1': 5, '10': 5, '100': 14, '1000': 4, '10000...
753    {'10': 2, '104': 1, '105': 1, '1051': 2, '1098...
754    {'0': 3, '1': 3, '10': 1, '103': 1, '104': 1, ...
755    {'1': 1, '10': 3, '103': 1, '10th': 2, '11': 1...
756    {'008large': 1, '008small': 1, '010png': 1, '0...
757    {'1': 30, '10': 1, '100': 3, '101': 2, '103': ...
758    {'08': 1, '1': 9, '10': 3, '100': 4, '1000': 1...
dtype: object
Processing 691-701 out of 2355...
Elapsed time: 33.430243 seconds.
Last 10:
759    {'1': 4, '10': 4, '100': 1, '1000': 2, '10000'...
760    {'1': 2, '10': 5, '100': 4, '101': 4, '102': 2...
762    {'1': 15, '10': 4, '100': 1, '103': 1, '108': ...
763    {'1': 71, '10': 11, '100': 6, '1000': 2, '1000...
764    {'1': 3, '10': 1, '100': 1, '10000': 2, '101':...
765    {'1': 

Elapsed time: 35.445357 seconds.
Last 10:
895    {'0': 1, '026': 1, '048': 1, '052': 1, '055': ...
896    {'104': 2, '109': 2, '10th': 2, '10—it': 1, '1...
898    {'1': 13, '10': 9, '100': 1, '105': 1, '11': 1...
899    {'10': 1, '100': 1, '1000': 2, '10000000': 2, ...
900    {'0': 4, '1': 26, '10': 17, '100': 22, '1000':...
901    {'1': 3, '10': 1, '1700': 1, '2': 3, '2403': 1...
902    {'1': 12, '10': 11, '100': 6, '1000': 3, '1000...
903    {'1': 68, '10': 3, '11': 6, '12': 5, '128': 1,...
904    {'1': 13, '10': 10, '100': 10, '10000': 1, '10...
905    {'1': 5, '10': 1, '1787': 1, '1970s': 1, '2': ...
dtype: object
Processing 821-831 out of 2355...
Elapsed time: 35.674771 seconds.
Last 10:
908    {'1': 1, '12': 1, '1776': 1, '2005': 1, '4': 1...
909    {'1': 10, '10': 5, '102': 1, '10th': 1, '11': ...
911    {'1': 28, '10': 3, '100': 5, '100101': 1, '100...
912    {'003': 1, '010': 1, '014': 1, '025': 2, '029'...
915    {'1': 871, '10': 196, '100': 117, '1000': 14, ...
916    {'0': 

Elapsed time: 37.218758 seconds.
Last 10:
1041    {'0': 1, '1': 45, '10': 19, '100': 4, '1000': ...
1042    {'1': 89, '10': 2, '100': 1, '1000': 2, '10000...
1043    {'0': 4, '00': 4, '08': 1, '1': 17, '10': 17, ...
1044    {'0': 6, '005': 1, '1': 39, '10': 26, '100': 1...
1045    {'0': 1, '1': 2, '10': 1, '100': 7, '100°': 1,...
1046    {'1': 26, '10': 5, '11': 3, '12': 3, '13': 3, ...
1047    {'10': 2, '1014': 1, '102': 2, '1112': 1, '112...
1048    {'0': 15, '1': 140, '10': 56, '100': 5, '108':...
1049    {'1915': 2, '2': 2, '450': 2, 'ab': 2, 'abraha...
1050    {'1': 2, '10': 2, '13vuotiaana': 1, '16': 1, '...
dtype: object
Processing 951-961 out of 2355...
Elapsed time: 37.498554 seconds.
Last 10:
1051    {'100alice': 1, '100in': 1, '101a': 2, '102dre...
1052    {'1919': 1, 'aback': 2, 'abandon': 3, 'abandon...
1053    {'1': 5, '10': 1, '12': 1, '14': 2, '18': 1, '...
1054    {'130': 1, '1918': 1, '21': 1, '6': 1, 'aback'...
1055    {'1': 1, '10': 2, '11': 2, '12': 2, '13': 2, '..

Elapsed time: 39.213882 seconds.
Last 10:
1180    {'1': 4, '10': 2, '11': 4, '12': 3, '13': 2, '...
1182    {'10°': 1, '1550': 1, '16°': 1, '17°': 1, '188...
1183    {'1': 2, '10': 2, '11': 2, '12': 2, '13': 2, '...
1184    {'15': 1, '15th': 1, '325': 1, '425': 1, '450'...
1185    {'10': 1, '100000': 1, '10440': 1, '10800': 2,...
1186    {'1': 5, '10': 2, '104': 2, '11': 2, '12': 2, ...
1187    {'1': 28, '10½': 1, '12': 2, '1445': 1, '1491—...
1188    {'11': 2, '15th': 1, '1685': 1, '1687that': 1,...
1189    {'1': 2, '11': 1, '12': 1, '1257': 1, '1258': ...
1190    {'1': 1, '104': 2, '1257—first': 1, '1258—firs...
dtype: object
Processing 1081-1091 out of 2355...
Elapsed time: 39.325516 seconds.
Last 10:
1191    {'1': 2, '10': 1, '100': 1, '11': 1, '11450': ...
1192    {'1': 2, '10': 2, '100': 1, '11': 3, '12': 4, ...
1194    {'10th': 2, '13': 2, '1789': 2, '1790': 1, '17...
1195    {'1': 1, '10thschooner': 1, '14thschooner': 1,...
1196    {'1': 2, '10': 1, '11': 1, '12': 1, '13': 1, '

Elapsed time: 40.779699 seconds.
Last 10:
1347    {'1': 2, '150': 1, '1609': 1, '1646': 3, '1676...
1348    {'10': 1, '11': 1, '14': 1, '16c': 1, '1895': ...
1349    {'1': 3, '10': 1, '123': 1, '14': 1, '1530': 1...
1350    {'1': 1, '10': 2, '100000': 1, '1084': 1, '109...
1351    {'1': 1, '1423': 1, '1424': 1, '1442': 1, '15'...
1352    {'1': 1, '10': 1, '1002': 1, '1013': 1, '106':...
1353    {'1': 6, '10': 1, '108': 1, '1107': 1, '114': ...
1354    {'10': 1, '107': 1, '13': 1, '1687': 1, '1895'...
1355    {'1': 1, '1000': 1, '132': 1, '15': 1, '1500':...
1356    {'1174': 1, '1182': 1, '140': 1, '158': 1, '18...
dtype: object
Processing 1211-1221 out of 2355...
Elapsed time: 40.845722 seconds.
Last 10:
1357    {'1': 3, '100': 1, '11': 1, '118': 1, '12': 2,...
1358    {'11': 1, '1250': 1, '14': 1, '15': 1, '1752':...
1359    {'1': 1, '111': 1, '125': 1, '12500': 1, '14':...
1360    {'1': 112, '10': 25, '100': 1, '101': 1, '102'...
1361    {'1': 29, '10': 22, '100': 4, '1000': 2, '100×

Elapsed time: 45.734191 seconds.
Last 10:
1485    {'1': 111, '10': 65, '100': 11, '1000': 5, '10...
1486    {'1': 11, '10': 7, '100': 3, '1000': 4, '100th...
1487    {'1': 12, '10': 18, '100': 5, '10003': 1, '101...
1488    {'05': 1, '1': 65, '10': 19, '100': 10, '1000'...
1489    {'1': 3, '10': 4, '100': 9, '1000': 2, '100000...
1490    {'1': 6, '10': 3, '100': 1, '1013': 1, '102': ...
1491    {'1': 10, '10': 2, '100': 2, '101': 2, '102': ...
1492    {'0': 1, '0tn20': 1, '1': 148, '10': 80, '1001...
1493    {'0068': 1, '0095': 1, '0098': 1, '0202': 1, '...
1494    {'1': 4, '10': 6, '100': 1, '10000': 1, '10000...
dtype: object
Processing 1341-1351 out of 2355...
Elapsed time: 45.883623 seconds.
Last 10:
1495    {'07': 1, '0°': 1, '0°c': 6, '1': 23, '10': 21...
1496    {'1': 20, '10': 4, '100': 5, '1002': 1, '101':...
1498    {'014': 1, '014in': 1, '1': 50, '10': 24, '100...
1499    {'1': 2, '120': 1, '14': 1, '167': 1, '17': 1,...
1500    {'1': 124, '10': 28, '100': 5, '1000': 3, '100

Elapsed time: 48.029015 seconds.
Last 10:
1643    {'1': 1, '10': 3, '100': 3, '10000': 1, '10000...
1644    {'1': 48, '10': 1, '11': 1, '13': 3, '15': 3, ...
1645    {'000': 12, '1': 141, '10': 11, '100': 2, '100...
1646    {'1': 30, '10': 4, '1000': 1, '10e': 1, '11': ...
1647    {'1': 3, '10': 3, '100e': 1, '101e': 1, '102e'...
1648    {'1': 5, '100': 3, '1000': 3, '10000': 1, '100...
1649    {'1': 1, '10000': 1, '1000l': 1, '10th': 1, '1...
1650    {'1': 22, '10': 1, '100': 1, '1000': 5, '10000...
1651    {'1': 4, '100': 3, '1000': 1, '10000': 3, '100...
1652    {'0': 96, '1': 65, '10': 12, '100': 1, '105': ...
dtype: object
Processing 1471-1481 out of 2355...
Elapsed time: 48.185648 seconds.
Last 10:
1653    {'0': 1, '00': 1, '1': 4, '10': 1, '109': 1, '...
1654    {'1': 3, '10': 1, '1000': 1, '11': 1, '12': 1,...
1655    {'1': 4, '100': 1, '10000': 1, '10th': 3, '110...
1656    {'1': 4, '10': 3, '1000': 1, '11': 1, '115': 1...
1657    {'1': 6, '10': 4, '100': 3, '1000': 3, '10000'

Elapsed time: 49.823765 seconds.
Last 10:
1799    {'1': 2, '10': 3, '10000': 1, '11': 3, '12': 2...
1800    {'1': 65, '10': 58, '100': 41, '101': 4, '102'...
1801    {'10th': 1, '116th': 1, '11th': 1, '1650': 1, ...
1802    {'1': 6, '130': 1, '1800': 1, '501': 1, '91': ...
1803    {'010': 1, '10': 1, '12': 1, '2': 2, '20': 1, ...
1804    {'1': 1, '147': 1, '1891': 1, '1908': 1, '1909...
1805    {'1': 2, '12': 2, '12th': 1, '13th': 1, '1669'...
1806    {'1904': 1, '1911': 1, '23': 1, '25th': 1, 'ab...
1807    {'1': 2, '10': 2, '11': 2, '12': 2, '1291': 1,...
1808    {'1': 1, '10': 1, '100': 1, '11': 1, '1458': 1...
dtype: object
Processing 1611-1621 out of 2355...
Elapsed time: 50.027166 seconds.
Last 10:
1809    {'1813': 1, '1819': 2, '1832': 1, '1843': 1, '...
1810    {'1': 1, '10': 2, '11': 1, '12': 2, '14': 1, '...
1811    {'10': 1, '12': 2, '13': 1, '14': 1, '15': 2, ...
1812    {'10': 1, '15': 1, '1830': 1, '1862': 1, '1884...
1813    {'1843': 1, '184445': 1, '1845': 4, '1846': 1,

Elapsed time: 51.713873 seconds.
Last 10:
1941    {'1': 21, '10': 11, '1014': 1, '11': 11, '12':...
1942    {'1': 18, '10': 23, '100': 8, '101': 4, '102':...
1943    {'1': 5, '10': 3, '100': 1, '101': 1, '102': 1...
1944    {'1': 6, '10': 4, '11': 4, '12': 4, '13': 4, '...
1945    {'1': 16, '10': 5, '100': 3, '101': 1, '101—th...
1946    {'1': 3, '10': 3, '1041–2': 1, '1076–1153': 2,...
1947    {'1': 4, '10': 2, '100': 2, '101': 2, '102': 2...
1948    {'1': 4, '10': 2, '101–116': 2, '11': 2, '110'...
1949    {'1': 7, '10': 1, '100': 1, '101': 1, '102': 1...
1950    {'1': 14, '10': 2, '1016': 1, '1033': 1, '1050...
dtype: object
Processing 1741-1751 out of 2355...
Elapsed time: 51.834412 seconds.
Last 10:
1951    {'1': 5, '10': 2, '11': 1, '12': 1, '15': 1, '...
1952    {'1': 18, '10': 8, '104th': 2, '11': 10, '118'...
1953    {'1': 3, '10': 5, '100000': 1, '1089': 1, '11'...
1954    {'1': 2, '193': 1, '2': 1, '2006': 1, '25': 1,...
1955    {'1': 2, '2': 2, '2005': 1, '3': 1, 'abandon':

Elapsed time: 53.549048 seconds.
Last 10:
2084    {'1': 27, '10': 174, '100': 5, '100–101': 1, '...
2086    {'1': 34, '10': 22, '100': 2, '101': 4, '102':...
2087    {'1': 4, '10': 1, '104th': 1, '11': 2, '12': 1...
2091    {'1': 41, '10': 16, '100': 6, '1000': 1, '1000...
2092    {'1': 6, '10': 5, '100': 3, '101': 4, '102': 4...
2094    {'1': 85, '10': 10, '100': 4, '101': 4, '102':...
2098    {'1': 400, '10': 1, '102': 1, '114': 1, '117':...
2100    {'1': 11, '10': 4, '101': 2, '102': 1, '102a':...
2103    {'1': 3, '10': 3, '100': 1, '104': 1, '106110'...
2104    {'03': 1, '1': 495, '10': 133, '100': 2, '101'...
dtype: object
Processing 1871-1881 out of 2355...
Elapsed time: 53.758344 seconds.
Last 10:
2105    {'1': 178, '10': 50, '100': 2, '10000': 1, '10...
2106    {'1': 5, '10': 3, '100': 1, '103p': 1, '11': 1...
2108    {'1': 551, '10': 67, '100': 6, '1000': 1, '100...
2109    {'01': 1, '01001001': 1, '01001002': 1, '01001...
2110    {'0001001': 8, '01': 1, '01001001': 1, '010010

Elapsed time: 55.167026 seconds.
Last 10:
2239    {'0': 2, '003': 1, '0084': 1, '009': 1, '024':...
2240    {'1': 30, '10': 17, '100': 10, '1000': 6, '100...
2241    {'0°': 2, '1': 7, '10': 3, '100': 1, '1000': 1...
2242    {'0': 3, '0·0843': 2, '0·4': 1, '0·62': 1, '0·...
2244    {'0': 1, '000171': 1, '0008': 1, '00174': 1, '...
2245    {'0°': 4, '0°c': 1, '0·0001': 1, '0·0002': 1, ...
2246    {'0': 1, '100inch': 1, 'abbey': 1, 'abides': 3...
2248    {'1': 5, '10': 2, '100': 1, '1000': 2, '100000...
2249    {'0': 11, '06': 3, '08': 1, '1': 52, '10': 25,...
2250    {'1': 582, '10': 15, '100': 4, '101': 2, '102'...
dtype: object
Processing 2001-2011 out of 2355...
Elapsed time: 55.2716 seconds.
Last 10:
2251    {'1': 5, '10': 1, '11': 1, '12': 2, '12000': 1...
2252    {'00': 325, '0024': 1, '05': 1, '1': 310, '10'...
2253    {'1913': 2, '1919': 4, '22': 1, '22d': 1, '23'...
2254    {'1': 20, '100': 1, '138': 1, '139': 2, '141':...
2255    {'1': 7, '10': 1, '1000': 1, '100000': 17, '10..

Elapsed time: 56.937925 seconds.
Last 10:
2445    {'00000112': 1, '0002': 1, '0004': 1, '0025': ...
2447    {'1': 4, '10': 3, '100': 7, '1000': 1, '101': ...
2448    {'1': 3, '10': 2, '100': 4, '1000': 1, '101': ...
2456    {'00': 1, '1': 126, '10': 26, '100': 1, '101':...
2458    {'00': 11, '1': 26, '10': 5, '101': 1, '105': ...
2460    {'1': 29, '10': 10, '100': 10, '1000': 2, '100...
2461    {'1': 18, '10': 6, '100': 12, '101': 4, '102':...
2462    {'1': 60, '10': 17, '100': 5, '1000': 3, '1000...
2463    {'1': 26, '10': 19, '100': 11, '1000': 5, '100...
2465    {'0': 1, '000': 15, '07': 1, '0mm': 2, '1': 8,...
dtype: object
Processing 2161-2171 out of 2355...
Elapsed time: 57.057786 seconds.
Last 10:
2466    {'1': 39, '10': 18, '100': 17, '1001': 1, '101...
2469    {'000000000000000001': 1, '0000000000000000062...
2470    {'0': 7, '01': 1, '1': 31, '10': 25, '100': 18...
2471    {'1': 2, '10': 5, '100': 3, '1000': 5, '100000...
2472    {'1': 27, '10': 6, '100': 4, '1000': 4, '10000

Elapsed time: 58.506789 seconds.
Last 10:
2648    {'1': 3, '10': 2, '10th': 1, '11': 1, '12': 2,...
2649    {'0': 1, '1': 29, '10': 8, '100': 1, '10000': ...
2652    {'10': 2, '100': 2, '100000': 3, '105000': 1, ...
2653    {'1': 51, '10': 11, '100': 8, '10000': 2, '100...
2654    {'1': 15, '10': 6, '1000000000': 1, '10th': 1,...
2655    {'0c': 1, '1': 13, '10': 14, '100': 1, '100101...
2656    {'1': 1, '10': 1, '1620': 1, '1765': 1, '1797'...
2658    {'06': 1, '1': 46, '10': 23, '100': 13, '1000'...
2659    {'1': 13, '10': 20, '106': 3, '11': 17, '1163'...
2660    {'0': 3, '03': 1, '04': 3, '05': 6, '06': 9, '...
dtype: object
Processing 2291-2301 out of 2355...
Elapsed time: 58.678423 seconds.
Last 10:
2662    {'10th': 1, '117': 2, '12th': 1, '13': 1, '15t...
2663    {'1': 2, '10': 1, '100': 1, '100000': 1, '10th...
2664    {'06629': 1, '1': 27, '10': 12, '100': 5, '100...
2666    {'1': 37, '10': 9, '100': 1, '1000': 4, '1030'...
2667    {'0': 2, '1': 487, '10': 106, '100': 32, '1000

In [13]:
# Inspect the complete Series of per-document word-count dicts.
bagged_text

0       {'0': 2, '1': 57, '10': 39, '100': 16, '1000':...
1       {'0': 3, '020': 1, '075': 1, '1': 68, '10': 13...
2       {'0': 5, '0c': 8, '0i': 2, '0·22': 1, '0·30': ...
3       {'1': 2, '10th': 1, '12×18': 1, '15': 2, '150'...
4       {'0': 1, '002': 1, '004': 1, '007013': 1, '1':...
                              ...                        
2727    {'1': 1, '10': 2, '1000': 2, '10101': 1, '104'...
2728    {'1': 27, '10': 14, '100': 4, '1000': 2, '1000...
2729    {'1': 4, '100': 2, '10000': 1, '105': 1, '10th...
2730    {'0': 18, '1': 2, '10': 4, '100': 3, '1000': 3...
2731    {'1': 9, '10': 13, '100': 10, '1000': 6, '1000...
Length: 2355, dtype: object

Well, the results are still messy, but the process appears to be working.

In [40]:
# TF count vectorization function
def count_vectorize(tokenized_text):
    """Return the raw term frequencies of a single tokenized document.

    Parameters
    ----------
    tokenized_text : iterable of hashable tokens (typically a list of str).

    Returns
    -------
    dict
        token -> occurrence count, with keys inserted in sorted order.
    """
    term_counts = {}
    # Seed every distinct term with zero first so the keys end up sorted.
    for term in sorted(set(tokenized_text)):
        term_counts[term] = 0

    for term in tokenized_text:
        term_counts[term] += 1

    return term_counts

# IDF function
def inverse_document_frequency(list_of_token_texts):
    """Compute the inverse document frequency of every word in a corpus.

    For each distinct word, idf = ln(N / df), where N is the number of
    documents and df is the number of documents containing the word.

    Parameters
    ----------
    list_of_token_texts : sized iterable of token lists (one per document).
        It is traversed once and must support ``len()``.

    Returns
    -------
    dict
        word -> idf value, with keys inserted in sorted order. An empty
        corpus yields an empty dict.
    """
    num_texts = len(list_of_token_texts)

    # Count, in a single pass, how many documents each word appears in.
    # De-duplicating each document's tokens with set() means a word is counted
    # at most once per document, and avoids scanning every document's token
    # list once per vocabulary word.
    document_frequency = {}
    for text in list_of_token_texts:
        for word in set(text):
            document_frequency[word] = document_frequency.get(word, 0) + 1

    # Every word in document_frequency occurs in at least one document, so
    # the division below can never be by zero.
    return {
        word: np.log(num_texts / document_frequency[word])
        for word in sorted(document_frequency)
    }

# TF-IDF
def tf_idf(list_of_token_texts):
    """Compute dense TF-IDF weights for every document in a corpus.

    Parameters
    ----------
    list_of_token_texts : sized iterable of token lists (one per document).
        It is traversed more than once, so it must not be a one-shot iterator.

    Returns
    -------
    list of dict
        One dict per document, in input order. Each dict has an entry for
        every word of the corpus vocabulary: raw term count * idf for words
        present in the document, and 0 for words that are absent.
    """
    unique_words = set([item for sublist in list_of_token_texts for item in sublist])

    idf = inverse_document_frequency(list_of_token_texts)

    tf_idf_list_of_dicts = []
    for text_tokens in list_of_token_texts:
        text_tf = count_vectorize(text_tokens)
        # text_tf's keys are exactly the document's distinct tokens, so a dict
        # membership test replaces a linear scan of the token list for every
        # vocabulary word.
        doc_tf_idf = {
            word: (text_tf[word] * idf[word]) if word in text_tf else 0
            for word in unique_words
        }
        tf_idf_list_of_dicts.append(doc_tf_idf)

    return tf_idf_list_of_dicts

With our functions ready to roll, it's time to finish the preparation process. Let's split our X (the tokenized_texts) and y (the encoded Bookshelf labels) into training and test sets.

In [37]:
# Hold out 30% of the documents for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

And now we set up our analysis pipeline.

In [None]:
def main(token_iterable, labels):
    """Run the TF-IDF step of the pipeline over the tokenized documents.

    Parameters
    ----------
    token_iterable : the tokenized documents, passed straight to tf_idf.
    labels : accepted for the pipeline signature but not used here.

    Returns
    -------
    Whatever tf_idf(token_iterable) returns.
    """
    return tf_idf(token_iterable)

# Run the pipeline on the training split only.
tf_idf_all_docs = main(X_train, y_train)