With our cleaned corpora ready, we can now begin the process of building our model. Next we vectorize the words by the Bag of Words approach to prepare for Term Frequency - Inverse Document Frequency (TF-IDF) analysis.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
from datetime import datetime

from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
path_to_data = "Data/bookshelf_data.csv"
text_details = pd.read_csv(path_to_data, index_col=0)
text_details.head()

Unnamed: 0,Title,Author,Link,ID,Bookshelf,Text
0,The Extermination of the American Bison,William T. Hornaday,http://www.gutenberg.org/ebooks/17748,17748,Animal,[Illustration: (Inscription) Mr. Theodore Roos...
1,Deadfalls and Snares,A. R. Harding,http://www.gutenberg.org/ebooks/34110,34110,Animal,DEADFALLS AND SNARES [Frontispiece: A GOOD DEA...
2,Artistic Anatomy of Animals,Édouard Cuyer,http://www.gutenberg.org/ebooks/38315,38315,Animal,+---------------------------------------------...
3,"Birds, Illustrated","Color Photography, Vol. 1, No. 1 Various",http://www.gutenberg.org/ebooks/30221,30221,Animal,FROM: THE PRESIDENT OF THE NATIONAL TEACHERS' ...
4,On Snake-Poison: Its Action and Its Antidote,A. Mueller,http://www.gutenberg.org/ebooks/32947,32947,Animal,[Illustration] ON SNAKE-POISON. ITS ACTION AND...


In [3]:
# we only need the Bookshelf column to serve as the target for our Text series, so let's extract that as y
y = text_details.Bookshelf

# now we encode our labels into integers so that our model can work with them
le = LabelEncoder()
y = le.fit_transform(y)

In [4]:
le.classes_

array(['6', 'Adventure', 'Africa', 'American', 'Anarchism', 'Animal',
       'Animals-Domestic', 'Animals-Wild', 'Animals-Wild-Birds',
       'Animals-Wild-Insects', 'Animals-Wild-Mammals',
       'Animals-Wild-Reptiles', 'Animals-Wild-Trapping', 'Anthropology',
       'Archaeology', 'Architecture', 'Argentina', 'Art', 'Astronomy',
       'Atheism', 'Australia', "Bahá'í", 'Bibliomania', 'Biographies',
       'Biology', 'Boer', 'Botany', 'British', 'Buddhism', 'Bulgaria',
       'CIA', 'Camping', 'Canada', 'Canon', 'Chemistry', "Child's",
       "Children's", 'Christianity', 'Christmas', 'Classical',
       'Cookbooks', 'Crafts', 'Crime', 'Current', 'Czech', 'Detective',
       'Ecology', 'Education', 'Egypt', 'Engineering', 'English',
       'Erotic', 'FR', 'Fantasy', 'Folklore', 'France', 'General',
       'Geology', 'German', 'Germany', 'Gothic', 'Greece', 'Harvard',
       'Hinduism', 'Historical', 'Horror', 'Humor', 'IT', 'India',
       'Islam', 'Italy', 'Journal', 'Judaism', 'Lan

In [5]:
path_to_data = "Data/cleaned_texts.csv"
texts = pd.read_csv(path_to_data, index_col=0, squeeze=True)
texts.head()

0    illustration inscription mr theodore roosevelt...
1    deadfalls snare frontispiece good deadfall dea...
2    transcriber note transcription use etext texts...
3    president national teacher association state n...
4    illustration snakepoison action antidote muell...
Name: 0, dtype: object

In [6]:
len(texts)

2355

In [7]:
tokenized_texts = texts.apply(word_tokenize)
print(len(tokenized_texts))
print(tokenized_texts[0])

# tokenized_texts will also serve as our X, so let's assign that now
X = tokenized_texts

2355
['illustration', 'inscription', 'mr', 'theodore', 'roosevelt', 'author', 'hunt', 'trip', 'ranchman', 'compliment', 'author', 'wt', 'hornaday', 'smithsonian', 'institution', 'united', 'state', 'national', 'museum', 'extermination', 'american', 'bison', 'william', 'hornaday', 'superintendent', 'national', 'zoological', 'park', 'report', 'national', 'museum', '188687', 'page', '369548', 'plate', 'ixxii', 'washington', 'government', 'printing', 'office', '1889', 'illustration', 'group', 'american', 'bison', 'national', 'museum', 'collect', 'mount', 'w', 'hornaday', 'content', 'prefatory', 'note', 'part', 'ithe', 'life', 'history', 'bison', 'discovery', 'specie', 'ii', 'geographical', 'distribution', 'iii', 'abundance', 'iv', 'character', 'specie', '1', 'buffalo', 'rank', 'amongst', 'ruminant', '2', 'change', 'form', 'captivity', '3', 'mount', 'specimen', 'museum', '4', 'calf', '5', 'yearling', '6', 'spike', 'bull', '7', 'adult', 'bull', '8', 'cow', 'third', 'year', '9', 'adult', 'cow'

# count vectorization function
def count_vectorize(tokenized_text):
    word_counts = {word:0 for word in sorted(set(tokenized_text))}
    
    for word in tokenized_text:
        word_counts[word] += 1
        
    return word_counts

# %%timeit
# testing our count function
count_vectorize(tokenized_texts[0])

The first iteration revealed that we left numbers in the text corpora, so we can safely drop those. However, that will be for a later version due to the current time constraints.

It otherwise appears to be working as expected, so let's bag some words!

first_bag = count_vectorize(tokenized_texts[0])
bagged_text = pd.Series(data=[first_bag])
increment = 10
start = datetime.now()
for i in range(1, len(tokenized_texts), increment):
    print("Processing {}-{} out of {}...".format(i, i + increment, len(tokenized_texts)))
    bag_set = tokenized_texts[i:i+increment].apply(count_vectorize)
    bagged_text = pd.concat([bagged_text, bag_set])
    print("Elapsed time: {} seconds.".format((datetime.now() - start).total_seconds()))
    print("Last {}:\n{}".format(increment, bagged_text[-increment:]))
print("Complete.")

bagged_text

Well, the results are still messy, but the process appears to be working.

In [8]:
# TF count vectorization function
def count_vectorize(tokenized_text):
    word_counts = {word:0 for word in sorted(set(tokenized_text))}
    
    for word in tokenized_text:
        word_counts[word] += 1
        
    return word_counts

# IDF function
def inverse_document_frequency(list_of_token_texts):
    num_texts = len(list_of_token_texts)
    unique_words = set([word for text in list_of_token_texts for word in text])
    
    idf = {word:0 for word in sorted(unique_words)}
    
    for word in unique_words:
        num_texts_with_word = 0
        for text in list_of_token_texts:
            num_texts_with_word += (word in text)
        idf[word] = np.log( num_texts / num_texts_with_word)
    
    return idf

# TF-IDF
def tf_idf(list_of_token_texts):
    unique_words = set([item for sublist in list_of_token_texts for item in sublist])
    
    idf = inverse_document_frequency(list_of_token_texts)
    
    tf_idf_list_of_dicts = []
    for text_tokens in list_of_token_texts:
        text_tf = count_vectorize(text_tokens)
        doc_tf_idf = {word:0 for word in unique_words}
        for word in unique_words:
            if word in text_tokens:
                doc_tf_idf[word] = text_tf[word] * idf[word]
            else:
                doc_tf_idf[word] = 0
        tf_idf_list_of_dicts.append(doc_tf_idf)
        
    return tf_idf_list_of_dicts

With our functions ready to roll, it's time to finish the preparation process. Let's split our X (which is tokenized_texts) and y (which are the Bookshelf labels) into training and test sets.

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

And now we set up our analysis pipeline.

In [10]:
def main(token_iterable, labels):
    tf_idf_all_docs = tf_idf(token_iterable)
    return tf_idf_all_docs

tf_idf_all_docs = main(X_train, y_train)

KeyboardInterrupt: 