In [1]:
import json
from csv import QUOTE_NONNUMERIC

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split

# This is needed later for the sentence tokenizer to work
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
!wget "https://storage.googleapis.com/menagerie/bert_demos/gutenberg-metadata.json"

--2019-11-14 15:06:35--  https://storage.googleapis.com/menagerie/bert_demos/gutenberg-metadata.json
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.21.176, 2a00:1450:400f:80a::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.21.176|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57466731 (55M) [application/json]
Saving to: ‘gutenberg-metadata.json.1’


2019-11-14 15:06:43 (8.03 MB/s) - ‘gutenberg-metadata.json.1’ saved [57466731/57466731]



In [2]:
OUTPUT_ROOT = "/data"
# TODO: download this from a bucket
metadata_file = "./gutenberg-metadata.json"

with open(metadata_file, "r") as read_file:
    metadata_dict = json.load(read_file)

metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')

In [3]:
# For simplicity, we remove all books that have more than a single author, title, language or rights
metadata_columns = ['author', 'title', 'rights', 'language']
for x in metadata_columns:
    metadata[x + "_length"] = [len(x) for x in metadata[x]]

clean_metadata = metadata.query(
    "_length == 1 and ".join(metadata_columns) + '_length == 1'
).copy(deep=True)

for column in metadata_columns:
    clean_metadata[column] = [x[0] for x in clean_metadata[column]]
    clean_metadata = clean_metadata.drop(column + "_length", axis=1)

clean_metadata.sample(3)

Unnamed: 0,author,formaturi,language,rights,subject,title
33668,"Darwin, Francis, Sir",[http://www.gutenberg.org/files/33668/33668-h....,en,Public domain in the USA.,"[AC, Essays]",Springtime and Other Essays
44073,Levi,[http://www.gutenberg.org/cache/epub/44073/pg4...,en,Public domain in the USA.,"[BT, Akashic records, Jesus Christ -- Biograph...",The Aquarian Gospel of Jesus the Christ\r\nThe...
17943,"Jerome, Jerome K. (Jerome Klapka)",[http://www.gutenberg.org/files/17943/17943-h/...,en,Public domain in the USA.,[England -- Social life and customs -- Fiction...,The Observations of Henry


In [4]:
# This code can be used to look up specific authors or book titles
clean_metadata[clean_metadata['title'].str.match('.*Divine.*', case=True, flags=0)].head(3)

Unnamed: 0,author,formaturi,language,rights,subject,title
1001,Dante Alighieri,"[http://www.gutenberg.org/ebooks/1001.rdf, htt...",en,Public domain in the USA.,[Italian poetry -- To 1400 -- Translations int...,"Divine Comedy, Longfellow's Translation, Hell"
1002,Dante Alighieri,[http://www.gutenberg.org/files/1002/1002-h/10...,en,Public domain in the USA.,[Italian poetry -- To 1400 -- Translations int...,"Divine Comedy, Longfellow's Translation, Purga..."
1003,Dante Alighieri,[http://www.gutenberg.org/files/1003/1003-h/10...,en,Public domain in the USA.,[Italian poetry -- To 1400 -- Translations int...,"Divine Comedy, Longfellow's Translation, Paradise"


In [5]:
clean_metadata[clean_metadata['author'].str.match('.*Selma.*', case=True, flags=0)].head(3)

Unnamed: 0,author,formaturi,language,rights,subject,title
5161,"Lagerlöf, Selma","[http://www.gutenberg.org/files/5161/5161.txt,...",en,Public domain in the USA.,"[Bohuslän (Sweden) -- Fiction, Sweden -- Histo...",The Treasure
10935,"Lagerlöf, Selma",[http://www.gutenberg.org/ebooks/10935.html.no...,en,Public domain in the USA.,"[Fantasy literature, Conduct of life -- Juveni...",The Wonderful Adventures of Nils
14273,"Lagerlöf, Selma",[http://www.gutenberg.org/ebooks/14273.html.no...,en,Public domain in the USA.,"[Short stories, Fiction, PT]",Invisible Links


In [6]:
# This is a hand-compiled list of the top 100 ebooks from Project Gutenberg
# from the Last 30 Days list as was visible on 2019-11-14
# (https://www.gutenberg.org/browse/scores/top#books-last30)

MIN_CHARACTERS_PER_BOOK = 100000
DATASET_IDENTIFIER = 'top100_20191114'

books_list = sorted(list(set([
84, 1342, 25525, 1080, 43, 16328, 2701, 41, 98, 2542,
46, 11, 219, 345, 25344, 1661, 205, 1952, 76, 844,
1260, 5200, 1250, 2591, 4300, 174, 408, 74, 160, 1232,
23, 1635, 16, 58975, 1400, 2600, 1497, 2554, 3207, 6130,
5740, 120, 2852, 158, 203, 2814, 1184, 36, 28054, 209,
514, 768, 33283, 2500, 11030, 3600, 3825, 57490, 45, 20203,
2148, 15399, 829, 244, 135, 55, 140, 7370, 2680,
19942, 42324, 3296, 161, 58585, 1998, 972, 4363, 375, 35,
1251, 34901, 16643, 45502, 851, 779, 1001, 1322, 3176, 730,
815, 100, 5827, 1727, 1934, 786, 521, 236, 28860, 42108,
])))

books_list_str = [str(x) for x in books_list]
books_list[:3]

[11, 16, 23]

In [7]:
chosen_books_metadata = clean_metadata.loc[books_list_str, :]
chosen_books_metadata.sample(3)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,author,formaturi,language,rights,subject,title
11030,"Jacobs, Harriet A. (Harriet Ann)",[http://www.gutenberg.org/files/11030/11030-h/...,en,Public domain in the USA.,"[Slaves -- United States -- Social conditions,...","Incidents in the Life of a Slave Girl, Written..."
209,"James, Henry",[http://www.gutenberg.org/ebooks/209.kindle.im...,en,Public domain in the USA.,"[England -- Fiction, Governesses -- Fiction, C...",The Turn of the Screw
74,"Twain, Mark",[http://www.gutenberg.org/ebooks/74.kindle.ima...,en,Public domain in the USA.,"[Humorous stories, Sawyer, Tom (Fictitious cha...",The Adventures of Tom Sawyer


In [8]:
# This is a wrapper for downloading and cleaning up the text 
# of each book from Project Gutenberg
def download_book(book_id):
    try:
        text = strip_headers(load_etext(book_id)).strip()
    except:
        print(f"Could not download book {book_id}")
        text = ""
    return text

texts_col = [download_book(x) for x in books_list]

Could not download book 5740
Could not download book 33283


In [9]:
chosen_books_df = chosen_books_metadata.copy(deep=True)
chosen_books_df['text'] = texts_col
chosen_books_df['text_length'] = [len(x) for x in texts_col]
chosen_books_filtered = chosen_books_df \
  .dropna().query(f"text_length >= {MIN_CHARACTERS_PER_BOOK}") \
  .query("language == 'en'").query("rights == 'Public domain in the USA.'") \
  .query("author != 'Various'") \
  .sort_values('text_length')

print(chosen_books_filtered.shape)
chosen_books_filtered.head(3)

(81, 8)


Unnamed: 0,author,formaturi,language,rights,subject,title,text,text_length
851,"Rowlandson, Mary White",[http://www.gutenberg.org/ebooks/851.epub.imag...,en,Public domain in the USA.,"[Rowlandson, Mary White, approximately 1635-17...",Narrative of the Captivity and Restoration of ...,NARRATIVE OF THE CAPTIVITY AND RESTORATION OF ...,104401
1250,"Rand, Ayn",[http://www.gutenberg.org/files/1250/1250-h.zi...,en,Public domain in the USA.,"[Individuality -- Fiction, Time travel -- Fict...",Anthem,ANTHEM\n\n by Ayn Rand\n\n\n CONTE...,108043
844,"Wilde, Oscar","[http://www.gutenberg.org/files/844/844.txt, h...",en,Public domain in the USA.,"[Foundlings -- Drama, Identity (Psychology) --...",The Importance of Being Earnest: A Trivial Com...,Transcribed from the 1915 Methuen & Co. Ltd. e...,119549


In [10]:
# Which authors do we have the most and least text from?
with pd.option_context('display.max_rows', 6):
    print(chosen_books_filtered.groupby('author')['text_length'].sum().sort_values())

author
Rowlandson, Mary White     104401
Rand, Ayn                  108043
Marlowe, Christopher       124324
                           ...   
Tolstoy, Leo, graf        3208285
Hugo, Victor              3231713
Dickens, Charles          3386829
Name: text_length, Length: 63, dtype: int64


In [11]:
# To make this work as a classification problem, we have to break up the text in sentence-sized pieces
chosen_books_filtered['sentences'] = [sent_tokenize(x) for x in chosen_books_filtered['text']]
chosen_books_filtered['sentences_length'] = [len(x) for x in chosen_books_filtered['sentences']]

chosen_books_filtered.sort_values('sentences_length').head(3)

Unnamed: 0,author,formaturi,language,rights,subject,title,text,text_length,sentences,sentences_length
851,"Rowlandson, Mary White",[http://www.gutenberg.org/ebooks/851.epub.imag...,en,Public domain in the USA.,"[Rowlandson, Mary White, approximately 1635-17...",Narrative of the Captivity and Restoration of ...,NARRATIVE OF THE CAPTIVITY AND RESTORATION OF ...,104401,[NARRATIVE OF THE CAPTIVITY AND RESTORATION OF...,777
43,"Stevenson, Robert Louis",[http://www.gutenberg.org/ebooks/43.kindle.noi...,en,Public domain in the USA.,"[Multiple personality -- Fiction, London (Engl...",The Strange Case of Dr. Jekyll and Mr. Hyde,The Strange Case Of Dr. Jekyll And Mr. Hyde\n\...,138800,[The Strange Case Of Dr. Jekyll And Mr. Hyde\n...,956
11,"Carroll, Lewis",[http://www.gutenberg.org/ebooks/11.epub.image...,en,Public domain in the USA.,"[Imaginary places -- Juvenile fiction, Fantasy...",Alice's Adventures in Wonderland,ALICE’S ADVENTURES IN WONDERLAND\n\nLewis Carr...,144435,[ALICE’S ADVENTURES IN WONDERLAND\n\nLewis Car...,977


In [12]:
# Too short sentences might contain too little information, so they are attached to the following one
def collate_short_sentences(sentences, threshold=100):
    tmp = ''
    ss = []
    for x in sentences:
        tmp += x
        if len(tmp) > threshold:
            ss.append(tmp)
            tmp = ''
    return ss

chosen_books_filtered['sentences_min100'] = [collate_short_sentences(x) for x in chosen_books_filtered['sentences']]
chosen_books_filtered['sentences_min100_length'] = [len(x) for x in chosen_books_filtered['sentences_min100']]
chosen_books_filtered.head(3)

Unnamed: 0,author,formaturi,language,rights,subject,title,text,text_length,sentences,sentences_length,sentences_min100,sentences_min100_length
851,"Rowlandson, Mary White",[http://www.gutenberg.org/ebooks/851.epub.imag...,en,Public domain in the USA.,"[Rowlandson, Mary White, approximately 1635-17...",Narrative of the Captivity and Restoration of ...,NARRATIVE OF THE CAPTIVITY AND RESTORATION OF ...,104401,[NARRATIVE OF THE CAPTIVITY AND RESTORATION OF...,777,[NARRATIVE OF THE CAPTIVITY AND RESTORATION OF...,539
1250,"Rand, Ayn",[http://www.gutenberg.org/files/1250/1250-h.zi...,en,Public domain in the USA.,"[Individuality -- Fiction, Time travel -- Fict...",Anthem,ANTHEM\n\n by Ayn Rand\n\n\n CONTE...,108043,[ANTHEM\n\n by Ayn Rand\n\n\n CONT...,1163,[ANTHEM\n\n by Ayn Rand\n\n\n CONT...,631
844,"Wilde, Oscar","[http://www.gutenberg.org/files/844/844.txt, h...",en,Public domain in the USA.,"[Foundlings -- Drama, Identity (Psychology) --...",The Importance of Being Earnest: A Trivial Com...,Transcribed from the 1915 Methuen & Co. Ltd. e...,119549,[Transcribed from the 1915 Methuen & Co. Ltd. ...,3274,[Transcribed from the 1915 Methuen & Co. Ltd. ...,842


In [13]:
# This "unrolls" all sentences from the same book so that each is in its own row
# R's tidyr::spread() would do this much more elegantly, but such is life

chosen_sentences = chosen_books_filtered.sentences_min100.apply(pd.Series) \
    .merge(chosen_books_filtered[['title', 'author']], right_index = True, left_index = True) \
    .melt(id_vars = ['title', 'author'], value_name = "sentence") \
    .drop("variable", axis = 1).dropna()

assert sum(chosen_books_filtered['sentences_min100_length']) == len(chosen_sentences)

In [14]:
# Now that we have a dataset of sentences, we can select some to build a model
VAL_PCT = 0.25
MAX_TRAINING_EXAMPLES_PER_AUTHOR = 1500
MAX_VAL_EXAMPLES_PER_AUTHOR = int(MAX_TRAINING_EXAMPLES_PER_AUTHOR * VAL_PCT)
MAX_SENTENCE_LENGTH = 10000
SHUFFLE_SEED = 20191114

train, val = train_test_split(chosen_sentences, 
                              test_size = VAL_PCT, 
                              stratify=chosen_sentences['author'],
                              random_state=SHUFFLE_SEED,
                              shuffle=True)

train = train.copy(deep=True)
val = val.copy(deep=True)

train['subset'] = 'train'
val['subset'] = 'val'
train['sequence_number'] = train.groupby('author').cumcount()
val['sequence_number'] = val.groupby('author').cumcount()

# This has the effect of picking a random sample stratified by author 
# (train_test_split has already shuffled the data) taking all the data 
# if the specific author has fewer than the desired number of examples 
semibalanced_dataset = pd.concat([
    train.query(f'sequence_number < {MAX_TRAINING_EXAMPLES_PER_AUTHOR}'), 
    val.query(f'sequence_number < {MAX_VAL_EXAMPLES_PER_AUTHOR}')
]).drop('sequence_number', axis=1)

print(semibalanced_dataset.shape)

(102777, 4)


In [15]:
output_filename = f'{OUTPUT_ROOT}/semibalanced_dataset_' + \
f'{DATASET_IDENTIFIER}_{MAX_TRAINING_EXAMPLES_PER_AUTHOR}_{MAX_VAL_EXAMPLES_PER_AUTHOR}.csv'
metadata_output_filename = output_filename.replace('.csv', '_metadata.csv')

chosen_books_filtered[['author', 'title', 'text_length', 'sentences_min100_length']].to_csv(
    metadata_output_filename, index=False, quoting = QUOTE_NONNUMERIC
)

semibalanced_dataset['length'] = [len(x) for x in semibalanced_dataset['sentence']]
semibalanced_dataset.query(f"length <= {MAX_SENTENCE_LENGTH}").to_csv(
    output_filename, index=False, quoting = QUOTE_NONNUMERIC
)

semibalanced_dataset.sort_values('length', ascending=False)

Unnamed: 0,title,author,sentence,subset,length
653502,Ulysses,"Joyce, James",Where?•\n\n\n[ 18 ]\n\nYes because he never di...,train,55098
778407,Essays of Michel de Montaigne — Complete,"Montaigne, Michel de",Obedience is never pure nor calm in him who re...,train,12500
25651,Leaves of Grass,"Whitman, Walt",By the city’s quadrangular houses--in log huts...,train,6720
184233,How the Other Half Lives: Studies Among the Te...,"Riis, Jacob A. (Jacob August)","Population of New York, 1880 (census) ...",train,6625
67771,Leaves of Grass,"Whitman, Walt","3\n The log at the wood-pile, the axe support...",val,5450
...,...,...,...,...,...
58759,The Picture of Dorian Gray,"Wilde, Oscar",But there was no motive power in\nexperience.I...,train,101
92857,Meditations,"Marcus Aurelius, Emperor of Rome","Reason is of a diffusive nature, what itself i...",train,101
299456,Les Misérables,"Hugo, Victor","Nevertheless, some Hanoverian battalions yield...",train,101
8382,How the Other Half Lives: Studies Among the Te...,"Riis, Jacob A. (Jacob August)","There was no other explanation, and none was n...",train,101


In [22]:
semibalanced_dataset['author'].nunique(), semibalanced_dataset['title'].nunique(), len(semibalanced_dataset)

(63, 80, 102777)