Code takes from Matei Bejan's '15000 Gutenberg Books' Kaggle submission.
Used under the Creative Commons BY-NC-SA 4.0 License (https://creativecommons.org/licenses/by-nc-sa/4.0/).

Modified and adapted by Alex White.

This code will use the pre-generated gutenberg_metadata.csv file to download the texts for 15000 books from Project Gutenberg.

In [2]:
!python --version

Python 3.8.5


In [5]:
!pip install bsddb3-6.2.9-cp38-cp38-win_amd64.whl
!pip install Gutenberg
!pip install requests



In [6]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import urlopen
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

# only removes funny tokens for English texts
def remove_funny_tokens(text):
    tokens = text.split()
    sample = ' '.join(' '.join(tokens).replace('xe2x80x9c', ' ').replace('xe2x80x9d', ' ')\
                                      .replace('xe2x80x94', ' ').replace('xe2x80x99', "'")\
                                      .replace('xe2x80x98', "'").split())
    return sample

# clean newlines, carriage returns and tabs
def clean_text(text):
    cleaned_listed_text = []
    listed_text = list(text)

    for iter in range(len(listed_text) - 1):
        if (listed_text[iter] == '\\' and listed_text[iter + 1] == 'n') or \
            (listed_text[iter] == 'n' and listed_text[iter - 1] == '\\'):
            continue
        elif listed_text[iter] == '\\' and listed_text[iter + 1] == 'r' or \
            (listed_text[iter] == 'r' and listed_text[iter - 1] == '\\'):
            continue
        elif listed_text[iter] == '\\' and listed_text[iter + 1] == 't' or \
            (listed_text[iter] == 't' and listed_text[iter - 1] == '\\'):
            continue
        elif listed_text[iter] == '\\':
            continue
        else:
            cleaned_listed_text.append(listed_text[iter])

    cleaned_text = ''.join([str(char) for char in cleaned_listed_text])
    cleaned_text = remove_funny_tokens(cleaned_text)

    return ''.join(cleaned_text)

In [8]:
df_metadata = pd.read_csv('Data/gutenberg_metadata.csv')

data = {'Author': None, 'Title': None, 'Link': None, 'ID': None, 'Bookshelf': None, 'Text': None}

for key, row in df_metadata.iterrows():
    if data['Author'] == None:
        data['Author'] = [row['Author']]
    else:
        data['Author'].append(row['Author'])
    
    if data['Title'] == None:
        data['Title'] = [row['Title']]
    else:
        data['Title'].append(row['Title'])
    
    if data['Link'] == None:
        data['Link'] = [row['Link']]
    else:
        data['Link'].append(row['Link'])
    
    book_id = int(row['Link'].split('/')[-1])

    if data['ID'] == None:
        data['ID'] = [book_id]
    else:
        data['ID'].append(book_id)
    
    if data['Bookshelf'] == None:
        data['Bookshelf'] = [row['Bookshelf']]
    else:
        data['Bookshelf'].append(row['Bookshelf'])

    text = np.nan
    try:
        text = strip_headers(load_etext(etextno=book_id, 
                                        mirror='http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/')).strip()
        text = ' '.join(' '.join(' '.join(text.split('\n')).split('\t')).split('\r'))
        text = ' '.join(text.split())
        text = clean_text(str(text))
    except:
        try: 
            page = requests.get(row['Link'])
            soup = BeautifulSoup(page.content, 'html.parser')
            text_link = 'http://www.gutenberg.org' + soup.find_all("a", string="Plain Text UTF-8")[0]['href']
            http_response_object = urlopen(text_link)

            text = strip_headers(str(http_response_object.read()))
            text = ' '.join(' '.join(' '.join(text.split('\n')).split('\t')).split('\r'))
            text = ' '.join(text.split())
            text = clean_text(str(text))
        except:
            print("Couldn't acquire text for " + row['Title'] + ' with ID ' + str(book_id) + '. Link: ' + row['Link'])
            
    if data['Text'] == None:
        data['Text'] = [' '.join(text.split(' '))]
    else:
        try:
            data['Text'].append(' '.join(text.split(' ')))
        except:
            data['Text'].append(None)
            print("Couldn't save data for " + row['Title'] + ' with ID ' + str(book_id) + '. Link: ' + row['Link'])

Couldn't acquire text for Ragged Dick with ID 20689. Link: http://www.gutenberg.org/ebooks/20689
Couldn't save data for Ragged Dick with ID 20689. Link: http://www.gutenberg.org/ebooks/20689
Couldn't acquire text for The Owl and the Pussycat with ID 23897. Link: http://www.gutenberg.org/ebooks/23897
Couldn't save data for The Owl and the Pussycat with ID 23897. Link: http://www.gutenberg.org/ebooks/23897
Couldn't acquire text for Don Quixote, Volume 1 with ID 28842. Link: http://www.gutenberg.org/ebooks/28842
Couldn't save data for Don Quixote, Volume 1 with ID 28842. Link: http://www.gutenberg.org/ebooks/28842
Couldn't acquire text for The Gift of the Magi with ID 22440. Link: http://www.gutenberg.org/ebooks/22440
Couldn't save data for The Gift of the Magi with ID 22440. Link: http://www.gutenberg.org/ebooks/22440
Couldn't acquire text for The Garden Party, and Other Stories with ID 26463. Link: http://www.gutenberg.org/ebooks/26463
Couldn't save data for The Garden Party, and Other 

Couldn't acquire text for Treatise on Thermodynamics with ID 50880. Link: http://www.gutenberg.org/ebooks/50880
Couldn't save data for Treatise on Thermodynamics with ID 50880. Link: http://www.gutenberg.org/ebooks/50880
Couldn't acquire text for Notes on Recent Researches in Electricity and Magnetism with ID 36525. Link: http://www.gutenberg.org/ebooks/36525
Couldn't save data for Notes on Recent Researches in Electricity and Magnetism with ID 36525. Link: http://www.gutenberg.org/ebooks/36525
Couldn't acquire text for Relativity: The Special & the General Theory with ID 36114. Link: http://www.gutenberg.org/ebooks/36114
Couldn't save data for Relativity: The Special & the General Theory with ID 36114. Link: http://www.gutenberg.org/ebooks/36114
Couldn't acquire text for Utility of Quaternions in Physics with ID 26262. Link: http://www.gutenberg.org/ebooks/26262
Couldn't save data for Utility of Quaternions in Physics with ID 26262. Link: http://www.gutenberg.org/ebooks/26262
Couldn't

Couldn't acquire text for At The Bay with ID 20267. Link: http://www.gutenberg.org/ebooks/20267
Couldn't save data for At The Bay with ID 20267. Link: http://www.gutenberg.org/ebooks/20267
Couldn't acquire text for Wonderwings and other Fairy Stories with ID 26591. Link: http://www.gutenberg.org/ebooks/26591
Couldn't save data for Wonderwings and other Fairy Stories with ID 26591. Link: http://www.gutenberg.org/ebooks/26591
Couldn't acquire text for The Awful German Language with ID 20595. Link: http://www.gutenberg.org/ebooks/20595
Couldn't save data for The Awful German Language with ID 20595. Link: http://www.gutenberg.org/ebooks/20595
Couldn't acquire text for The Life, Adventures and Piracies of the Famous Captain Singleton with ID 9679. Link: http://www.gutenberg.org/ebooks/9679
Couldn't save data for The Life, Adventures and Piracies of the Famous Captain Singleton with ID 9679. Link: http://www.gutenberg.org/ebooks/9679
Couldn't acquire text for The Amateur Cracksman with ID 20

Couldn't acquire text for On the Origin of Species with ID 21153. Link: http://www.gutenberg.org/ebooks/21153
Couldn't save data for On the Origin of Species with ID 21153. Link: http://www.gutenberg.org/ebooks/21153
Couldn't acquire text for The Integration of Functions of a Single Variable with ID 38993. Link: http://www.gutenberg.org/ebooks/38993
Couldn't save data for The Integration of Functions of a Single Variable with ID 38993. Link: http://www.gutenberg.org/ebooks/38993
Couldn't acquire text for Solid Geometry with Problems and Applications (Revised edition) with ID 29807. Link: http://www.gutenberg.org/ebooks/29807
Couldn't save data for Solid Geometry with Problems and Applications (Revised edition) with ID 29807. Link: http://www.gutenberg.org/ebooks/29807
Couldn't acquire text for La géométrie with ID 26400. Link: http://www.gutenberg.org/ebooks/26400
Couldn't save data for La géométrie with ID 26400. Link: http://www.gutenberg.org/ebooks/26400
Couldn't acquire text for Th

Couldn't acquire text for The Return of Sherlock Holmes with ID 9553. Link: http://www.gutenberg.org/ebooks/9553
Couldn't save data for The Return of Sherlock Holmes with ID 9553. Link: http://www.gutenberg.org/ebooks/9553
Couldn't acquire text for The Hound of the Baskervilles with ID 9552. Link: http://www.gutenberg.org/ebooks/9552
Couldn't save data for The Hound of the Baskervilles with ID 9552. Link: http://www.gutenberg.org/ebooks/9552
Couldn't acquire text for The Innocence of Father Brown with ID 21522. Link: http://www.gutenberg.org/ebooks/21522
Couldn't save data for The Innocence of Father Brown with ID 21522. Link: http://www.gutenberg.org/ebooks/21522
Couldn't acquire text for A Study in Scarlet with ID 9556. Link: http://www.gutenberg.org/ebooks/9556
Couldn't save data for A Study in Scarlet with ID 9556. Link: http://www.gutenberg.org/ebooks/9556
Couldn't acquire text for Tarzan of the Apes with ID 6532. Link: http://www.gutenberg.org/ebooks/6532
Couldn't save data for T

Couldn't acquire text for The Oakdale Affair with ID 8761. Link: http://www.gutenberg.org/ebooks/8761
Couldn't save data for The Oakdale Affair with ID 8761. Link: http://www.gutenberg.org/ebooks/8761
Couldn't acquire text for Tarzan the Untamed with ID 8767. Link: http://www.gutenberg.org/ebooks/8767
Couldn't save data for Tarzan the Untamed with ID 8767. Link: http://www.gutenberg.org/ebooks/8767
Couldn't acquire text for Tarzan and the Jewels of Opar with ID 8755. Link: http://www.gutenberg.org/ebooks/8755
Couldn't save data for Tarzan and the Jewels of Opar with ID 8755. Link: http://www.gutenberg.org/ebooks/8755
Couldn't acquire text for The Son of Tarzan with ID 8754. Link: http://www.gutenberg.org/ebooks/8754
Couldn't save data for The Son of Tarzan with ID 8754. Link: http://www.gutenberg.org/ebooks/8754
Couldn't acquire text for Tom Swift and His Aerial Warship; Or, The Naval Terror of the Seas with ID 22951. Link: http://www.gutenberg.org/ebooks/22951
Couldn't save data for T

Couldn't acquire text for Short Cuts in Figures with ID 29914. Link: http://www.gutenberg.org/ebooks/29914
Couldn't save data for Short Cuts in Figures with ID 29914. Link: http://www.gutenberg.org/ebooks/29914
Couldn't acquire text for Elementare Arithmetik und Algebra with ID 11925. Link: http://www.gutenberg.org/ebooks/11925
Couldn't save data for Elementare Arithmetik und Algebra with ID 11925. Link: http://www.gutenberg.org/ebooks/11925
Couldn't acquire text for Some Famous Problems of the Theory of Numbers and in Particular Waring's Problem with ID 37030. Link: http://www.gutenberg.org/ebooks/37030
Couldn't save data for Some Famous Problems of the Theory of Numbers and in Particular Waring's Problem with ID 37030. Link: http://www.gutenberg.org/ebooks/37030
Couldn't acquire text for How to Draw a Straight Line: A Lecture on Linkages with ID 25155. Link: http://www.gutenberg.org/ebooks/25155
Couldn't save data for How to Draw a Straight Line: A Lecture on Linkages with ID 25155. 

Couldn't acquire text for Santa Claus's Partner with ID 14624. Link: http://www.gutenberg.org/ebooks/14624
Couldn't save data for Santa Claus's Partner with ID 14624. Link: http://www.gutenberg.org/ebooks/14624
Couldn't acquire text for The Glugs of Gosh with ID 21518. Link: http://www.gutenberg.org/ebooks/21518
Couldn't save data for The Glugs of Gosh with ID 21518. Link: http://www.gutenberg.org/ebooks/21518
Couldn't acquire text for The Valley of Fear with ID 9557. Link: http://www.gutenberg.org/ebooks/9557
Couldn't save data for The Valley of Fear with ID 9557. Link: http://www.gutenberg.org/ebooks/9557
Couldn't acquire text for The Sign of the Four with ID 9558. Link: http://www.gutenberg.org/ebooks/9558
Couldn't save data for The Sign of the Four with ID 9558. Link: http://www.gutenberg.org/ebooks/9558
Couldn't acquire text for Four Max Carrados Detective Stories with ID 20047. Link: http://www.gutenberg.org/ebooks/20047
Couldn't save data for Four Max Carrados Detective Stories 

Couldn't acquire text for The Secret Garden with ID 21585. Link: http://www.gutenberg.org/ebooks/21585
Couldn't save data for The Secret Garden with ID 21585. Link: http://www.gutenberg.org/ebooks/21585
Couldn't acquire text for Jabberwocky with ID 23717. Link: http://www.gutenberg.org/ebooks/23717
Couldn't save data for Jabberwocky with ID 23717. Link: http://www.gutenberg.org/ebooks/23717
Couldn't acquire text for Anne of the Island with ID 20265. Link: http://www.gutenberg.org/ebooks/20265
Couldn't save data for Anne of the Island with ID 20265. Link: http://www.gutenberg.org/ebooks/20265
Couldn't acquire text for Christmas Poetry and Hymn Collection with ID 20604. Link: http://www.gutenberg.org/ebooks/20604
Couldn't save data for Christmas Poetry and Hymn Collection with ID 20604. Link: http://www.gutenberg.org/ebooks/20604
Couldn't acquire text for Christmas Carol Collection 2006 with ID 20603. Link: http://www.gutenberg.org/ebooks/20603
Couldn't save data for Christmas Carol Coll

Couldn't acquire text for Theorie der Abel'schen Functionen with ID 29780. Link: http://www.gutenberg.org/ebooks/29780
Couldn't save data for Theorie der Abel'schen Functionen with ID 29780. Link: http://www.gutenberg.org/ebooks/29780
Couldn't acquire text for Randwertaufgaben bei Systemen von linearen partiellen Differentialgleichungen with ID 33330. Link: http://www.gutenberg.org/ebooks/33330
Couldn't save data for Randwertaufgaben bei Systemen von linearen partiellen Differentialgleichungen with ID 33330. Link: http://www.gutenberg.org/ebooks/33330
Couldn't acquire text for Note sur une Méthode pour la Réduction d'Intégrales Définies with ID 36334. Link: http://www.gutenberg.org/ebooks/36334
Couldn't save data for Note sur une Méthode pour la Réduction d'Intégrales Définies with ID 36334. Link: http://www.gutenberg.org/ebooks/36334
Couldn't acquire text for Étude sur le Mouvement Permanent des Fluides with ID 33083. Link: http://www.gutenberg.org/ebooks/33083
Couldn't save data for 

Couldn't acquire text for Fables for the Frivolous with ID 20026. Link: http://www.gutenberg.org/ebooks/20026
Couldn't save data for Fables for the Frivolous with ID 20026. Link: http://www.gutenberg.org/ebooks/20026
Couldn't acquire text for The Prince and the Pauper with ID 26252. Link: http://www.gutenberg.org/ebooks/26252
Couldn't save data for The Prince and the Pauper with ID 26252. Link: http://www.gutenberg.org/ebooks/26252
Couldn't acquire text for Über die Picard'schen Gruppen aus dem Zahlkörper der dritten und der vierten with ID 34032. Link: http://www.gutenberg.org/ebooks/34032
Couldn't save data for Über die Picard'schen Gruppen aus dem Zahlkörper der dritten und der vierten with ID 34032. Link: http://www.gutenberg.org/ebooks/34032
Couldn't acquire text for Einleitung in die Theorie der Elliptischen Funktionen with ID 32766. Link: http://www.gutenberg.org/ebooks/32766
Couldn't save data for Einleitung in die Theorie der Elliptischen Funktionen with ID 32766. Link: http:/

Couldn't acquire text for The Golden Dream with ID 22980. Link: http://www.gutenberg.org/ebooks/22980
Couldn't save data for The Golden Dream with ID 22980. Link: http://www.gutenberg.org/ebooks/22980
Couldn't acquire text for Little Men with ID 22787. Link: http://www.gutenberg.org/ebooks/22787
Couldn't save data for Little Men with ID 22787. Link: http://www.gutenberg.org/ebooks/22787
Couldn't acquire text for Uncle Remus, his songs and his sayings with ID 21605. Link: http://www.gutenberg.org/ebooks/21605
Couldn't save data for Uncle Remus, his songs and his sayings with ID 21605. Link: http://www.gutenberg.org/ebooks/21605
Couldn't acquire text for Heidi with ID 20271. Link: http://www.gutenberg.org/ebooks/20271
Couldn't save data for Heidi with ID 20271. Link: http://www.gutenberg.org/ebooks/20271
Couldn't acquire text for Ben-Hur: A tale of the Christ with ID 8810. Link: http://www.gutenberg.org/ebooks/8810
Couldn't save data for Ben-Hur: A tale of the Christ with ID 8810. Link: 

Couldn't acquire text for The Marvelous Land Of Oz with ID 17426. Link: http://www.gutenberg.org/ebooks/17426
Couldn't save data for The Marvelous Land Of Oz with ID 17426. Link: http://www.gutenberg.org/ebooks/17426
Couldn't acquire text for The Little Lame Prince with ID 23977. Link: http://www.gutenberg.org/ebooks/23977
Couldn't save data for The Little Lame Prince with ID 23977. Link: http://www.gutenberg.org/ebooks/23977
Couldn't acquire text for Cricket on the Hearth with ID 9739. Link: http://www.gutenberg.org/ebooks/9739
Couldn't save data for Cricket on the Hearth with ID 9739. Link: http://www.gutenberg.org/ebooks/9739
Couldn't acquire text for Adam Bede with ID 9672. Link: http://www.gutenberg.org/ebooks/9672
Couldn't save data for Adam Bede with ID 9672. Link: http://www.gutenberg.org/ebooks/9672
Couldn't acquire text for Barry Lyndon with ID 9522. Link: http://www.gutenberg.org/ebooks/9522
Couldn't save data for Barry Lyndon with ID 9522. Link: http://www.gutenberg.org/ebo

In [9]:
df_data = pd.DataFrame(data, columns = ['Title', 'Author', 'Link', 'ID', 'Bookshelf', 'Text'])
df_data.head()

df_data.to_csv('Data/gutenberg_data.csv', index=False)
print('Exported')

Exported


In [11]:
df_data.head()

Unnamed: 0,Title,Author,Link,ID,Bookshelf,Text
0,The Extermination of the American Bison,William T. Hornaday,http://www.gutenberg.org/ebooks/17748,17748,Animal,[Illustration: (Inscription) Mr. Theodore Roos...
1,Deadfalls and Snares,A. R. Harding,http://www.gutenberg.org/ebooks/34110,34110,Animal,DEADFALLS AND SNARES [Frontispiece: A GOOD DEA...
2,Artistic Anatomy of Animals,Édouard Cuyer,http://www.gutenberg.org/ebooks/38315,38315,Animal,+---------------------------------------------...
3,"Birds, Illustrated","Color Photography, Vol. 1, No. 1 Various",http://www.gutenberg.org/ebooks/30221,30221,Animal,FROM: THE PRESIDENT OF THE NATIONAL TEACHERS' ...
4,On Snake-Poison: Its Action and Its Antidote,A. Mueller,http://www.gutenberg.org/ebooks/32947,32947,Animal,[Illustration] ON SNAKE-POISON. ITS ACTION AND...
