# Version 2: with encoding detection


In [10]:
import requests
import csv
import os
import chardet  # You may need to install the chardet library

# Create a folder to store the downloaded books.
# NOTE(review): nothing in this cell writes into 'books'; the directory is
# still created so the cell's on-disk side effects stay the same.
if not os.path.exists('books'):
    os.makedirs('books')

# Seconds to wait on a single HTTP request before treating it as failed;
# without a timeout one stalled connection would hang the whole cell.
REQUEST_TIMEOUT = 30


def _download_book(book_id):
    """Fetch the raw bytes of one book, trying both URL forms in order.

    Args:
        book_id: Identifier taken from the last path segment of the book link.

    Returns:
        The response body (bytes) of the first URL that answers with HTTP 200,
        or None when neither URL does or the requests themselves fail.
    """
    candidate_urls = (
        f'https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt',
        f'https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt',
    )
    for url in candidate_urls:
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            # A network error on one URL must not abort the whole run.
            print(f'Book {book_id}: request to {url} failed ({error}).')
            continue
        if response.status_code == 200:
            return response.content
    return None


def _decode_book(raw_bytes):
    """Decode downloaded bytes to str using chardet's guess, with a safe fallback.

    chardet may report no encoding at all (None) or a guess that fails on the
    full payload; in both cases fall back to UTF-8 with replacement characters
    instead of raising and losing every remaining book.

    Args:
        raw_bytes: Body of the HTTP response.

    Returns:
        The decoded text.
    """
    detected_encoding = chardet.detect(raw_bytes)['encoding']
    if detected_encoding:
        try:
            return raw_bytes.decode(detected_encoding)
        except (UnicodeDecodeError, LookupError):
            pass
    return raw_bytes.decode('utf-8', errors='replace')


# Both files are managed by one `with`, so the output file is flushed and
# closed even if an unexpected error interrupts the loop.
with open('fantasy-dataset.txt', 'w', encoding='utf-8') as output_file, \
        open('fantasy-link.csv', 'r', encoding='utf-8', newline='') as csvfile:
    reader = csv.reader(csvfile)

    # Loop through the links and extract book content
    for row in reader:
        # Skip blank lines instead of failing on row[0]
        if not row or not row[0].strip():
            continue

        link = row[0].strip()  # Assuming the link is in the first (and only) column
        # Drop a trailing '/' so the last path segment is the ID, not ''
        book_id = link.rstrip('/').split('/')[-1]

        raw_bytes = _download_book(book_id)
        if raw_bytes is None:
            print(f'Book {book_id} could not be downloaded; skipped.')
            continue

        # Write a separator with book ID
        output_file.write(f'#####BOOK-ID-{book_id}#####\n')

        # The file is opened with encoding='utf-8', so the decoded text is
        # written as UTF-8 without any manual encode/decode round trip.
        output_file.write(_decode_book(raw_bytes))

        # Add '#####END-OF-BOOK#####' at the end of each book
        output_file.write('\n#####END-OF-BOOK#####\n')

        print(f'Book {book_id} scraped and saved.')

print('Scraping and text file creation completed.')


Book 55 scraped and saved.
Book 1251 scraped and saved.
Book 12753 scraped and saved.
Book 22566 scraped and saved.
Book 10148 scraped and saved.
Book 1252 scraped and saved.
Book 54 scraped and saved.
Book 1152 scraped and saved.
Book 5160 scraped and saved.
Book 10002 scraped and saved.
Book 7477 scraped and saved.
Book 831 scraped and saved.
Book 8395 scraped and saved.
Book 2892 scraped and saved.
Book 10806 scraped and saved.
Book 10662 scraped and saved.
Book 8183 scraped and saved.
Book 3261 scraped and saved.
Book 964 scraped and saved.
Book 832 scraped and saved.
Book 7838 scraped and saved.
Book 1557 scraped and saved.
Book 169 scraped and saved.
Book 8129 scraped and saved.
Book 420 scraped and saved.
Book 5713 scraped and saved.
Book 419 scraped and saved.
Book 11440 scraped and saved.
Book 3055 scraped and saved.
Book 1605 scraped and saved.
Book 4282 scraped and saved.
Book 518 scraped and saved.
Book 436 scraped and saved.
Book 10745 scraped and saved.
Book 8771 scraped 

# Cleaning


## Removing the Project Gutenberg end-of-book boilerplate

In [11]:
import re

# Load the whole dataset into memory; it is rewritten in place below.
dataset_path = 'fantasy-dataset.txt'
with open(dataset_path, 'r', encoding='utf-8') as infile:
    content = infile.read()

# Matches from the Gutenberg end-of-ebook marker up to and including the
# '#####END-OF-BOOK#####' terminator that follows it. DOTALL lets '.' cross
# line breaks; the lazy '.*?' stops at the first terminator after the marker.
pattern = r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK.*?#####END-OF-BOOK#####'
footer_regex = re.compile(pattern, re.DOTALL)

# Collapse each matched span to the bare terminator.
cleaned_content = footer_regex.sub('#####END-OF-BOOK#####', content)

# Overwrite the dataset file with the trimmed text.
with open(dataset_path, 'w', encoding='utf-8') as outfile:
    outfile.write(cleaned_content)

print('Text removal completed.')


Text removal completed.


## Removing the Project Gutenberg opening boilerplate

In [15]:
import re

# Read the content of 'fantasy-dataset.txt'
with open('fantasy-dataset.txt', 'r', encoding='utf-8') as infile:
    content = infile.read()

# Marker after which the text of a book is kept
start_marker = '*** START OF THE PROJECT GUTENBERG EBOOK'

# One match per book: the '#####BOOK-ID-<id>#####' header (group 1) and the
# text after it (group 2), stopping right before that book's
# '#####END-OF-BOOK#####' (or at end of file for an unterminated last book).
# Bounding the match this way keeps a book without a START marker from being
# swallowed together with the header of the book that follows it.
pattern = r'(#####BOOK-ID-\d+#####)(.*?)(?=#####END-OF-BOOK#####|\Z)'


def _strip_opening_page(book_match):
    """Drop everything between a book's ID header and its START marker.

    Args:
        book_match: Match of `pattern`; group 1 is the ID header, group 2 is
            the book text up to (not including) the end-of-book terminator.

    Returns:
        The header, a newline, and the book text from the START marker on.
        When the book contains no START marker the matched text is returned
        unchanged, so nothing is removed from that book.
    """
    book_header, book_body = book_match.group(1), book_match.group(2)
    marker_index = book_body.find(start_marker)
    if marker_index == -1:
        return book_match.group(0)
    return f'{book_header}\n{book_body[marker_index:]}'


# Rewrite every book span; text outside the spans is left as it is
cleaned_content = re.sub(pattern, _strip_opening_page, content, flags=re.DOTALL)

# Write the cleaned content back to 'fantasy-dataset.txt'
with open('fantasy-dataset.txt', 'w', encoding='utf-8') as outfile:
    outfile.write(cleaned_content)

print('Text removal completed.')


Text removal completed.
