In [1]:
from pathlib import Path
import requests

In [2]:
def get_gutenberg_book(
	id: int|None = 84,
	data_temp: Path|str = "../data/gutenberg_data",
	remove_gutenberg_meta: bool = True,
) -> str:
	
	data_temp = Path(data_temp)
	data_temp.mkdir(parents=True, exist_ok=True)
	
	url: str = f"https://www.gutenberg.org/cache/epub/{id}/pg{id}.txt"
	data_path: Path = Path(data_temp) / f"{id}.txt"
	data: str
	# read from cache if it exists
	if data_path.exists():
		with open(data_path, 'r', encoding='utf-8') as file:
			data = file.read()
	else:
		# download if it doesn't exist
		response = requests.get(url)
		response.raise_for_status()  # Ensure that the download was successful
		data = response.text

		# save to cache
		with open(data_path, 'w', encoding='utf-8') as file:
			file.write(data)

	# remove header/footer
	if remove_gutenberg_meta:
		data = '***'.join(data.split('***')[2:])
		data = '***'.join(data.split('***')[:-1])
	
	return data

def get_many_books(
		ids: list[int],
		data_temp: Path|str = "../data/gutenberg_data",
	) -> list[str]:
	
	data: list[str] = []
	for id in ids:
		print(f"Getting book {id}...")
		item: str = get_gutenberg_book(id, data_temp)
		print(f"\t{len(item)} characters read")
		data.append(item)
	
	return data

In [3]:
# Load the text of each listed Project Gutenberg ebook (one string per id).
# NOTE(review): the ids are hard-coded and the titles they map to are not
# recorded in this notebook — confirm and document them.
DATA_RAW: list[str] = get_many_books([84, 15, 18, 82, 996, 2600])

# Total number of characters across all loaded books.
print(f"{sum(len(x) for x in DATA_RAW) = }")

Getting book 84...
	426785 characters read
Getting book 15...
	1241025 characters read
Getting book 18...
	1192776 characters read
Getting book 82...
	1124986 characters read
Getting book 996...
	2342262 characters read
Getting book 2600...
	3273998 characters read
sum(len(x) for x in DATA_RAW) = 9601832


In [14]:
def create_word_to_int_mapping(texts: list[str]) -> tuple[dict[str, int], list[list[int]]]:
    """Build a word -> integer vocabulary and encode every text with it.

    Words are the whitespace-separated tokens produced by `str.split()`, so
    they are case-sensitive and keep any attached punctuation. Ids are
    assigned by position in the sorted vocabulary, which makes the mapping
    deterministic for a given set of texts.

    Args:
        texts: The texts to tokenise. Each one is consumed exactly once, so a
            one-shot iterable works as well as a list.

    Returns:
        A tuple `(word_to_int, int_sequences)` where `word_to_int` maps each
        distinct word to its id and `int_sequences[i]` holds the ids of the
        words of the i-th text, in order.
    """
    # Tokenise each text a single time and reuse the result for both the
    # vocabulary and the encoding pass.
    tokenized = [text.split() for text in texts]

    # Create vocabulary (unique words)
    vocab: set[str] = set()
    for words in tokenized:
        vocab.update(words)

    # Create word to int mapping
    word_to_int = {word: idx for idx, word in enumerate(sorted(vocab))}

    # Convert texts to sequences of integers
    int_sequences = [[word_to_int[word] for word in words] for words in tokenized]

    return word_to_int, int_sequences

# Build one vocabulary shared by all texts in DATA_RAW and encode each text as
# a list of word ids.
word_to_int, int_sequences = create_word_to_int_mapping(DATA_RAW)

In [18]:
# Spot-check: the id assigned to "the" and the first ten word ids of the first text.
word_to_int["the"], int_sequences[0][:10]

(84277, [6650, 64262, 84277, 10942, 13163, 27134, 10534, 17618, 239, 14836])