# Download and Clean Text Data

Trained on a mixture of text from the following sources:
- The complete works of William Shakespeare from https://www.gutenberg.org/cache/epub/100/pg100.txt
- A collection of 40 books from the Hugging Face dataset https://huggingface.co/datasets/IsmaelMousa/books

In [59]:
import ftfy
import re
import requests
from datasets import load_dataset
from collections import Counter

# Directory, relative to the notebook's working directory, that holds every
# downloaded and cleaned text file. The cells below open files inside it
# directly with open(), which does not create missing directories, so it must
# already exist before they run.
datasets_path = "datasets/bee_hummingbird"

# Download datasets

In [21]:
# Download the complete works of Shakespeare from Project Gutenberg and keep
# only the text between the "*** START ..." and "*** END ..." marker lines.
gutenberg_response = requests.get(
    "https://www.gutenberg.org/cache/epub/100/pg100.txt",
    timeout=60,  # fail instead of hanging forever on a stalled connection
)
# `assert` statements are removed under `python -O`; raise_for_status() is not.
gutenberg_response.raise_for_status()

# Normalise line endings to "\n" and split in memory. This yields the same
# lines a text-mode write/read round trip through a temporary file would,
# without touching the disk twice.
content = gutenberg_response.text.replace("\r\n", "\n").replace("\r", "\n").split("\n")

start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"
start_index = next((i for i, line in enumerate(content) if start_marker in line), None)
end_index = next((i for i, line in enumerate(content) if end_marker in line), None)
if start_index is None or end_index is None:
    raise ValueError("Project Gutenberg START/END markers not found in the download")

# Everything strictly between the two marker lines, one "\n" per line.
main_content = "".join(line + "\n" for line in content[start_index + 1:end_index])

# Explicit UTF-8 so the result does not depend on the platform's locale.
with open(f"{datasets_path}/shakespeare.txt", "w", encoding="utf-8") as f:
    f.write(main_content)

print(main_content[:100])

The Complete Works of William Shakespeare

by William Shakespeare




                    Contents




In [16]:
# Load the IsmaelMousa/books dataset once and keep a handle on each of its
# two splits. Nothing is written to disk here.
books_dataset = load_dataset("IsmaelMousa/books")
train_books_dataset = books_dataset["train"]
validation_books_dataset = books_dataset["validation"]

In [20]:
# Write every book (training split first, then validation) into one file,
# following each book's "EN" text with a separator line of asterisks.
book_separator = "\n\n************\n\n"
# Explicit UTF-8 so the file does not depend on the platform's locale.
with open(f"{datasets_path}/books.txt", "w", encoding="utf-8") as f:
    for books_split in (train_books_dataset, validation_books_dataset):
        for book in books_split:
            f.write(book["EN"])
            f.write(book_separator)

# ⚒️ **Manual Data preparation/selection**:
- Combine the books into one text file, with each book followed by a separator line of asterisks (`************`).

# Data Exploration

In [63]:
def text_standardize(text):
    """
    Normalise punctuation and whitespace in `text`.

    Fixes some issues the spacy tokenizer had on the books corpus and also
    does some whitespace standardization:

    - dash variants become "-", the ellipsis character becomes "..." and the
      acute accent becomes "'";
    - every run of one of the listed punctuation characters (for example
      "--", "!!", "," or a backslash) is surrounded by spaces;
    - whitespace around a newline (including further newlines) collapses to
      " \\n ", and any other run of whitespace collapses to a single space.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    for fancy, plain in (('—', '-'), ('–', '-'), ('―', '-'), ('…', '...'), ('´', "'")):
        text = text.replace(fancy, plain)
    # The patterns must be raw strings. Without the r prefix Python collapses
    # the doubled backslash before the regex engine sees it, so that
    # alternative matches a literal plus sign instead of a run of backslashes,
    # and escapes such as \? or \s raise invalid-escape warnings on recent
    # Python versions.
    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
    text = re.sub(r'\s*\n\s*', ' \n ', text)
    text = re.sub(r'[^\S\n]+', ' ', text)
    return text.strip()

In [64]:
# Read both saved corpora, pass them through ftfy.fix_text and then through
# text_standardize, and preview the first 100 characters of each result.
# The raw text keeps its own name so re-running the cleaning lines never
# feeds already-cleaned text back in. Explicit UTF-8 keeps reads independent
# of the platform's locale.
with open(f"{datasets_path}/books.txt", "r", encoding="utf-8") as f:
    books_raw = f.read()
with open(f"{datasets_path}/shakespeare.txt", "r", encoding="utf-8") as f:
    shakespeare_raw = f.read()
books = text_standardize(ftfy.fix_text(books_raw))
shakespeare = text_standardize(ftfy.fix_text(shakespeare_raw))
print(books[:100])
print(shakespeare[:100])

I was born in the year 1632 , in the city of York , of a good family , 
 though not of that country 
From fairest creatures we desire increase , 
 That thereby beauty's rose might never die , 
 But as 


In [71]:
# Character count of each cleaned corpus, followed by their combined size.
books_chars = len(books)
shakespeare_chars = len(shakespeare)
books_chars, shakespeare_chars, books_chars + shakespeare_chars

(32817514, 5725829, 38543343)

## View the character set of the text

In [65]:
"".join(sorted(list(set(books))))

'\n !"$&\'()*+,-./0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz{|}'

In [66]:
",".join(sorted(list(set(shakespeare))))

'\n, ,!,",&,\',(,),*,,,-,.,1,2,3,4,5,6,7,8,9,:,;,?,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,[,],_,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,À,Æ,Ç,É,à,â,æ,ç,è,é,ê,ë,î,œ'

## Most popular and least popular words

In [67]:
# Count whitespace-separated tokens in the books corpus and rank them from
# most to least frequent; tokens with equal counts keep first-seen order.
book_token_counter = Counter(books.split())
word_counts = dict(book_token_counter)
sorted_dict = book_token_counter.most_common()
# NOTE(review): the slice starts at index 1, so the single most frequent token
# (index 0) is not displayed - confirm that skipping it is intentional.
# A slice, unlike indexing, does not raise if there are fewer than 20 tokens.
sorted_dict[1:20]

[('the', 292752),
 ('and', 183155),
 ('of', 160470),
 ('to', 154686),
 ('a', 118032),
 ('I', 113963),
 ('in', 89919),
 ('was', 74017),
 ('that', 72529),
 ('he', 60213),
 ('it', 56483),
 ('his', 53726),
 (';', 52675),
 ('had', 49471),
 ('you', 46628),
 ('with', 46372),
 ('as', 42928),
 ('for', 40417),
 ('not', 39522)]

In [68]:
# The 19 lowest-ranked tokens, starting from the very last entry and walking
# backwards. A reverse slice does not raise if the ranking is shorter than 19.
sorted_dict[:-20:-1]

[('Ihope', 1),
 ('astronomy.', 1),
 ('gid.', 1),
 ('gibberedBig', 1),
 ('gibing', 1),
 ('enhancement', 1),
 ('Vainand', 1),
 ('Apia', 1),
 ('tangle:', 1),
 ('marvelling.', 1),
 ('jd', 1),
 ('Sl', 1),
 ('carnivores', 1),
 ('ninepin.', 1),
 ('raft.', 1),
 ('generalised', 1),
 ('ursine', 1),
 ('Familycages', 1),
 ('thoseHappy', 1)]

### Save combined text to file

In [69]:
# Save the combined corpus: Shakespeare first, then the books, joined by a
# single newline. UTF-8 is explicit because the Shakespeare text contains
# non-ASCII letters (e.g. "É", "œ") that not every platform default can encode.
with open(f"{datasets_path}/combined_data.txt", "w", encoding="utf-8") as f:
    f.write(shakespeare + "\n" + books)