In [10]:
import pandas as pd

In [11]:
# Info available on Kaggle @ https://www.kaggle.com/datasets/edenbd/children-stories-text-corpus

cstc_book_list = 'The Happy Prince, Andersens Fairy Tales, The Blue Fairy Book, The Adventures of Pinocchio, Myths Retold by Children,Household Tales, Indian Fairy Tales, Fairy Tales Second Series, MERRY STORIES AND FUNNY PICTURES, Childhoods Favorites and Fairy Stories,The Wonderful Wizard of Oz, Celtic Tales, Our Children, The Little Lame Prince, The Prince and Betty, The Adventures of Sherlock Holmes,Peter Pan, The Secret Garden, The Jungle Book, The Adventures of Tom Sawyer, A Little Princess, Little Women, Just So Stories, Moby Dick, Treasure Island, The Idiot, A Tale of Two Cities, My Man Jeeves, Sense and Sensibility, The Time Machine, Comic History of the United States, The Velveteen Rabbit, The Book of Dragons, The Snow Image, The Magical Mimics in Oz, Folk Tales from the Russian, Snow-White or The House in the Wood, Dramatic Reader for Lower Grades, A Christmas Hamper, Aesop Fables, My Fathers Dragon, The Peace Egg and Other tales, Indian Why Stories, Folk-Tales of the Khasis, The Paradise of Children, Wonder Stories, The Best American Humorous Short Stories, Hindu Tales from the Sanskrit, The Tale of Johnny Town-Mouse, The Little Red Hen, East of the Sun and West of the Moon, Among the Forest People, True Stories of Wonderful Deeds, English Fairy Tales, Simla Village Tales Or Folk Tales from the Himalayas, Japanese Fairy Tales, Plain Tales of the North, The Wind in the Willows, The Louisa Alcott Reader. A Supplementary Reader for the Fourth Year of School, A Wonder Book for Girls and Boys, Tanglewood Tales, The Pig Brother and Other Fables and Stories, The Worlds Greatest Books, Vol 3, Goody Two-Shoes, The Marvelous Exploits of Paul Bunyan, Christmas Every Day and Other Stories, The Childrens Book of Thanksgiving Stories.'

cstc_book_list = pd.DataFrame(cstc_book_list.split(', '))

cstc_book_list.head()

Unnamed: 0,0
0,The Happy Prince
1,Andersens Fairy Tales
2,The Blue Fairy Book
3,The Adventures of Pinocchio
4,"Myths Retold by Children,Household Tales"


In [12]:
# Read in the Children's Stories Text Corpus

with open('cleaned_merged_fairy_tales_without_eos.txt', 'r') as f:
    cstc = f.read().splitlines()

cstc = pd.DataFrame(cstc, columns=['lines'])

cstc.head()

Unnamed: 0,lines
0,The Happy Prince.
1,"HIGH above the city, on a tall column, stood t..."
2,He was very much admired indeed. “He is as be...
3,“Why can’t you be like the Happy Prince?” aske...
4,“I am glad there is some one in the world who ...


In [13]:
# Read in the CBT Dataset

path1 = 'train-00000-of-00001.parquet'
path2 = 'test-00000-of-00001.parquet'
path3 = 'validation-00000-of-00001.parquet'

df = pd.concat([pd.read_parquet(path1), pd.read_parquet(path2), pd.read_parquet(path3)], axis=0, ignore_index=True).reset_index(drop=True)

df.head()

Unnamed: 0,title,content
0,Andrew_Lang___Prince_Prigio.txt.out,CHAPTER I. -LCB- Chapter heading picture : p1....
1,"Andrew_Lang___Prince_Prigio_From_""His_Own_Fair...",CHAPTER I. -- How the Fairies were not Invited...
2,Andrew_Lang___Prince_Ricardo_of_Pantouflia.txt...,CHAPTER I .\nThe Troubles of King Prigio .\n-L...
3,Andrew_Lang___The_Blue_Fairy_Book.txt.out,THE BRONZE RING Once upon a time in a certain ...
4,Andrew_Lang___The_Brown_Fairy_Book.txt.out,The Brown Fairy Book What the Rose did to the ...


In [14]:
def trim_title(title):
    start = title.find('___') + len('___')
    end = title.find('.txt.out')
    trimmed_title = title[start:end]
    return trimmed_title.replace('_', ' ')

In [15]:
# Extract the title from the file name

df['trimmed_titles'] = df['title'].apply(trim_title)

df.head()

Unnamed: 0,title,content,trimmed_titles
0,Andrew_Lang___Prince_Prigio.txt.out,CHAPTER I. -LCB- Chapter heading picture : p1....,Prince Prigio
1,"Andrew_Lang___Prince_Prigio_From_""His_Own_Fair...",CHAPTER I. -- How the Fairies were not Invited...,"Prince Prigio From ""His Own Fairy Book"""
2,Andrew_Lang___Prince_Ricardo_of_Pantouflia.txt...,CHAPTER I .\nThe Troubles of King Prigio .\n-L...,Prince Ricardo of Pantouflia
3,Andrew_Lang___The_Blue_Fairy_Book.txt.out,THE BRONZE RING Once upon a time in a certain ...,The Blue Fairy Book
4,Andrew_Lang___The_Brown_Fairy_Book.txt.out,The Brown Fairy Book What the Rose did to the ...,The Brown Fairy Book


In [16]:
df.shape

(108, 3)

In [17]:
# Remove the duplicates

df = df.drop_duplicates(subset=['trimmed_titles'])

df.shape

# There are none

(108, 3)

In [18]:
# based on cstc_book_list, remove the books from df that are in the list, using the trimmed_titles column

df = df[~df['trimmed_titles'].isin(cstc_book_list[0])]

In [19]:
df.shape

(101, 3)

In [20]:
total_num_books = df.shape[0] + cstc_book_list.shape[0]

total_num_books

166

In [21]:
# Join Datasets

df = pd.concat([df['content'], cstc['lines']], axis=0, ignore_index=True).reset_index(drop=True)

df.head()

0    CHAPTER I. -LCB- Chapter heading picture : p1....
1    CHAPTER I. -- How the Fairies were not Invited...
2    CHAPTER I .\nThe Troubles of King Prigio .\n-L...
3    The Brown Fairy Book What the Rose did to the ...
4    Lovely Ilonka There was once a king 's son who...
dtype: object

In [22]:
# Remove non-words like \n, \t, commas, underscores, etc.

df = df.str.replace(r'\W', ' ', regex=True)

df.head()

0    CHAPTER I   LCB  Chapter heading picture   p1 ...
1    CHAPTER I     How the Fairies were not Invited...
2    CHAPTER I   The Troubles of King Prigio    LCB...
3    The Brown Fairy Book What the Rose did to the ...
4    Lovely Ilonka There was once a king  s son who...
dtype: object

In [23]:
# Remove unnecessary words like is, i, am, p1, etc.

df = df.str.replace(r'\b\w{1,3}\b', '', regex=True)

df.head()

0    CHAPTER      Chapter heading picture          ...
1    CHAPTER        Fairies were  Invited  Court ON...
2    CHAPTER     Troubles  King Prigio      Prince ...
3     Brown Fairy Book What  Rose    Cypress       ...
4    Lovely Ilonka There  once  king     told  fath...
dtype: object

In [24]:
# Remove the word 'chapter' in a non-case sensitive manner

df = df.str.replace(r'chapter', '', case=False, regex=True)

df.head()

0           heading picture          Fairies were  ...
1            Fairies were  Invited  Court ONCE upon...
2         Troubles  King Prigio      Prince Ricardo...
3     Brown Fairy Book What  Rose    Cypress       ...
4    Lovely Ilonka There  once  king     told  fath...
dtype: object

In [25]:
# Remove extra spaces

df = df.str.replace(r'\s+', ' ', regex=True)

df.head()

0     heading picture Fairies were Invited Court On...
1     Fairies were Invited Court ONCE upon time the...
2     Troubles King Prigio Prince Ricardo lady tied...
3     Brown Fairy Book What Rose Cypress Once upon ...
4    Lovely Ilonka There once king told father that...
dtype: object

In [26]:
# Make all text lowercase

df = df.str.lower()

df.head()

0     heading picture fairies were invited court on...
1     fairies were invited court once upon time the...
2     troubles king prigio prince ricardo lady tied...
3     brown fairy book what rose cypress once upon ...
4    lovely ilonka there once king told father that...
dtype: object

In [27]:
# Remove stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

df = df.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\home\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    heading picture fairies invited court upon tim...
1    fairies invited court upon time reigned pantou...
2    troubles king prigio prince ricardo lady tied ...
3    brown fairy book rose cypress upon time great ...
4    lovely ilonka king told father wished marry sa...
dtype: object

In [28]:
# Count the number of words in total

df.str.split().apply(len).sum()

3542218

In [29]:
# Count the number of unique words

df.str.split().apply(set).apply(len).sum()

1845034

In [47]:
# Sample the top 20% most frequent words

NUM_WORDS_IN_CORPUS = int(0.05 * df.str.split().apply(set).apply(len).sum())

from collections import Counter

word_freq = Counter(' '.join(df).split())

corpus_with_frequency = pd.DataFrame(word_freq.most_common(NUM_WORDS_IN_CORPUS))

corpus_with_frequency

Unnamed: 0,0,1
0,said,49413
1,little,27543
2,would,27050
3,could,22548
4,like,17991
...,...,...
54874,recharging,1
54875,onlooker,1
54876,equine,1
54877,melee,1


In [48]:
# top 0.25%
corpus_with_frequency.describe() 

Unnamed: 0,1
count,54879.0
mean,64.545965
std,493.820219
min,1.0
25%,1.0
50%,4.0
75%,18.0
max,49413.0


In [49]:
corpus_with_frequency.iloc[0:25]

Unnamed: 0,0,1
0,said,49413
1,little,27543
2,would,27050
3,could,22548
4,like,17991
5,time,16011
6,came,15136
7,never,14633
8,good,14135
9,went,14056


In [None]:
# Save the file

corpus_with_frequency.to_csv('corpus_with_frequency.csv', index=False)