In [2]:
import pandas as pd
import os
from helpers import word_count, sent_count, avg_words_in_sentence, vocabulary_size

## Read corpus

In [3]:
# Load every file in the corpus directory into a two-column frame:
#   Title - the file name with '.txt' removed
#   Text  - the file contents with each newline replaced by a space
dir_path = 'data/aesop/original/'

# Collect the rows in a plain list and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
records = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# stable across runs and platforms.
for story_name in sorted(os.listdir(dir_path)):
    # NOTE(review): the file is read with the platform default encoding, as
    # before. Pass encoding=... explicitly once the corpus encoding is confirmed.
    with open(os.path.join(dir_path, story_name), 'r') as file:
        story = file.read().replace('\n', ' ')
    records.append({'Title': story_name.replace('.txt', ''), 'Text': story})

# Passing `columns` keeps the Title/Text schema even when the directory is empty.
df = pd.DataFrame(records, columns=['Title', 'Text'])
df.head()

Unnamed: 0,Title,Text
0,Androcles,A slave named Androcles once escaped from his ...
1,Avaracious_and_Envious,Two neighbours came before Jupiter and prayed ...
2,Belling_the_Cat,"Long ago, the mice had a general council to co..."
3,Hercules_and_the_Waggoner,A Waggoner was once driving a heavy load along...
4,The_Ant_and_the_Grasshopper,In a field one summer’s day a Grasshopper was ...


## Edit corpus

In [4]:
# Swap every run of one or more newline characters for a single space,
# wherever the pattern matches a string value in the frame.
df = df.replace(to_replace=r'[\n]+', value=' ', regex=True)

## Analyse corpus

In [5]:
# Per-story counts, computed by mapping the helper functions over the Text column.
df['Word count'] = df['Text'].map(word_count)
df['Sentence count'] = df['Text'].map(sent_count)

# Corpus-level statistics are computed from the full list of story texts.
stories = df['Text'].tolist()

# Store the result under its own name. Rebinding `avg_words_in_sentence` would
# shadow the imported helper, and re-running this cell would then fail with
# "TypeError: 'float' object is not callable".
mean_words_per_sentence = avg_words_in_sentence(stories)

# vocabulary_size returns a pair, unpacked here as (global size, per-story average).
global_vocabulary_size, avg_vocabulary_size = vocabulary_size(stories)

# (label, value) pairs printed in order, one line each.
summary_rows = [
    ('Shortest story', df['Word count'].min()),
    ('Longest story', df['Word count'].max()),
    ('Avg. word count', df['Word count'].mean()),
    ('Avg. sentence count', df['Sentence count'].mean()),
    ('Avg. words in sentence', mean_words_per_sentence),
    ('Vocabulary size', global_vocabulary_size),
    ('Avg. vocabulary size', avg_vocabulary_size),
]
for label, value in summary_rows:
    print("{:>10s} | {:.2f}".format(label, value))


Shortest story | 86.00
Longest story | 355.00
Avg. word count | 171.44
Avg. sentence count | 6.33
Avg. words in sentence | 27.09
Vocabulary size | 1745.00
Avg. vocabulary size | 97.15
