In [9]:
import os

import pandas as pd

from helpers import word_count, sent_count, avg_words_in_sentence, vocabulary_size

# Corpus 1
## Read corpus

In [10]:
# Load every story file of corpus 1 into a DataFrame with one row per file
# ('Title' = file name without '.txt', 'Text' = file content on a single line).
dir_path = 'data/aesop/original/'

# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# calling it inside a loop copies the whole frame on every iteration.
records = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and therefore df1.head()) reproducible across machines and runs.
for story_name in sorted(os.listdir(dir_path)):
    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the story files are UTF-8 (they contain
    # typographic punctuation such as ’) - confirm against the data files.
    with open(os.path.join(dir_path, story_name), 'r', encoding='utf-8') as file:
        story = file.read().replace('\n', ' ')
    records.append({'Title': story_name.replace('.txt', ''), 'Text': story})

# Passing `columns` keeps the expected schema even if the directory is empty.
df1 = pd.DataFrame(records, columns=['Title', 'Text'])
df1.head()

Unnamed: 0,Title,Text
0,Androcles,A slave named Androcles once escaped from his ...
1,Avaracious_and_Envious,Two neighbours came before Jupiter and prayed ...
2,Belling_the_Cat,"Long ago, the mice had a general council to co..."
3,Hercules_and_the_Waggoner,A Waggoner was once driving a heavy load along...
4,The_Ant_and_the_Grasshopper,In a field one summer’s day a Grasshopper was ...


## Edit corpus

In [11]:
# Replace every run of newline characters in the frame's string values with a
# single space. The loading cell already swaps '\n' for ' ' in each story text,
# so this mainly acts as a safeguard.
df1 = df1.replace(to_replace=r'[\n]+', value=' ', regex=True)

## Analyse corpus

In [12]:
# Per-story statistics: apply the helpers to each story text.
df1['Word count'] = df1['Text'].apply(word_count)
df1['Sentence count'] = df1['Text'].apply(sent_count)

# Corpus-level statistics: these helpers receive the list of all story texts.
stories = df1['Text'].tolist()
avg_words_in_sent = avg_words_in_sentence(stories)
global_vocabulary_size, avg_vocabulary_size = vocabulary_size(stories)

# (label, value) pairs, listed in the order they are reported.
summary_rows = [
    ('Num. of stories', df1.shape[0]),
    ('Shortest story', df1['Word count'].min()),
    ('Longest story', df1['Word count'].max()),
    ('Avg. word count', df1['Word count'].mean()),
    ('Avg. sentence count', df1['Sentence count'].mean()),
    ('Avg. words in sentence', avg_words_in_sent),
    ('Vocabulary size', global_vocabulary_size),
    ('Avg. vocabulary size', avg_vocabulary_size),
]

# Every label is longer than the 10-character field width, so no padding is
# actually applied; each value is printed with two decimals.
for label, value in summary_rows:
    print("{:>10s} | {:.2f}".format(label, value))

Num. of stories | 55.00
Shortest story | 86.00
Longest story | 454.00
Avg. word count | 174.25
Avg. sentence count | 6.40
Avg. words in sentence | 27.23
Vocabulary size | 1745.00
Avg. vocabulary size | 98.44


# Corpus 2
## Read our corpus

In [13]:
# Load every story file of corpus 2 into a DataFrame with one row per file
# ('Title' = file name without '.txt', 'Text' = file content on a single line).
dir_path = 'data/new_data/original/'

# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# calling it inside a loop copies the whole frame on every iteration.
records = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and therefore df2.head()) reproducible across machines and runs.
for story_name in sorted(os.listdir(dir_path)):
    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the story files are UTF-8 - confirm against the
    # data files.
    with open(os.path.join(dir_path, story_name), 'r', encoding='utf-8') as file:
        story = file.read().replace('\n', ' ')
    records.append({'Title': story_name.replace('.txt', ''), 'Text': story})

# Passing `columns` keeps the expected schema even if the directory is empty.
df2 = pd.DataFrame(records, columns=['Title', 'Text'])
df2.head()

Unnamed: 0,Title,Text
0,Jupiter_Neptune_Minerva_and_Momus,"ACCORDING to an ancient legend, the first man ..."
1,Mercury_and_the_Workmen,"A WORKMAN, felling wood by the side of a river..."
2,The_Ant_and_the_Grasshopper,"Once upon a time, there were two characters, a..."
3,The_Apes_and_the_Two_Travelers,"TWO MEN, one who always spoke the truth and th..."
4,The_Bald_Man,"Once upon a time, there was a bald man who was..."


## Edit our corpus

In [14]:
# Replace every run of newline characters in the frame's string values with a
# single space. The loading cell already swaps '\n' for ' ' in each story text,
# so this mainly acts as a safeguard.
df2 = df2.replace(to_replace=r'[\n]+', value=' ', regex=True)

## Analyse our corpus

In [15]:
# Per-story statistics: apply the helpers to each story text.
df2['Word count'] = df2['Text'].apply(word_count)
df2['Sentence count'] = df2['Text'].apply(sent_count)

# Corpus-level statistics: these helpers receive the list of all story texts.
stories = df2['Text'].tolist()
avg_words_in_sent = avg_words_in_sentence(stories)
global_vocabulary_size, avg_vocabulary_size = vocabulary_size(stories)

# (label, value) pairs, listed in the order they are reported.
summary_rows = [
    ('Num. of stories', df2.shape[0]),
    ('Shortest story', df2['Word count'].min()),
    ('Longest story', df2['Word count'].max()),
    ('Avg. word count', df2['Word count'].mean()),
    ('Avg. sentence count', df2['Sentence count'].mean()),
    ('Avg. words in sentence', avg_words_in_sent),
    ('Vocabulary size', global_vocabulary_size),
    ('Avg. vocabulary size', avg_vocabulary_size),
]

# Every label is longer than the 10-character field width, so no padding is
# actually applied; each value is printed with two decimals.
for label, value in summary_rows:
    print("{:>10s} | {:.2f}".format(label, value))

Num. of stories | 73.00
Shortest story | 43.00
Longest story | 476.00
Avg. word count | 161.07
Avg. sentence count | 6.77
Avg. words in sentence | 23.80
Vocabulary size | 2215.00
Avg. vocabulary size | 89.10


# Corpus 1+2
## Analyse both corpora

In [16]:
# Stack both corpora into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df = pd.concat([df1, df2], ignore_index=True)

# Per-story statistics, (re)computed on the combined frame so this cell does
# not rely on the per-corpus analysis cells having been run.
df['Word count'] = df['Text'].apply(word_count)
df['Sentence count'] = df['Text'].apply(sent_count)

# Corpus-level statistics: these helpers receive the list of all story texts.
stories = df['Text'].tolist()
avg_words_in_sent = avg_words_in_sentence(stories)
global_vocabulary_size, avg_vocabulary_size = vocabulary_size(stories)

# (label, value) pairs, listed in the order they are reported.
summary_rows = [
    ('Num. of stories', df.shape[0]),
    ('Shortest story', df['Word count'].min()),
    ('Longest story', df['Word count'].max()),
    ('Avg. word count', df['Word count'].mean()),
    ('Avg. sentence count', df['Sentence count'].mean()),
    ('Avg. words in sentence', avg_words_in_sent),
    ('Vocabulary size', global_vocabulary_size),
    ('Avg. vocabulary size', avg_vocabulary_size),
]

# Format string kept unchanged so the printed report stays identical. Every
# label is longer than the 10-character field width, so no padding is applied.
for label, value in summary_rows:
    print("{:>10s} | {:.2f}".format(label, value))

Num. of stories | 128.00
Shortest story | 43.00
Longest story | 476.00
Avg. word count | 166.73
Avg. sentence count | 6.61
Avg. words in sentence | 25.23
Vocabulary size | 2999.00
Avg. vocabulary size | 93.11
