In [1]:
!pip install levenshtein



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from Levenshtein import ratio, distance
from collections import Counter

In [5]:
# Corpus locations, relative to the notebook's working directory.
# Both files are read line by line by the helper functions defined below.
EngTextPath = '../data/EngOrdtext'
AngTextPath = '../data/AngOrdtext'



In [3]:
def get_sentences(filepath):
    '''
    Return every line of the file at `filepath` as a list of strings.

    One line of the file is treated as one sentence. Lines are returned
    unmodified, so each keeps its trailing newline where the file has one.
    The file is opened in text mode with the platform default encoding.
    '''
    with open(filepath, "r") as handle:
        # Iterating a text file yields its lines, exactly like readlines().
        return list(handle)
    
def get_words(filepath):
    '''
    Return all whitespace-separated tokens of the file as one flat list.

    Tokens appear in file order. Blank lines contribute nothing. No case
    folding or punctuation stripping is applied.
    The file is opened in text mode with the platform default encoding.
    '''
    tokens = []
    with open(filepath, "r") as handle:
        for row in handle:
            # str.split() with no argument splits on any run of whitespace.
            tokens.extend(row.split())
    return tokens

def get_freq(filepath):
    '''
    Count how often each whitespace-separated token occurs in the file.

    Returns a collections.Counter mapping token -> number of occurrences.
    Tokens are compared as-is (case-sensitive, punctuation kept).
    '''
    tally = Counter()
    with open(filepath, "r") as handle:
        for row in handle:
            # Feed the tokens of each line into the running tally.
            tally.update(row.split())
    return tally

# Functions

In [5]:
def read_text_data(filepath):
    '''
    Read a file and return its lines as lists of whitespace-separated words.

    The result has one entry per line of the file, in file order. A blank
    line yields an empty list.
    The file is opened in text mode with the platform default encoding.

    filepath: path of the text file to read.
    Returns: list of lists of str.
    '''
    # The context manager closes the handle as soon as reading finishes,
    # instead of leaving that to garbage collection.
    with open(filepath) as f:
        return [line.split() for line in f]

def read_data(filepath):
    '''
    Read a file and return all of its whitespace-separated words as one list.

    Words appear in file order. Blank lines contribute nothing.
    The file is opened in text mode with the platform default encoding.

    filepath: path of the text file to read.
    Returns: flat list of str.
    '''
    # The context manager closes the handle as soon as reading finishes,
    # instead of leaving that to garbage collection.
    with open(filepath) as f:
        return [word for line in f for word in line.split()]

def get_word_count(sentences):
    '''
    Flatten tokenized sentences and report total and distinct word counts.

    sentences: iterable of iterables of words.
    Returns a tuple (all_words, word_count, distinct_word_count), where
    all_words is the flat list of words in their original order.
    '''
    flat = []
    for tokens in sentences:
        flat.extend(tokens)
    total = len(flat)
    unique = len(set(flat))
    return flat, total, unique

# Read data

In [8]:
# Load each corpus as a flat list of whitespace-separated tokens and report
# its size. Tokens are compared as-is, so the distinct counts below are
# case-sensitive and keep any punctuation attached to a word.
ang_words = get_words(AngTextPath)
print('Anglo-Saxon word count:', len(ang_words))
eng_words = get_words(EngTextPath)
print('English word count:', len(eng_words))
# Vocabulary size = number of unique token strings.
print('Anglo-Saxon distinct word count:', len(set(ang_words)))
print('English distinct word count:', len(set(eng_words)))


Anglo-Saxon word count: 488612
English word count: 3564443
Anglo-Saxon distinct word count: 60575
English distinct word count: 111999


In [20]:
# numpy is already imported as `np` in the imports cell at the top of the
# notebook, so it is not re-imported here.

# get_sentences() returns one entry per line of the file, so "sentence"
# below means "line of the corpus file".
ang_stc = get_sentences(AngTextPath)
print('Anglo-Saxon sentence count:', len(ang_stc))
eng_stc = get_sentences(EngTextPath)
print('English sentence count:', len(eng_stc))

# Calculate the length of each sentence (in whitespace-separated tokens).
# np.std is called with its default arguments, i.e. the population standard
# deviation (ddof=0).
ang_stc_len = [len(sentence.split()) for sentence in ang_stc]
print('Ang Mean sentence length:', np.mean(ang_stc_len))
print('Ang Standard deviation of sentence length:', np.std(ang_stc_len))

eng_stc_len = [len(sentence.split()) for sentence in eng_stc]
print('Eng Mean sentence length:', np.mean(eng_stc_len))
print('Eng Standard deviation of sentence length:', np.std(eng_stc_len))


Anglo-Saxon sentence count: 1432
English sentence count: 16976
Ang Mean sentence length: 341.20949720670393
Ang Standard deviation of sentence length: 290.300772498378
Eng Mean sentence length: 209.9695452403393
Eng Standard deviation of sentence length: 166.68178503716834


# Metadata stats

In [27]:
EngGrantPath = '../data/metadata/eng/EngOrdGrant'
AngGrantPath = '../data/metadata/ang/AngOrdGrant'

# Tokenize each label file once and build the frequency table from the tokens
# already in memory, instead of reading and splitting the same file a second
# time through get_freq(). The resulting Counter is identical.
ang_grant = get_words(AngGrantPath)
ang_grant_freq = Counter(ang_grant)
print("Ang grant", len(ang_grant))
print("Ang grant", ang_grant_freq)
eng_grant = get_words(EngGrantPath)
eng_grant_freq = Counter(eng_grant)
print("Eng grant", len(eng_grant))
print("Eng grant", eng_grant_freq)

Ang grant 1432
Ang grant Counter({'1': 933, '0': 499})
Eng grant 16976
Eng grant Counter({'0': 13419, '1': 3557})


In [28]:
AngConfPath = '../data/metadata/ang/AngOrdConfirm'
EngConfPath = '../data/metadata/eng/EngOrdConfirm'

# Tokenize each label file once and build the frequency table from the tokens
# already in memory, instead of reading and splitting the same file a second
# time through get_freq(). The resulting Counter is identical.
ang_conf = get_words(AngConfPath)
ang_conf_freq = Counter(ang_conf)
print("Ang conf", len(ang_conf))
print("Ang conf", ang_conf_freq)
eng_conf = get_words(EngConfPath)
eng_conf_freq = Counter(eng_conf)
print("Eng conf", len(eng_conf))
print("Eng conf", eng_conf_freq)

Ang conf 1432
Ang conf Counter({'0': 1324, '1': 108})
Eng conf 16976
Eng conf Counter({'0': 14805, '1': 2171})


In [32]:
AngRelPath = '../data/metadata/ang/AngOrdRel'
EngRelPath = '../data/metadata/eng/EngOrdRel'

# Tokenize each label file once and build the frequency table from the tokens
# already in memory, instead of reading and splitting the same file a second
# time through get_freq(). The resulting Counter is identical.
ang_rel = get_words(AngRelPath)
ang_rel_freq = Counter(ang_rel)
print("Ang rel", len(ang_rel))
print("Ang rel", ang_rel_freq)
eng_rel = get_words(EngRelPath)
eng_rel_freq = Counter(eng_rel)
print("Eng rel", len(eng_rel))
print("Eng rel", eng_rel_freq)

Ang rel 1432
Ang rel Counter({'Benedictine': 1212, 'NA': 220})
Eng rel 16976
Eng rel Counter({'NA': 9487, 'Augustinian': 2897, 'Benedictine': 2866, 'Cistercian': 766, 'Hospitaller': 247, 'Cluniac': 226, 'Premonstratensian': 190, 'Gilbertine': 172, 'Templar': 66, 'Franciscan': 59})
