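This notebook builds fixed-size samples from the tagged texts, then computes relative frequencies for each most-frequent word (MFW) and each most-frequent POS trigram (MFP). It also builds the POS-MFW sequence of each sample, in which every token that is neither an MFW nor punctuation is replaced by its POS tag.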

In [1]:
# Documenting
from typing import Generator, List, Tuple

# OS
import glob
import os.path
import random

# Data
import csv
import json
import lxml.etree as ET
import pandas as pd
from collections import Counter

# Operations
import regex as re
import unicodedata

# UI
import tqdm

## Constants

In [2]:
# Feature-space sizes
NB_MFW = 1000  # number of most-frequent words kept from mfw.json
NB_MFT = 1000  # not referenced by the cells visible in this notebook
NB_MFP = 100   # number of most-frequent POS trigrams kept from mfp.json

# Sampling
SAMPLE_SIZE = 1000         # non-punctuation tokens per sample
MAX_NB_SAMPLE = 5          # upper bound on the number of samples kept per text
MARGIN = SAMPLE_SIZE // 2  # tokens skipped at the start of a text before sampling

# Reproducibility: samples are shuffled with `random.shuffle` before selection,
# so the global RNG is seeded here to make "Restart & Run All" deterministic.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

## Reading functions

In [3]:
def read_tsv(file):
    """Yield the first two whitespace-separated fields of every non-blank line.

    Despite the name, fields are split on any run of whitespace, not only on
    tabs. Blank (or whitespace-only) lines are skipped: they would otherwise
    yield an empty list, which callers unpacking ``token, pos`` cannot handle.

    :param file: Iterable of text lines, e.g. an open file handle.
    :yields: A list holding the first two fields of the line (a single-item
        list if the line only has one field).
    """
    for line in file:
        fields = line.split()
        if not fields:
            # Nothing on this line (separator or trailing newline)
            continue
        yield fields[:2]

def normalize_tsv(file):
    """Yield ``(lower-cased token, first character of the POS tag)`` pairs.

    :param file: Iterable of text lines, forwarded to ``read_tsv``.
    :yields: A 2-tuple per line read by ``read_tsv``.
    """
    for fields in read_tsv(file):
        # Exactly two fields are expected per line
        token, tag = fields
        yield token.lower(), tag[0]
        
def get_tokens(file):
    """Yield the normalized ``(token, POS)`` pairs of one tagged text.

    The text is read from ``./tagged/{file}-tagged.txt``. The file is opened
    explicitly as UTF-8 so that reading does not depend on the platform's
    default locale encoding.

    :param file: Identifier of the text, used to build the file name.
    :yields: Pairs produced by ``normalize_tsv``.
    """
    with open(f"./tagged/{file}-tagged.txt", encoding="utf-8") as f:
        yield from normalize_tsv(f)

## Loading the MFW and MFP lists

In [4]:
def load_mfw(nb=500):
    """Load the most-frequent words from ``./mfw.json``.

    The JSON document is expected to be a sequence of ``[token, count]``
    pairs; the apostrophe token ``'`` is dropped before truncating. The
    file is opened explicitly as UTF-8, the encoding JSON text is defined in.

    :param nb: Maximum number of tokens to return.
    :return: The first ``nb`` tokens of the file, in file order.
    """
    with open("./mfw.json", encoding="utf-8") as f:
        pairs = json.load(f)
    return [token for token, _count in pairs if token != "'"][:nb]

def load_mfp(nb=500):
    """Load the most-frequent POS patterns from ``./mfp.json``.

    The JSON document is expected to be a sequence of ``[pattern, count]``
    pairs. The file is opened explicitly as UTF-8, the encoding JSON text is
    defined in.

    :param nb: Maximum number of patterns to return.
    :return: The first ``nb`` patterns of the file, in file order.
    """
    with open("./mfp.json", encoding="utf-8") as f:
        pairs = json.load(f)
    return [pattern for pattern, _count in pairs][:nb]

# Feature vocabularies, loaded once and read as globals by the cells below:
# MFW holds up to NB_MFW word tokens, MFP up to NB_MFP POS patterns.
MFW = load_mfw(NB_MFW)
MFP = load_mfp(NB_MFP)
# Preview the first ten POS patterns
MFP[:10]

['v-l-n',
 'l-n-v',
 'n-l-n',
 'l-n-l',
 'r-l-n',
 'l-n-p',
 'l-a-n',
 'l-n-n',
 'n-v-l',
 'l-n-b']

## Sampling and feature helper functions

In [5]:
# Readability aliases used in the annotations below for the two members of a
# (token, POS tag) pair; both are plain strings.
TOKEN_TYPE = str
POS_TYPE = str


def is_punct(token: str, pos: str) -> bool:
    """Tell whether a ``(token, pos)`` pair counts as punctuation.

    A pair is punctuation when its POS tag is ``"u"`` or when the token
    itself is an apostrophe.
    """
    if token == "'":
        return True
    return pos == "u"


def extract_tokens(
    inputs: List[Tuple[TOKEN_TYPE, POS_TYPE]],
    sample_size: int
) -> Generator[List[Tuple[TOKEN_TYPE, POS_TYPE]], None, None]:
    """Cut a tagged text into consecutive samples of ``sample_size`` words.

    This is a generator: samples are yielded one at a time, not returned as
    a list. Punctuation pairs (see ``is_punct``) are kept inside the samples
    but do not count towards ``sample_size``. A sample is closed as soon as
    it holds ``sample_size`` non-punctuation pairs, so punctuation that
    follows the last word of a sample opens the next one. Trailing pairs
    that do not fill a complete sample are discarded.

    :param inputs: ``(token, pos)`` pairs in reading order.
    :param sample_size: Number of non-punctuation pairs per sample.
    :yields: Each complete sample as a list of ``(token, pos)`` pairs.
    """
    sample: List[Tuple[TOKEN_TYPE, POS_TYPE]] = []
    nb_words = 0

    for token, pos in inputs:
        sample.append((token, pos))
        if not is_punct(token, pos):
            nb_words += 1

        if nb_words >= sample_size:
            yield sample
            # Start a fresh list: the yielded one now belongs to the caller
            sample, nb_words = [], 0
            
def get_pos_text(
    text: List[Tuple[TOKEN_TYPE, POS_TYPE]]
) -> str:
    """Build the POS-MFW rendering of a sample.

    Tokens whose lower-cased form is in ``MFW``, and punctuation pairs, are
    kept as they are; every other token is replaced by its POS tag. Items
    are joined with single spaces.

    :param text: ``(token, pos)`` pairs of one sample.
    :return: The space-joined mixed token / POS sequence.
    """
    # MFW is a list: build the lookup set once per call rather than
    # scanning the whole list for every token.
    mfw = frozenset(MFW)
    return " ".join(
        tok if tok.lower() in mfw or is_punct(tok, pos) else pos
        for tok, pos in text
    )

def get_trigrams(tokens: List[str]) -> List[str]:
    """Return every run of three consecutive items, joined with ``-``.

    Sequences shorter than three items give an empty list.
    """
    return [
        "-".join(window)
        for window in zip(tokens, tokens[1:], tokens[2:])
    ]

def get_text(tokens):
    """Join the token of every ``(token, pos)`` pair with single spaces."""
    words = (token for token, _pos in tokens)
    return " ".join(words)

## Importing preparsed texts

In [6]:
# Metadata table of the TLG texts; the feature loop below reads the `file`,
# `author`, `title` and `textgroup` columns of each row.
texts = pd.read_csv("tlg-texts.csv")

## Get the features

In [7]:
# One record per retained sample: metadata, raw and POS-MFW text, and the
# relative frequency of every MFW / MFP feature observed in the sample.
out_data = []

# MFW and MFP are lists; sets make the per-token membership tests O(1).
mfw_set = frozenset(MFW)
mfp_set = frozenset(MFP)

for idx, text in tqdm.tqdm(texts.iterrows(), total=len(texts)):
    poses = list(get_tokens(text.file))
    # NOTE: `size` counts punctuation too, whereas samples are measured in
    # non-punctuation tokens, so `potential_samples` is an upper estimate.
    size = len(poses)
    local_margin = MARGIN
    local_samples = 1
    # We count the number of potential samples ahead
    potential_samples = (size - MARGIN*2) // SAMPLE_SIZE

    # If the text is shorter than margins + one sample, shrink the margin
    if (MARGIN*2+SAMPLE_SIZE) > size:
        # Clamp at 0: a negative margin would slice from the END of the text
        local_margin = max(0, (size - SAMPLE_SIZE) // 2)
    # Otherwise, check if we can extract multiple samples
    elif potential_samples > 1:
        local_samples = min(potential_samples, MAX_NB_SAMPLE)

    # NOTE(review): only the leading margin is skipped here; the tail of the
    # text is still sampled. Confirm whether a trailing margin was intended.
    samples = list(extract_tokens(poses[local_margin:], SAMPLE_SIZE))
    # We shuffle the sample order
    random.shuffle(samples)

    for sample in samples[:local_samples]:
        modified_text = get_pos_text(sample)
        # Split the non-punctuation pairs into parallel token / tag tuples
        words, tags = zip(*[(tok, pos) for tok, pos in sample if not is_punct(tok, pos)])
        mfw_counts = Counter(tok for tok in words if tok in mfw_set)
        mfw_total = sum(mfw_counts.values())
        trigram_counts = Counter(trig for trig in get_trigrams(tags) if trig in mfp_set)
        trigram_total = sum(trigram_counts.values())
        out_data.append({
            "file": text["file"],
            "author": text["author"],
            "title": text["title"],
            "textgroup": text["textgroup"],
            "tokens": get_text(sample),
            "length": len(words),
            "modified_text": modified_text,
            # Frequencies are relative to the retained features only
            **{
                f"$POS${trigram}": freq/trigram_total
                for trigram, freq in trigram_counts.items()
            },
            **{
                f"$MFW${tok}": freq/mfw_total
                for tok, freq in mfw_counts.items()
            }
        })

700it [00:26, 26.91it/s]


## Exporting

In [8]:
# One row per sample. Records only carry the features observed in their
# sample, so a feature absent from a sample is NaN (not 0) in the table.
df = pd.DataFrame(out_data)
df.to_csv("tlg-features.csv", index=False)
# (number of samples, number of columns)
df.shape

(2322, 1107)

In [9]:
# List every distinct author label, alphabetically, to eyeball the metadata
author_names = sorted(df["author"].unique())
for author_name in author_names:
    print(author_name)

Adamantius
Alexander
Amphilochius
Antonius Hagiographus
Asterius
Asterius Sophista
Athanasius
Barlaam,
Basilius
Clemens Alexandrinus
Clemens Romanus Clementina
Cyrillus
Didymus Caecus Didymus the Blind
Dio Chrysostomus
Ephraem
Ephraem Syrus
Epiphanius
Eusebius
Eustathius
Evagrius
Flavius Justinianus Imperator
Gregorius Nazianzenus
Gregorius Nyssenus
Gregorius Thaumaturgus
Hesychius
Hippolytus
Irenaeus
Joannes
Joannes Chrysostomus John Chrysostom
Joannes Damascenus John of Damascus
Julianus
Leontius
Marcellus
Marcus Diaconus
Maximus Confessor
Nemesius
Nicephorus I,
Nicetas Choniates,
Nicolaus I Mysticus
Olympiodorus Diaconus
Origenes
Palladius
Petrus
Procopius
Salaminius Hermias Sozomenus
Severianus
Theodoretus
Theodorus Studites


In [11]:
# Same feature extraction as for the TLG corpus, applied to the PC texts.
# NOTE: this cell rebinds `texts` and `df`, replacing the TLG objects.
texts = pd.read_csv("pc-texts.csv")

pc_data = []

# MFW and MFP are lists; sets make the per-token membership tests O(1).
mfw_set = frozenset(MFW)
mfp_set = frozenset(MFP)

for idx, text in tqdm.tqdm(texts.iterrows(), total=len(texts)):
    poses = list(get_tokens(text.file))

    # Each PC text is used whole, as a single sample. To sample it the way
    # the TLG corpus is sampled, use instead:
    #   samples = list(extract_tokens(poses[local_margin:], SAMPLE_SIZE))
    # together with the margin / sample-count logic of the TLG cell.
    samples = [poses]

    for sample in samples:
        word_pairs = [(tok, pos) for tok, pos in sample if not is_punct(tok, pos)]
        if not word_pairs:
            # A text without any non-punctuation token has no features, and
            # zip(*[]) could not be unpacked into two names: skip it.
            continue
        modified_text = get_pos_text(sample)
        words, tags = zip(*word_pairs)
        mfw_counts = Counter(tok for tok in words if tok in mfw_set)
        mfw_total = sum(mfw_counts.values())
        trigram_counts = Counter(trig for trig in get_trigrams(tags) if trig in mfp_set)
        trigram_total = sum(trigram_counts.values())
        pc_data.append({
            "file": text["file"],
            "author": text["author"],
            "title": text["title"],
            # No "textgroup" field is written for this corpus
            "tokens": get_text(sample),
            "length": len(words),
            "modified_text": modified_text,
            # Frequencies are relative to the retained features only
            **{
                f"$POS${trigram}": freq/trigram_total
                for trigram, freq in trigram_counts.items()
            },
            **{
                f"$MFW${tok}": freq/mfw_total
                for tok, freq in mfw_counts.items()
            }
        })

df = pd.DataFrame(pc_data)
df.to_csv("pc-features.csv", index=False)

70it [00:01, 52.69it/s]
