In [1]:
import pandas as pd
import json

def load_mft(nb=500, path="./mft.json"):
    """Return the first ``nb`` entries of a JSON frequency list.

    The file must contain a JSON array of two-item ``[token, number]``
    entries. Entries are returned in file order (presumably most frequent
    first -- confirm against whatever produced the file).

    Parameters
    ----------
    nb : int, optional
        Maximum number of entries to return. The limit is applied *after*
        the apostrophe entry has been removed.
    path : str, optional
        Location of the JSON file. Defaults to ``"./mft.json"``.

    Returns
    -------
    list
        The first element of each kept entry, at most ``nb`` of them.
    """
    # Decode explicitly as UTF-8: the entries are polytonic Greek, and the
    # platform default encoding used by a bare open() is not UTF-8 everywhere.
    with open(path, encoding="utf-8") as f:
        pairs = json.load(f)
    # Drop the bare apostrophe entry, then truncate to the requested size.
    kept = [token for token, _number in pairs if token != "'"]
    return kept[:nb]

# The 1000 first entries of the frequency list; these drive the feature columns.
MFT = load_mft(nb=1000)
# One DataFrame column name per entry, in the same order as MFT.
KEYS = [f"$TRI${entry}" for entry in MFT]

In [2]:
def apply_mft(row):
    """Fill the ``KEYS`` columns of ``row`` with relative frequencies.

    ``row.tokens`` is stripped, padded with one space on each side, and
    every space is replaced by ``_``. Each entry of ``MFT`` is then counted
    in that string with ``str.count`` (non-overlapping occurrences) and
    divided by the total count over all ``MFT`` entries, so the written
    values sum to 1 whenever at least one entry occurs.

    Parameters
    ----------
    row : pandas.Series
        A row exposing a string ``tokens`` attribute and accepting
        assignment through the list of labels ``KEYS``.

    Returns
    -------
    pandas.Series
        The same ``row`` object, updated in place.
    """
    text = f" {row.tokens.strip()} ".replace(" ", "_")
    counts = [text.count(trigram) for trigram in MFT]
    total = sum(counts)
    if total:
        freqs = [count / total for count in counts]
    else:
        # No entry of MFT occurs (e.g. empty text): dividing would raise
        # ZeroDivisionError, so report an all-zero profile instead.
        freqs = [0.0] * len(counts)
    row[KEYS] = freqs
    return row

def update_dataframe(df):
    """Return a copy of ``df`` with the ``KEYS`` feature columns filled in.

    The ``KEYS`` columns are created (or reset) to 0 and then overwritten
    row by row by ``apply_mft``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that ``apply_mft`` can process row by row (it reads a
        ``tokens`` value from each row).

    Returns
    -------
    pandas.DataFrame
        A new frame; the argument is left untouched.
    """
    # Work on a copy: assigning through .loc on the argument itself would
    # leave the caller's frame with zero-filled feature columns as a hidden
    # side effect, while the real values live only in the returned frame.
    df = df.copy()
    df.loc[:, KEYS] = 0
    return df.apply(apply_mft, axis=1)

In [3]:
# Add the feature columns to each split and overwrite its CSV in place.
for split in ("train", "dev", "test"):
    csv_path = f"tlg-{split}.csv"
    update_dataframe(pd.read_csv(csv_path)).to_csv(csv_path, index=False)
 

In [7]:
# Same treatment for the PC feature table; the file is overwritten in place.
pc_path = "pc-features.csv"
x = update_dataframe(pd.read_csv(pc_path))
x.to_csv(pc_path, index=False)
# Last expression of the cell: display the updated frame.
x

Unnamed: 0,file,author,title,tokens,length,modified_text,$POS$v-d-l,$POS$d-l-n,$POS$l-n-a,$POS$n-b-n,...,$TRI$σῶμ,$TRI$οἷο,$TRI$λῳ_,$TRI$_ἤγ,$TRI$ἁπλ,$TRI$_λύ,$TRI$ἐνδ,$TRI$_λυ,$TRI$λεγ,$TRI$εἴρ
0,PC20-InIlludCredidiPropterQuodLocutusSum.xml,PC20,InIlludCredidiPropterQuodLocutusSum,φασί ποτε τὴν μέλισσαν κούφοις πτεροῖς δένδρεσ...,1869,φασί ποτε τὴν n a a n καὶ n a v καὶ τὴν ἐν ταῖ...,0.004892,0.009785,0.006849,0.013699,...,0.000000,0.000144,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000722,0.0000
1,PC13-InVenerabilemCrucem.xml,PC13,InVenerabilemCrucem,"τί εἴπω , ἢ τί λαλήσω ; ἢ τίνας ὑμᾶς καλέσω ; ...",2226,"τί εἴπω , ἢ τί v ; ἢ p ὑμᾶς v ; n , ἢ n ; n , ...",0.006839,0.009119,0.005319,0.020517,...,0.000250,0.000000,0.000625,0.000000,0.000125,0.000250,0.000250,0.000125,0.000500,0.0005
2,PCX-DeJenunioSermo5B.xml,PCX,DeJenunioSermo5B,ὁ ἔχων προσευχὴν καὶ νηστείαν καὶ ἐλεημοσύνην ...,688,"ὁ ἔχων n καὶ n καὶ n , τότε ἐστὶ a ἐν n , καὶ ...",,0.005682,0.005682,0.048295,...,0.002043,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000409,0.0000
3,PC6-DePoenintentiaSermo1.xml,PC6,DePoenintentiaSermo1,"αʹ . ἀεὶ μὲν μνημονεύειν θεοῦ καλὸν , καὶ πάνυ...",10306,"αʹ . ἀεὶ μὲν v n καλὸν , καὶ πάνυ a · καὶ τί ἂ...",0.005763,0.013199,0.008924,0.018591,...,0.000081,0.000000,0.000135,0.000135,0.000027,0.000162,0.000324,0.000054,0.000270,0.0000
4,PC16-InIlludSiQuaChristoaNovaCreatura.xml,PC16,InIlludSiQuaChristoaNovaCreatura,πολλὴ μὲν γηπόνῳ προθυμία καταβάλλειν τὰ σπέρμ...,5629,"πολλὴ μὲν a n v τὰ n , ἐπειδὰν τὴν n v v , καὶ...",0.004130,0.010483,0.004130,0.012389,...,0.000255,0.000000,0.000000,0.000000,0.000204,0.000153,0.000153,0.000153,0.000562,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,PC20b-InIlludSimileEstRegnumCaelorumGranoSinap...,PC20b,InIlludSimileEstRegnumCaelorumGranoSinapis,"τί μεῖζον βασιλείας οὐρανῶν , καὶ τί μικρότερο...",1456,"τί μεῖζον n n , καὶ τί a n v ; πῶς τὴν a n τῶν...",0.007134,0.013080,0.005945,0.015458,...,0.000602,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000201,0.0000
66,PC10-DePrecatione1.xml,PC10,DePrecatione1,ἀμφοτέρων ἕνεκα προσήκει τοὺς τοῦ θεοῦ θεράπον...,2009,ἀμφοτέρων ἕνεκα προσήκει τοὺς τοῦ n n v καὶ v ...,0.011215,0.014953,0.004673,0.041121,...,0.000268,0.000000,0.000000,0.000000,0.000000,0.000000,0.000134,0.000134,0.000134,0.0000
67,PC3-InMeretricem.xml,PC3,InMeretricem,σήμερον ὁ φαρισαῖος εἰς ἄριστον τὸν δεσπότην ἐ...,652,"σήμερον ὁ n εἰς a τὸν n ἐκάλεσε , n n v · ἡ δὲ...",0.005181,0.015544,0.005181,0.005181,...,0.000471,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000
68,PC4-InPaschaSermones6.xml,PC4,InPaschaSermones6,ἱεραὶ μὲν ἤδη φωτὸς αὐγάζουσι χριστοῦ ἀκτῖνες ...,6018,"a μὲν ἤδη n v n n καὶ a a n v n , a δὲ n καὶ n...",0.004851,0.010349,0.007762,0.026843,...,0.000524,0.000000,0.000191,0.000048,0.000095,0.000143,0.000095,0.000191,0.000143,0.0000
