In [1]:
from preproc import preprocess_and_slice_text_files
import logging
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
logging.getLogger().setLevel(logging.INFO)

In [3]:
raw_chunks = preprocess_and_slice_text_files("../data/txt")

INFO:root:'BurCom.txt' was processed and split into 24 samples
INFO:root:'AristPha.txt' was processed and split into 20 samples
INFO:root:'BurInt.txt' was processed and split into 64 samples
INFO:root:'AnonMetaph.txt' was processed and split into 61 samples
INFO:root:'BurEthN.txt' was processed and split into 4 samples
INFO:root:'BarMor.txt' was processed and split into 20 samples
INFO:root:'AristMet.txt' was processed and split into 6 samples
INFO:root:'BarMirab.txt' was processed and split into 7 samples
INFO:root:'WilMet.txt' was processed and split into 30 samples
INFO:root:'BxxRhet.txt' was processed and split into 14 samples
INFO:root:'JamPhys.txt' was processed and split into 53 samples
INFO:root:'WilInPar.txt' was processed and split into 161 samples
INFO:root:'BarMun.txt' was processed and split into 5 samples
INFO:root:'WilTet.txt' was processed and split into 33 samples
INFO:root:'BarSig.txt' was processed and split into 3 samples
INFO:root:'BarPri.txt' was processed and spl

In [4]:
# list of lists, so we can keep info about spelling variants in individual arrays

stops = []

with open("../data/functionwords.txt", "r") as file:
    for line in file:
        words = [x for x in line.lower().split() if x != "/"]
        stops.append(list(set(words)))

print(stops[:3])

[['atqui', 'atqvi'], ['aut', 'avt'], ['autem', 'avtem']]


In [6]:
raw_chunks

Unnamed: 0,Translator,Chunk,Work
0,Bur,quoniam quidem ex calido et frigido et sicco e...,Com
1,Bur,sermo non quod nunquam fit in uno eodemque cor...,Com
2,Bur,horis anni invenire quartam coniugationem comp...,Com
3,Bur,quod necesse est in ea putrefieri omnia incipi...,Com
4,Bur,de ipsis per capitula quantum ad presentia uti...,Com
...,...,...,...
1196,Bur,ostendat quod secundum veritatem est homo cum ...,Fid
1197,Bur,causative dicere ut hoc tibi soli peccavi et p...,Fid
1198,Bur,condemnavit peccatum in carne ut iustitia legi...,Fid
1199,Bur,uxorem suam et concepit et genuit quare propte...,Fid


In [5]:
# this is tied to our particular filename convention
# TranslatorWorkNamePossiblySeveralWords.txt

entries = []
for k, txt in raw_chunks.items():
    # grab the part before the chunk number in the key, split into translator
    # and work. `if x` drops empty strings that come from re.split.
    ww = [x for x in re.split("([A-Z][a-z]*)", k.split("_")[0]) if x]
    transl = ww[0]
    work = "".join(ww[1:])
    chunk = " ".join(txt)
    entries.append({"Translator": transl, "Work": work, "Chunk": chunk})
chunk_df = pd.DataFrame(entries)
chunk_df

Unnamed: 0,Translator,Work,Chunk
0,Translator,,Bur Bur Bur Bur Bur Bur Bur Bur Bur Bur Bur Bu...
1,Chunk,,quoniam quidem ex calido et frigido et sicco e...
2,Work,,Com Com Com Com Com Com Com Com Com Com Com Co...


In [7]:
chunk_df = raw_chunks.copy()

In [8]:
chunk_df.Work.unique()

array(['Com', 'Pha', 'Int', 'Metaph', 'EthN', 'Mor', 'Met', 'Mirab',
       'Rhet', 'Phys', 'InPar', 'Mun', 'Tet', 'Sig', 'Pri', 'InTim',
       'Cael', 'Gen', 'His', '6', 'Elem', 'Simp', 'Anim', '3', 'GenA',
       'Ion', 'Pue', 'Mat', 'Men', 'Alex', 'EthV', 'Fid'], dtype=object)

In [9]:
flat_stops = [item for sublist in stops for item in sublist]

In [10]:
# doing this without IDF really just makes it a normalised CountVectorizer. Not
# using IDF because the frequency 'boosting' is really a form a fitting, and we
# want to save that for the classification algorithms.

# L1 vs L2 normalisation is a methodological question. Since we'll be clustering
# these as 'points' L2 has slightly more theoretical support, but either would
# almost certainly be fine.
v = TfidfVectorizer(use_idf=False, analyzer="word", decode_error="replace", norm="l2")
# only count our stopwords
v.fit(flat_stops)


def vectorize(s):
    # make a df with all function words
    X = v.transform([s])
    df = pd.DataFrame(X.toarray())
    df.columns = v.get_feature_names_out()

    # make a blank df
    stops_df_combined = pd.DataFrame()
    for ary in stops:
        ary = sorted(list(set(ary)))
        # each array is a list of spelling variants. sum the counts for all the
        # variants of this stopword that appear in the df, using the first array
        # entry as the label
        stops_df_combined[ary[0]] = df[df.columns.intersection(ary)].sum(axis=1)
    return stops_df_combined

In [11]:
X = pd.concat([vectorize(x) for x in chunk_df.Chunk], axis=0).reset_index(drop=True)
X

Unnamed: 0,atqui,aut,autem,certe,ceu,confestim,cum,dehinc,deinceps,demum,...,sic,sicut,siquidem,tamquam,ut,utique,uelut,ueluti,uero,uidelicet
0,0.0,0.097100,0.534050,0.0,0.0,0.0,0.145650,0.0,0.024275,0.0,...,0.000000,0.000000,0.000000,0.0,0.145650,0.218475,0.024275,0.0,0.218475,0.0
1,0.0,0.200297,0.578636,0.0,0.0,0.0,0.155787,0.0,0.000000,0.0,...,0.000000,0.000000,0.022255,0.0,0.222552,0.311573,0.022255,0.0,0.066766,0.0
2,0.0,0.230022,0.575055,0.0,0.0,0.0,0.000000,0.0,0.023002,0.0,...,0.046004,0.000000,0.000000,0.0,0.253024,0.138013,0.046004,0.0,0.230022,0.0
3,0.0,0.243044,0.710436,0.0,0.0,0.0,0.037391,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.280435,0.168261,0.000000,0.0,0.037391,0.0
4,0.0,0.649435,0.409426,0.0,0.0,0.0,0.070591,0.0,0.014118,0.0,...,0.000000,0.000000,0.000000,0.0,0.395308,0.211772,0.000000,0.0,0.070591,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,0.0,0.024500,0.538996,0.0,0.0,0.0,0.171499,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.587995,0.024500,0.000000,0.0,0.024500,0.0
1197,0.0,0.000000,0.421459,0.0,0.0,0.0,0.049583,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.223125,0.049583,0.024792,0.0,0.074375,0.0
1198,0.0,0.000000,0.738485,0.0,0.0,0.0,0.096324,0.0,0.000000,0.0,...,0.000000,0.032108,0.000000,0.0,0.481621,0.064216,0.032108,0.0,0.032108,0.0
1199,0.0,0.000000,0.666795,0.0,0.0,0.0,0.156893,0.0,0.000000,0.0,...,0.000000,0.078446,0.000000,0.0,0.274563,0.039223,0.000000,0.0,0.039223,0.0


In [12]:
stops_tidy = pd.concat([chunk_df, X], axis=1)
stops_tidy

Unnamed: 0,Translator,Chunk,Work,atqui,aut,autem,certe,ceu,confestim,cum,...,sic,sicut,siquidem,tamquam,ut,utique,uelut,ueluti,uero,uidelicet
0,Bur,quoniam quidem ex calido et frigido et sicco e...,Com,0.0,0.097100,0.534050,0.0,0.0,0.0,0.145650,...,0.000000,0.000000,0.000000,0.0,0.145650,0.218475,0.024275,0.0,0.218475,0.0
1,Bur,sermo non quod nunquam fit in uno eodemque cor...,Com,0.0,0.200297,0.578636,0.0,0.0,0.0,0.155787,...,0.000000,0.000000,0.022255,0.0,0.222552,0.311573,0.022255,0.0,0.066766,0.0
2,Bur,horis anni invenire quartam coniugationem comp...,Com,0.0,0.230022,0.575055,0.0,0.0,0.0,0.000000,...,0.046004,0.000000,0.000000,0.0,0.253024,0.138013,0.046004,0.0,0.230022,0.0
3,Bur,quod necesse est in ea putrefieri omnia incipi...,Com,0.0,0.243044,0.710436,0.0,0.0,0.0,0.037391,...,0.000000,0.000000,0.000000,0.0,0.280435,0.168261,0.000000,0.0,0.037391,0.0
4,Bur,de ipsis per capitula quantum ad presentia uti...,Com,0.0,0.649435,0.409426,0.0,0.0,0.0,0.070591,...,0.000000,0.000000,0.000000,0.0,0.395308,0.211772,0.000000,0.0,0.070591,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,Bur,ostendat quod secundum veritatem est homo cum ...,Fid,0.0,0.024500,0.538996,0.0,0.0,0.0,0.171499,...,0.000000,0.000000,0.000000,0.0,0.587995,0.024500,0.000000,0.0,0.024500,0.0
1197,Bur,causative dicere ut hoc tibi soli peccavi et p...,Fid,0.0,0.000000,0.421459,0.0,0.0,0.0,0.049583,...,0.000000,0.000000,0.000000,0.0,0.223125,0.049583,0.024792,0.0,0.074375,0.0
1198,Bur,condemnavit peccatum in carne ut iustitia legi...,Fid,0.0,0.000000,0.738485,0.0,0.0,0.0,0.096324,...,0.000000,0.032108,0.000000,0.0,0.481621,0.064216,0.032108,0.0,0.032108,0.0
1199,Bur,uxorem suam et concepit et genuit quare propte...,Fid,0.0,0.000000,0.666795,0.0,0.0,0.0,0.156893,...,0.000000,0.078446,0.000000,0.0,0.274563,0.039223,0.000000,0.0,0.039223,0.0


In [13]:
stops_tidy[stops_tidy.Work == "Rhet"]

Unnamed: 0,Translator,Chunk,Work,atqui,aut,autem,certe,ceu,confestim,cum,...,sic,sicut,siquidem,tamquam,ut,utique,uelut,ueluti,uero,uidelicet
236,Bxx,rethorica est convertibilis dialetice utreque ...,Rhet,0.0,0.394042,0.497737,0.0,0.0,0.0,0.020739,...,0.062217,0.041478,0.0,0.020739,0.165912,0.228129,0.0,0.0,0.331825,0.0
237,Bxx,iuverit maxime utens iuste et leserit iniuste ...,Rhet,0.0,0.27781,0.590345,0.0,0.0,0.0,0.052089,...,0.052089,0.156268,0.0,0.034726,0.121542,0.069452,0.0,0.0,0.347262,0.0
238,Bxx,horum cognitum non oportet dicere ipse enim au...,Rhet,0.0,0.267038,0.534076,0.0,0.0,0.0,0.066759,...,0.05007,0.233658,0.0,0.0,0.183588,0.066759,0.0,0.0,0.317107,0.0
239,Bxx,quomodo non lesit quoniam autem iniustificat n...,Rhet,0.0,0.524604,0.671493,0.0,0.0,0.0,0.0,...,0.020984,0.041968,0.0,0.020984,0.104921,0.125905,0.0,0.0,0.104921,0.0
240,Bxx,sunt ex quibus autem oportet et de hiis et ali...,Rhet,0.0,0.636098,0.371057,0.0,0.0,0.0,0.070678,...,0.070678,0.035339,0.0,0.0,0.088347,0.141355,0.0,0.0,0.512412,0.0
241,Bxx,ut puta quorum natura contingit vero et extra ...,Rhet,0.0,0.501868,0.259587,0.0,0.0,0.0,0.017306,...,0.017306,0.173058,0.0,0.0,0.173058,0.190364,0.0,0.0,0.380728,0.0
242,Bxx,quidem semper et plus ad minus magnum autem et...,Rhet,0.0,0.446895,0.169512,0.01541,0.0,0.0,0.01541,...,0.03082,0.092461,0.0,0.046231,0.123281,0.154102,0.0,0.0,0.385254,0.0
243,Bxx,dicit meleagrum persuaderi ab uxore dicente qu...,Rhet,0.0,0.279386,0.558772,0.0,0.0,0.0,0.059868,...,0.039912,0.099781,0.0,0.019956,0.079825,0.079825,0.0,0.0,0.439035,0.0
244,Bxx,de universali et de partibus dictum est secund...,Rhet,0.0,0.305869,0.415108,0.0,0.0,0.0,0.021848,...,0.043696,0.152935,0.0,0.043696,0.218478,0.174783,0.0,0.0,0.19663,0.0
245,Bxx,enim ex contrariis est de accusatione vero et ...,Rhet,0.0,0.480519,0.294512,0.015501,0.0,0.0,0.015501,...,0.0,0.015501,0.0,0.046502,0.062002,0.108504,0.0,0.0,0.558022,0.0


In [14]:
stops_tidy.to_csv("../data/corpus_rhet.csv")