In [1]:
from preproc import preprocess_and_slice_text_files
import logging
import re
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Let INFO-level records through on the root logger.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

In [6]:
# Load and chunk the text files under ../data/txt using the project-local helper
# imported from preproc. NOTE(review): the helper's internals are not visible in
# this notebook; later cells read the Translator, Chunk and Work columns of the
# returned frame.
raw_df = preprocess_and_slice_text_files('../data/txt')

INFO:root:'BurEthV.txt' was processed and split into 9 samples
INFO:root:'BarPri.txt' was processed and split into 2 samples
INFO:root:'WilHis.txt' was processed and split into 39 samples
INFO:root:'WilElem.txt' was processed and split into 25 samples
INFO:root:'JamAnim.txt' was processed and split into 19 samples
INFO:root:'BurGen.txt' was processed and split into 15 samples
INFO:root:'WilSimp.txt' was processed and split into 120 samples
INFO:root:'BarPro.txt' was processed and split into 10 samples
INFO:root:'WilAlex.txt' was processed and split into 72 samples
INFO:root:'WilRhet.txt' was processed and split into 15 samples
INFO:root:'WilGenA.txt' was processed and split into 44 samples
INFO:root:'Myst3.txt' was processed and split into 10 samples
INFO:root:'BarMor.txt' was processed and split into 20 samples
INFO:root:'AristPha.txt' was processed and split into 20 samples
INFO:root:'BurCom.txt' was processed and split into 24 samples
INFO:root:'AnonPhys.txt' was processed and split

In [7]:
# Function words to count. Each line of the file holds one word together with
# its spelling variants as whitespace-separated tokens (standalone '/' tokens
# are ignored). We keep a list of lists so the variants of each word stay
# grouped in their own array.

stops = []

# Explicit encoding so the result does not depend on the platform default.
with open('../data/functionwords.txt', 'r', encoding='utf-8') as file:
    for line in file:
        words = {x for x in line.lower().split() if x != '/'}
        # A blank (or '/'-only) line would otherwise produce an empty group.
        if not words:
            continue
        # Sorted so the order inside each group is the same on every run
        # (plain set iteration order for strings varies between processes).
        stops.append(sorted(words))

print(stops[:3])

[['atqui', 'atqvi'], ['aut', 'avt'], ['autem', 'avtem']]


In [8]:
# Distinct work labels present in the corpus.
raw_df['Work'].unique()

array(['EthV', 'Pri', 'His', 'Elem', 'Anim', 'Gen', 'Simp', 'Pro', 'Alex',
       'Rhet', 'GenA', '3', 'Mor', 'Pha', 'Com', 'Phys', 'Int', 'Met',
       'Sig', 'Tet', '6', 'Mun', 'Pue', 'Mat', 'InPar', 'Ion', 'Men',
       'Fid', 'Mirab', 'Metaph', 'Cael', 'InTim', 'EthN'], dtype=object)

In [9]:
# Flatten the variant groups into a single list containing every spelling.
flat_stops = []
for variants in stops:
    flat_stops.extend(variants)

In [10]:
# With IDF switched off this is effectively a normalised CountVectorizer. IDF is
# left out because its frequency 'boosting' is really a form of fitting, and we
# want to save that for the classification algorithms.

# L1 vs L2 normalisation is a methodological question. Since we'll be clustering
# these as 'points', L2 has slightly more theoretical support, but either would
# almost certainly be fine.
v = TfidfVectorizer(
    analyzer='word',
    norm='l2',
    use_idf=False,
    decode_error='replace',
)
# Fit on the function-word spellings so that only our stopwords are counted.
v.fit(flat_stops)

def vectorize(s):
    """Return a one-row DataFrame of function-word weights for the text ``s``.

    ``s`` is transformed with the module-level vectorizer ``v``. The weights of
    all spelling variants in each group of the module-level ``stops`` list are
    then summed into a single column, labelled with the alphabetically first
    variant of that group.

    The row is not re-normalised after the variants are summed; whatever
    normalisation ``v`` applies happens before the summing.
    """
    # One row holding a weight for every function-word spelling known to v.
    X = v.transform([s])
    df = pd.DataFrame(X.toarray(), columns=v.get_feature_names_out())

    combined = {}
    for ary in stops:
        # Skip empty groups (e.g. from a blank line in the word list), which
        # would otherwise raise IndexError on ary[0] below.
        if not ary:
            continue
        ary = sorted(set(ary))
        # Each array is a list of spelling variants: sum the weights of every
        # variant that is present in df, using the first entry as the label.
        combined[ary[0]] = df[df.columns.intersection(ary)].sum(axis=1)

    # Build the frame in one go instead of inserting columns one at a time,
    # which fragments the frame and can trigger pandas' PerformanceWarning.
    return pd.DataFrame(combined, index=df.index)

In [12]:
# NB NOT SCALED!!

# One row of function-word weights per chunk, stacked under a fresh 0..n-1 index.
chunk_rows = [vectorize(x) for x in raw_df.Chunk]
X = pd.concat(chunk_rows, ignore_index=True)
X

Unnamed: 0,atqui,aut,autem,certe,ceu,confestim,cum,dehinc,deinceps,demum,...,sic,sicut,siquidem,tamquam,ut,utique,uelut,ueluti,uero,uidelicet
0,0.000000,0.062338,0.685717,0.0,0.0,0.0,0.020779,0.0,0.000000,0.0,...,0.207793,0.000000,0.0,0.000000,0.207793,0.041559,0.0,0.0,0.145455,0.0
1,0.000000,0.049793,0.813285,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.165977,0.000000,0.0,0.000000,0.016598,0.066391,0.0,0.0,0.066391,0.0
2,0.000000,0.000000,0.896433,0.0,0.0,0.0,0.011493,0.0,0.000000,0.0,...,0.034478,0.000000,0.0,0.000000,0.091942,0.011493,0.0,0.0,0.045971,0.0
3,0.000000,0.064567,0.774798,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.096850,0.000000,0.0,0.000000,0.080708,0.064567,0.0,0.0,0.032283,0.0
4,0.000000,0.150855,0.822845,0.0,0.0,0.0,0.054856,0.0,0.000000,0.0,...,0.013714,0.000000,0.0,0.000000,0.095999,0.123427,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1206,0.025294,0.328824,0.227648,0.0,0.0,0.0,0.000000,0.0,0.075883,0.0,...,0.126471,0.075883,0.0,0.075883,0.430001,0.000000,0.0,0.0,0.455295,0.0
1207,0.000000,0.298142,0.238514,0.0,0.0,0.0,0.059628,0.0,0.000000,0.0,...,0.089443,0.000000,0.0,0.000000,0.268328,0.000000,0.0,0.0,0.566471,0.0
1208,0.000000,0.380143,0.253429,0.0,0.0,0.0,0.050686,0.0,0.000000,0.0,...,0.050686,0.000000,0.0,0.025343,0.456172,0.000000,0.0,0.0,0.481514,0.0
1209,0.030331,0.242647,0.242647,0.0,0.0,0.0,0.060662,0.0,0.000000,0.0,...,0.060662,0.060662,0.0,0.000000,0.121324,0.000000,0.0,0.0,0.454963,0.0


In [51]:
# X was built row by row from raw_df.Chunk and carries a fresh 0..n-1 index, so
# the two frames correspond by position. Align them positionally rather than by
# index label, so a non-default raw_df index cannot silently introduce NaN rows.
if len(raw_df) != len(X):
    raise ValueError(
        f"row count mismatch: raw_df has {len(raw_df)} rows, X has {len(X)}"
    )
stops_tidy = pd.concat([raw_df.reset_index(drop=True), X], axis=1)
stops_tidy

Unnamed: 0,Translator,Chunk,Work,atqui,aut,autem,certe,ceu,confestim,cum,...,sic,sicut,siquidem,tamquam,ut,utique,uelut,ueluti,uero,uidelicet
0,Bur,duplici autem virtute existente hac quidem int...,EthV,0.000000,0.062338,0.685717,0.0,0.0,0.0,0.020779,...,0.207793,0.000000,0.0,0.000000,0.207793,0.041559,0.0,0.0,0.145455,0.0
1,Bur,malicia autem contrarium fiet autem nobis et e...,EthV,0.000000,0.049793,0.813285,0.0,0.0,0.0,0.000000,...,0.165977,0.000000,0.0,0.000000,0.016598,0.066391,0.0,0.0,0.066391,0.0
2,Bur,bene habentibus operis quoniam neque auferendu...,EthV,0.000000,0.000000,0.896433,0.0,0.0,0.0,0.011493,...,0.034478,0.000000,0.0,0.000000,0.091942,0.011493,0.0,0.0,0.045971,0.0
3,Bur,et medietas amicicia qui superhabundat autem q...,EthV,0.000000,0.064567,0.774798,0.0,0.0,0.0,0.000000,...,0.096850,0.000000,0.0,0.000000,0.080708,0.064567,0.0,0.0,0.032283,0.0
4,Bur,iacit volens in salute autem sui ipsius et rel...,EthV,0.000000,0.150855,0.822845,0.0,0.0,0.0,0.054856,...,0.013714,0.000000,0.0,0.000000,0.095999,0.123427,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1206,Anon,palam nec enim impar infinitus est nec par gen...,Metaph,0.025294,0.328824,0.227648,0.0,0.0,0.0,0.000000,...,0.126471,0.075883,0.0,0.075883,0.430001,0.000000,0.0,0.0,0.455295,0.0
1207,Anon,aut non quare eadem accident et sic aut enim n...,Metaph,0.000000,0.298142,0.238514,0.0,0.0,0.0,0.059628,...,0.089443,0.000000,0.0,0.000000,0.268328,0.000000,0.0,0.0,0.566471,0.0
1208,Anon,quidem potentia illud vero actu ergo potentia ...,Metaph,0.000000,0.380143,0.253429,0.0,0.0,0.0,0.050686,...,0.050686,0.000000,0.0,0.025343,0.456172,0.000000,0.0,0.0,0.481514,0.0
1209,Anon,tempori non finis est non etiam erit sempitern...,Metaph,0.030331,0.242647,0.242647,0.0,0.0,0.0,0.060662,...,0.060662,0.060662,0.0,0.000000,0.121324,0.000000,0.0,0.0,0.454963,0.0


In [32]:
# Work identifiers (file name up to the first '.') for every .txt file in the
# corpus directory, in sorted order. Restricting to .txt keeps stray directory
# entries (hidden files, checkpoint folders, ...) out of the list; any extra
# entry would shift the positional pairing with the author list built below.
workfns = sorted(
    x.split('.')[0]
    for x in os.listdir('../data/txt')
    if x.lower().endswith('.txt')
)

In [37]:
# Author of each work, one name per line, manually copied from an Excel
# spreadsheet. The order matters: a later cell pairs these names by position
# with the sorted file names from ../data/txt.

s="""Aristoteles
Aristoteles
Plato
Aristoteles
Plato
Aristoteles
Aristoteles
Aristoteles
Theophrastus
Aristoteles
Hippocrates
Aristoteles
Theophrastus
Galenus
Aristoteles
Aristoteles
Johannes Damascenus
Aristoteles
Galenus
Johannes Chrysostomus
Johannes Chrysostomus
Aristoteles
Aristoteles
Aristoteles
Unknown
Unknown
Alexander of Aphrodisias
Aristoteles
Proclus
Aristoteles
Aristoteles
Proclus
Proclus
Aristoteles
Aristoteles
Simplicius
Ptolemaeus"""

In [41]:
# One author per line of s. splitlines() avoids the detour through commas, which
# would wrongly split any author name that itself contains a comma.
authors = s.splitlines()

In [45]:
# zip() silently stops at the shorter input, so a missing or extra entry would
# go unnoticed and could pair works with the wrong authors. Check explicitly.
if len(workfns) != len(authors):
    raise ValueError(
        f"{len(workfns)} work files but {len(authors)} authors; "
        "the two lists must correspond one-to-one"
    )

work_lookup = dict(zip(workfns, authors))

In [46]:
def find_author(r, lookup=None):
    """Return the author of the work described by row ``r``.

    Parameters
    ----------
    r : object with ``Translator`` and ``Work`` string attributes
        For example a DataFrame row as passed by ``DataFrame.apply(axis=1)``.
    lookup : mapping, optional
        Maps ``Translator + Work`` keys to author names. Defaults to the
        module-level ``work_lookup``.

    Raises
    ------
    KeyError
        If the combined key is not present in the lookup; the message names
        the offending translator and work.
    """
    if lookup is None:
        lookup = work_lookup
    k = r.Translator + r.Work
    try:
        return lookup[k]
    except KeyError:
        raise KeyError(
            f"no author recorded for {k!r} "
            f"(Translator={r.Translator!r}, Work={r.Work!r})"
        ) from None

In [52]:
# Look up the author of every row and store it as column 3 of stops_tidy.
# Re-running this cell must not fail, so overwrite the column if it already
# exists instead of inserting it a second time (insert() raises on duplicates).
author_col = stops_tidy.apply(find_author, axis=1)
if 'Author' in stops_tidy.columns:
    stops_tidy['Author'] = author_col
else:
    stops_tidy.insert(3, 'Author', author_col)

In [55]:
# Show only the rows whose Translator is 'Wil'.
is_wil = stops_tidy['Translator'] == 'Wil'
stops_tidy.loc[is_wil]

Unnamed: 0,Translator,Chunk,Work,Author,atqui,aut,autem,certe,ceu,confestim,...,sic,sicut,siquidem,tamquam,ut,utique,uelut,ueluti,uero,uidelicet
11,Wil,ï»¿earum que sunt in animalibus partium hee qu...,His,Aristoteles,0.0,0.048741,0.828591,0.0,0.0,0.000000,...,0.000000,0.060926,0.000000,0.000000,0.170592,0.024370,0.012185,0.0,0.000000,0.0
12,Wil,utroque participant et gregalium autem et soli...,His,Aristoteles,0.0,0.146733,0.843717,0.0,0.0,0.000000,...,0.000000,0.009171,0.000000,0.000000,0.165075,0.000000,0.018342,0.0,0.000000,0.0
13,Wil,ex parte a principio aliud autem alimentum ei ...,His,Aristoteles,0.0,0.082154,0.798069,0.0,0.0,0.011736,...,0.011736,0.000000,0.000000,0.000000,0.187781,0.000000,0.000000,0.0,0.011736,0.0
14,Wil,differunt et tota aut in has quidem habere has...,His,Aristoteles,0.0,0.218315,0.903028,0.0,0.0,0.000000,...,0.019847,0.009923,0.000000,0.019847,0.049617,0.009923,0.019847,0.0,0.000000,0.0
15,Wil,ligata quemadmodum blesis et balbis est autem ...,His,Aristoteles,0.0,0.018858,0.914606,0.0,0.0,0.000000,...,0.009429,0.018858,0.000000,0.000000,0.047145,0.009429,0.018858,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141,Wil,igni econverso autem dicere mediis est extremi...,Cael,Aristoteles,0.0,0.041073,0.753001,0.0,0.0,0.013691,...,0.027382,0.000000,0.013691,0.000000,0.041073,0.095836,0.000000,0.0,0.000000,0.0
1142,Wil,ï»¿de commento procli super timeum platonis su...,InTim,Proclus,0.0,0.000000,0.686544,0.0,0.0,0.000000,...,0.054924,0.000000,0.000000,0.000000,0.247156,0.082385,0.054924,0.0,0.000000,0.0
1143,Wil,autem contactus secundum quem attingimus diuin...,InTim,Proclus,0.0,0.111221,0.711814,0.0,0.0,0.000000,...,0.155709,0.022244,0.000000,0.000000,0.244686,0.022244,0.000000,0.0,0.000000,0.0
1144,Wil,per se a partialibus participatur animabus sed...,InTim,Proclus,0.0,0.037107,0.723588,0.0,0.0,0.000000,...,0.074214,0.018554,0.000000,0.018554,0.204089,0.074214,0.018554,0.0,0.018554,0.0


In [56]:
# Write the tidy table, including its index, to CSV.
stops_tidy.to_csv(path_or_buf='../data/arist_corpus.csv', index=True)