In [2]:
from preproc import preprocess_and_slice_text_files
import logging
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Lower the root logger's threshold to INFO so that INFO-level records (such as
# the per-file progress lines shown when the corpus is loaded) are displayed.
logging.getLogger().setLevel(logging.INFO)

In [4]:
# Read and slice every text file under ../data/txt. Further down the result is
# iterated with .items() and each value is joined with spaces, so it is used as
# a mapping from a chunk key to a sequence of strings.
# NOTE(review): presumably keys look like '<FileStem>_<chunk number>' — confirm in preproc.
raw_chunks = preprocess_and_slice_text_files('../data/txt')

INFO:root:'BurCom.txt' was processed and split into 24 samples
INFO:root:'AristPha.txt' was processed and split into 20 samples
INFO:root:'BurInt.txt' was processed and split into 64 samples
INFO:root:'AnonMetaph.txt' was processed and split into 61 samples
INFO:root:'BurEthN.txt' was processed and split into 4 samples
INFO:root:'BarMor.txt' was processed and split into 20 samples
INFO:root:'AristMet.txt' was processed and split into 6 samples
INFO:root:'BarMirab.txt' was processed and split into 7 samples
INFO:root:'WilMet.txt' was processed and split into 30 samples
INFO:root:'JamPhys.txt' was processed and split into 53 samples
INFO:root:'WilInPar.txt' was processed and split into 161 samples
INFO:root:'BarMun.txt' was processed and split into 5 samples
INFO:root:'WilTet.txt' was processed and split into 33 samples
INFO:root:'BarSig.txt' was processed and split into 3 samples
INFO:root:'AristPhaP.txt' was processed and split into 0 samples
INFO:root:'BarPri.txt' was processed and sp

In [5]:
# list of lists, so we can keep info about spelling variants in individual arrays
# Each line of the function-word file becomes one inner list holding the
# distinct, lowercased spellings found on that line.

stops = []

with open('../data/functionwords.txt','r') as file:
    for line in file:
        # tokens that are exactly '/' are dropped (they appear to separate the
        # spelling variants on a line)
        words = {x for x in line.lower().split() if x != '/'}
        if not words:
            # blank (or '/'-only) lines would otherwise add an empty group,
            # which has no first entry to use as a column label later on
            continue
        # sorted() rather than list(): set iteration order for strings depends
        # on hash randomisation, so a plain list(set(...)) is not reproducible
        # from one kernel to the next
        stops.append(sorted(words))

print(stops[:3])

[['atqui', 'atqvi'], ['aut', 'avt'], ['autem', 'avtem']]


In [6]:
# The parsing below relies on our filename convention:
# TranslatorWorkNamePossiblySeveralWords.txt

entries = []
for key, pieces in raw_chunks.items():
    # Keep only what precedes the first '_' (the part before the chunk number).
    stem = key.split('_')[0]
    # Splitting on a capturing group keeps each capitalised run as its own
    # item; the filter discards the empty strings re.split leaves in between.
    parts = [p for p in re.split('([A-Z][a-z]*)', stem) if p]
    # First run is the translator, whatever follows is glued back together as
    # the work name.
    translator, work_parts = parts[0], parts[1:]
    entries.append({
        'Translator': translator,
        'Work': ''.join(work_parts),
        'Chunk': ' '.join(pieces),
    })
chunk_df = pd.DataFrame(entries)
chunk_df

Unnamed: 0,Translator,Work,Chunk
0,Bur,Com,quoniam quidem ex calido et frigido et sicco e...
1,Bur,Com,sermo non quod nunquam fit in uno eodemque cor...
2,Bur,Com,horis anni invenire quartam coniugationem comp...
3,Bur,Com,quod necesse est in ea putrefieri omnia incipi...
4,Bur,Com,de ipsis per capitula quantum ad presentia uti...
...,...,...,...
1180,Bur,Fid,ostendat quod secundum veritatem est homo cum ...
1181,Bur,Fid,causative dicere ut hoc tibi soli peccavi et p...
1182,Bur,Fid,condemnavit peccatum in carne ut iustitia legi...
1183,Bur,Fid,uxorem suam et concepit et genuit quare propte...


In [7]:
# Sanity check: list the distinct values of the Work column produced by the
# key parsing above.
chunk_df.Work.unique()

array(['Com', 'Pha', 'Int', 'Metaph', 'EthN', 'Mor', 'Met', 'Mirab',
       'Phys', 'InPar', 'Mun', 'Tet', 'Sig', 'Pri', 'InTim', 'Cael',
       'Gen', 'P', 'His', '6', '5', 'Elem', 'Simp', '4', 'Anim', '1', '3',
       '2', 'GenA', 'Hom', 'Ion', 'Pue', 'Mat', 'Men', 'Alex', 'EthV',
       'Fid'], dtype=object)

In [8]:
# Flatten the per-word variant lists into one list containing every spelling,
# in the same order as they appear in `stops`.
flat_stops = []
for variants in stops:
    flat_stops.extend(variants)

In [9]:
# doing this without IDF really just makes it a normalised CountVectorizer. Not
# using IDF because the frequency 'boosting' is really a form of fitting, and we
# want to save that for the classification algorithms.

# L1 vs L2 normalisation is a methodological question. Since we'll be clustering
# these as 'points' L2 has slightly more theoretical support, but either would
# almost certainly be fine.
v = TfidfVectorizer(use_idf=False, analyzer='word', decode_error='replace',norm='l2')
# only count our stopwords: the vectorizer is fitted on the flat list of
# spelling variants, not on the corpus text.
# NOTE(review): no token_pattern is passed, so the library default applies; per
# the scikit-learn docs that default only keeps tokens of two or more
# characters — confirm the function-word list has no one-letter entries.
v.fit(flat_stops)

def vectorize(s, vectorizer=None, stop_groups=None):
    """Return a one-row DataFrame of function-word weights for the text ``s``.

    The text is transformed with ``vectorizer``; the resulting per-spelling
    columns are then collapsed so that every group of spelling variants
    becomes a single column holding the sum of its variants' values.

    Parameters
    ----------
    s : str
        The text to vectorize.
    vectorizer : optional
        A fitted object exposing ``transform`` and ``get_feature_names_out``.
        Defaults to the notebook-level vectorizer ``v``.
    stop_groups : iterable of iterables of str, optional
        One group of spelling variants per function word. Defaults to the
        notebook-level ``stops``.

    Returns
    -------
    pandas.DataFrame
        A single row with one column per non-empty group, labelled with the
        alphabetically first variant of that group, in group order.
    """
    if vectorizer is None:
        vectorizer = v
    if stop_groups is None:
        stop_groups = stops

    # one row, one column per spelling known to the vectorizer
    counts = pd.DataFrame(
        vectorizer.transform([s]).toarray(),
        columns=vectorizer.get_feature_names_out(),
    )

    # Collect the combined columns first and build the frame once at the end;
    # inserting columns into a DataFrame one at a time fragments it.
    combined = {}
    for group in stop_groups:
        variants = sorted(set(group))
        if not variants:
            # an empty group has no label to use; skip it rather than fail
            continue
        # sum every variant of this word that the vectorizer actually has a
        # column for, and file it under the first (sorted) spelling
        present = counts.columns.intersection(variants)
        combined[variants[0]] = counts[present].sum(axis=1)
    return pd.DataFrame(combined)

In [10]:
# Vectorize each chunk into its own single-row frame, then stack the rows and
# renumber them 0..n-1 so they line up with chunk_df.
chunk_vectors = [vectorize(text) for text in chunk_df['Chunk']]
X = pd.concat(chunk_vectors, axis=0).reset_index(drop=True)
X

Unnamed: 0,atqui,aut,autem,certe,ceu,confestim,cum,dehinc,deinceps,demum,...,sic,sicut,siquidem,tamquam,ut,utique,uelut,ueluti,uero,uidelicet
0,0.0,0.097100,0.534050,0.0,0.0,0.0,0.145650,0.0,0.024275,0.0,...,0.000000,0.000000,0.000000,0.0,0.145650,0.218475,0.024275,0.0,0.218475,0.0
1,0.0,0.200297,0.578636,0.0,0.0,0.0,0.155787,0.0,0.000000,0.0,...,0.000000,0.000000,0.022255,0.0,0.222552,0.311573,0.022255,0.0,0.066766,0.0
2,0.0,0.230022,0.575055,0.0,0.0,0.0,0.000000,0.0,0.023002,0.0,...,0.046004,0.000000,0.000000,0.0,0.253024,0.138013,0.046004,0.0,0.230022,0.0
3,0.0,0.243044,0.710436,0.0,0.0,0.0,0.037391,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.280435,0.168261,0.000000,0.0,0.037391,0.0
4,0.0,0.649435,0.409426,0.0,0.0,0.0,0.070591,0.0,0.014118,0.0,...,0.000000,0.000000,0.000000,0.0,0.395308,0.211772,0.000000,0.0,0.070591,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1180,0.0,0.024500,0.538996,0.0,0.0,0.0,0.171499,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.587995,0.024500,0.000000,0.0,0.024500,0.0
1181,0.0,0.000000,0.421459,0.0,0.0,0.0,0.049583,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.223125,0.049583,0.024792,0.0,0.074375,0.0
1182,0.0,0.000000,0.738485,0.0,0.0,0.0,0.096324,0.0,0.000000,0.0,...,0.000000,0.032108,0.000000,0.0,0.481621,0.064216,0.032108,0.0,0.032108,0.0
1183,0.0,0.000000,0.666795,0.0,0.0,0.0,0.156893,0.0,0.000000,0.0,...,0.000000,0.078446,0.000000,0.0,0.274563,0.039223,0.000000,0.0,0.039223,0.0


In [11]:
# Place the metadata columns (Translator, Work, Chunk) and the function-word
# columns side by side. Rows are matched on index, which works here because X
# was renumbered with reset_index(drop=True) when it was built.
stops_tidy = pd.concat([chunk_df,X],axis=1)
stops_tidy

Unnamed: 0,Translator,Work,Chunk,atqui,aut,autem,certe,ceu,confestim,cum,...,sic,sicut,siquidem,tamquam,ut,utique,uelut,ueluti,uero,uidelicet
0,Bur,Com,quoniam quidem ex calido et frigido et sicco e...,0.0,0.097100,0.534050,0.0,0.0,0.0,0.145650,...,0.000000,0.000000,0.000000,0.0,0.145650,0.218475,0.024275,0.0,0.218475,0.0
1,Bur,Com,sermo non quod nunquam fit in uno eodemque cor...,0.0,0.200297,0.578636,0.0,0.0,0.0,0.155787,...,0.000000,0.000000,0.022255,0.0,0.222552,0.311573,0.022255,0.0,0.066766,0.0
2,Bur,Com,horis anni invenire quartam coniugationem comp...,0.0,0.230022,0.575055,0.0,0.0,0.0,0.000000,...,0.046004,0.000000,0.000000,0.0,0.253024,0.138013,0.046004,0.0,0.230022,0.0
3,Bur,Com,quod necesse est in ea putrefieri omnia incipi...,0.0,0.243044,0.710436,0.0,0.0,0.0,0.037391,...,0.000000,0.000000,0.000000,0.0,0.280435,0.168261,0.000000,0.0,0.037391,0.0
4,Bur,Com,de ipsis per capitula quantum ad presentia uti...,0.0,0.649435,0.409426,0.0,0.0,0.0,0.070591,...,0.000000,0.000000,0.000000,0.0,0.395308,0.211772,0.000000,0.0,0.070591,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1180,Bur,Fid,ostendat quod secundum veritatem est homo cum ...,0.0,0.024500,0.538996,0.0,0.0,0.0,0.171499,...,0.000000,0.000000,0.000000,0.0,0.587995,0.024500,0.000000,0.0,0.024500,0.0
1181,Bur,Fid,causative dicere ut hoc tibi soli peccavi et p...,0.0,0.000000,0.421459,0.0,0.0,0.0,0.049583,...,0.000000,0.000000,0.000000,0.0,0.223125,0.049583,0.024792,0.0,0.074375,0.0
1182,Bur,Fid,condemnavit peccatum in carne ut iustitia legi...,0.0,0.000000,0.738485,0.0,0.0,0.0,0.096324,...,0.000000,0.032108,0.000000,0.0,0.481621,0.064216,0.032108,0.0,0.032108,0.0
1183,Bur,Fid,uxorem suam et concepit et genuit quare propte...,0.0,0.000000,0.666795,0.0,0.0,0.0,0.156893,...,0.000000,0.078446,0.000000,0.0,0.274563,0.039223,0.000000,0.0,0.039223,0.0


In [12]:
# Persist the combined table for the downstream notebooks.
# NOTE(review): to_csv is called without index=False, so the row index is also
# written as a leading unnamed column — confirm readers expect that layout
# before changing it.
stops_tidy.to_csv('../data/corpus.csv')