In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt

import os
import string
import glob
import re
import random

from collections import defaultdict, Counter

from sklearn.feature_selection import mutual_info_classif

In [2]:
# Read the tidy table from disk, using the first CSV column as the row index.
stops_tidy = pd.read_csv(
    '../data/stops_tidy.csv',
    index_col=0,
)


In [3]:
# Keep only the rows whose Translator label does not contain 'Myst'.
df = stops_tidy.loc[~stops_tidy['Translator'].str.contains('Myst')]
df

Unnamed: 0,Translator,Work,Chunk,atqui,aut,autem,certe,ceu,confestim,cum,...,sic,sicut,siquidem,tamquam,ut,utique,velut,veluti,vero,videlicet
0,Bur,Com,quoniam quidem ex calido et frigido et sicco e...,0.0,0.034605,0.190328,0.0,0.0,0.0,0.051908,...,0.000000,0.000000,0.000000,0.0,0.051908,0.077862,0.008651,0.0,0.077862,0.0
1,Bur,Com,sermo non quod nunquam fit in uno eodemque cor...,0.0,0.079463,0.229559,0.0,0.0,0.0,0.061804,...,0.000000,0.000000,0.008829,0.0,0.088292,0.123609,0.008829,0.0,0.026488,0.0
2,Bur,Com,horis anni invenire quartam coniugationem comp...,0.0,0.091264,0.228161,0.0,0.0,0.0,0.000000,...,0.018253,0.000000,0.000000,0.0,0.100391,0.054759,0.018253,0.0,0.091264,0.0
3,Bur,Com,quod necesse est in ea putrefieri omnia incipi...,0.0,0.114119,0.333577,0.0,0.0,0.0,0.017557,...,0.000000,0.000000,0.000000,0.0,0.131675,0.079005,0.000000,0.0,0.017557,0.0
4,Bur,Com,de ipsis per capitula quantum ad presentia uti...,0.0,0.371354,0.234115,0.0,0.0,0.0,0.040365,...,0.000000,0.000000,0.000000,0.0,0.226042,0.121094,0.000000,0.0,0.040365,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,Bur,Fid,ostendat quod secundum veritatem est homo cum ...,0.0,0.008708,0.191573,0.0,0.0,0.0,0.060955,...,0.000000,0.000000,0.000000,0.0,0.208988,0.008708,0.000000,0.0,0.008708,0.0
1172,Bur,Fid,causative dicere ut hoc tibi soli peccavi et p...,0.0,0.000000,0.162756,0.0,0.0,0.0,0.019148,...,0.000000,0.000000,0.000000,0.0,0.086165,0.019148,0.009574,0.0,0.028722,0.0
1173,Bur,Fid,condemnavit peccatum in carne ut iustitia legi...,0.0,0.000000,0.229977,0.0,0.0,0.0,0.029997,...,0.000000,0.009999,0.000000,0.0,0.149985,0.019998,0.009999,0.0,0.009999,0.0
1174,Bur,Fid,uxorem suam et concepit et genuit quare propte...,0.0,0.000000,0.170615,0.0,0.0,0.0,0.040145,...,0.000000,0.020072,0.000000,0.0,0.070253,0.010036,0.000000,0.0,0.010036,0.0


In [4]:
# Count how many rows each translator label contributes.
samp_ctr = Counter(df['Translator'])
samp_ctr

Counter({'Bur': 371,
         'Arist': 35,
         'Anon': 66,
         'Bar': 44,
         'Wil': 554,
         'Jam': 87})

In [5]:
# Rows drawn per translator in each balanced resample:
# half the row count of the smallest class (integer division).
SAMPLE_SIZE = min(samp_ctr.values()) // 2


In [6]:
def mi_from_df(df: pd.DataFrame) -> pd.DataFrame:
    """Rank features by one-vs-rest mutual information for every translator.

    For each distinct value of ``df.Translator`` a boolean target
    (this translator vs. everything else) is built and
    ``mutual_info_classif`` is run on all columns other than
    'Translator', 'Work' and 'Chunk'.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Translator', 'Work' and 'Chunk' columns; every other
        column is passed to the estimator as a feature.

    Returns
    -------
    pd.DataFrame
        One row per (translator, feature) with columns 'translator', 'word',
        'mi' and 'odds'. Rows for each translator are ordered by descending
        'mi'. 'odds' is the feature's mean among that translator's rows
        divided by its mean over all rows of ``df`` (a frequency ratio used
        only as a guide to direction).
    """
    # Build the feature matrix once and take the names from it, so names and
    # MI scores stay aligned wherever the metadata columns sit in ``df``.
    features = df.drop(['Translator', 'Work', 'Chunk'], axis=1)
    feature_names = list(features.columns)

    # Corpus-wide means do not depend on the translator; compute them once.
    overall_mean = {w: df[w].mean() for w in feature_names}

    ret = []
    for transl in df.Translator.unique():
        is_transl = df.Translator == transl
        mi = mutual_info_classif(features, is_transl, random_state=42)
        ranked = sorted(zip(mi, feature_names), key=lambda pair: pair[0], reverse=True)
        transl_rows = df[is_transl]
        for score, w in ranked:
            odds = transl_rows[w].mean() / overall_mean[w]
            ret.append({'translator': transl, 'word': w, 'mi': score, 'odds': odds})
    return pd.DataFrame(ret)

In [7]:
def bootstrap_mi(df: pd.DataFrame, n: int=100, sample_size=None, random_state=None) -> pd.DataFrame:
    """Average one-vs-rest mutual information over balanced resamples.

    Each of the ``n`` rounds draws ``sample_size`` rows (without replacement)
    from every translator, then computes a one-vs-rest MI vector per
    translator with ``mutual_info_classif``. The per-translator vectors are
    averaged across rounds and ranked.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Translator', 'Work' and 'Chunk' columns; every other
        column is passed to the estimator as a feature.
    n : int, default 100
        Number of balanced resamples. Must be at least 1.
    sample_size : int, optional
        Rows drawn per translator in each round. Defaults to the
        module-level ``SAMPLE_SIZE``.
    random_state : int or numpy.random.RandomState, optional
        Seed (or generator) shared by the row sampling and the MI estimator.
        ``None`` (the default) leaves both on NumPy's global random state,
        as before.

    Returns
    -------
    pd.DataFrame
        One row per (translator, feature) with columns 'translator', 'word',
        'mi' (mean MI over the rounds) and 'odds'. Rows for each translator
        are ordered by descending 'mi'. 'odds' is computed on the full
        ``df``, not on the resamples: the feature's mean among that
        translator's rows divided by its mean over all rows.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    if sample_size is None:
        sample_size = SAMPLE_SIZE

    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Take feature names from the feature frame itself so they stay aligned
    # with the MI vectors wherever the metadata columns sit in ``df``.
    feature_names = list(df.drop(['Translator', 'Work', 'Chunk'], axis=1).columns)

    # for each bootstrap balanced sample, calculate an MI vector as before
    raw_arrays = defaultdict(list)
    for _ in range(n):
        # Sample each translator's rows explicitly; iterating the groupby
        # keeps the 'Translator' column in every group.
        samp = pd.concat(
            [grp.sample(sample_size, random_state=rng) for _, grp in df.groupby('Translator')],
            ignore_index=True,
        )
        samp_features = samp[feature_names]
        for transl in samp.Translator.unique():
            mi = mutual_info_classif(samp_features, samp.Translator == transl, random_state=rng)
            raw_arrays[transl].append(mi)

    # Now use the average bootstrapped MI, but calculate relative odds vs the
    # entire corpus (it's just a guide to whether the word is more or less common)
    overall_mean = {w: df[w].mean() for w in feature_names}
    ret = []
    for transl in df.Translator.unique():
        mi = np.array(raw_arrays[transl]).mean(axis=0)
        ranked = sorted(zip(mi, feature_names), key=lambda pair: pair[0], reverse=True)
        transl_rows = df[df.Translator == transl]
        for score, w in ranked:
            odds = transl_rows[w].mean() / overall_mean[w]
            ret.append({'translator': transl, 'word': w, 'mi': score, 'odds': odds})

    return pd.DataFrame(ret)

Because the classes are so imbalanced, the ranking we get here appears to be driven mainly by whatever best distinguishes Wil from Bur (the two largest classes).

In [8]:
ranked_words = mi_from_df(df)
# Show the five highest-MI words for each translator.
(
    ranked_words
    .groupby('translator')
    .head(5)
    .reset_index(drop=True)
)

Unnamed: 0,translator,word,mi,odds
0,Bur,rursus,0.333401,3.003095
1,Bur,sic,0.275262,0.14811
2,Bur,ita,0.242001,2.245316
3,Bur,iterum,0.195268,0.027939
4,Bur,quoniam,0.184171,2.052034
5,Arist,sane,0.128727,30.978305
6,Arist,quoque,0.110303,27.461071
7,Arist,nempe,0.10917,26.72387
8,Arist,quippe,0.086353,22.206966
9,Arist,certe,0.077413,24.297268


In any case, the bootstrapped version seems more reasonable from a theoretical standpoint, since each resample is balanced across translators. Open question: are we averaging over too many things?

In [9]:
# The resampling inside bootstrap_mi is random; seed NumPy's global RNG so
# this cell shows the same table on every run.
np.random.seed(42)
(
    bootstrap_mi(df, 100)
    .groupby('translator')
    .head(5)
    .reset_index(drop=True)
)

Unnamed: 0,translator,word,mi,odds
0,Bur,rursus,0.188757,3.003095
1,Bur,sic,0.142046,0.14811
2,Bur,denique,0.131961,3.095528
3,Bur,iterum,0.106568,0.027939
4,Bur,ergo,0.102864,0.172742
5,Arist,sane,0.384716,30.978305
6,Arist,quoque,0.378811,27.461071
7,Arist,nempe,0.353067,26.72387
8,Arist,quippe,0.331519,22.206966
9,Arist,etiam,0.266128,7.266679
