In [1]:
import pandas as pd
import numpy as np
import math
from scipy.stats import rankdata
import re
from cltk.stem.latin.j_v import JVReplacer

In [2]:
# Shared CLTK JVReplacer; its replace() is applied to every lowercased phrase
# (dataset and Tesserae text alike) before the word comparison further down.
# Per the CLTK docs it normalizes j -> i and v -> u -- confirm against the installed version.
replacer = JVReplacer()

In [3]:
# Intertext dataset. The columns read later in this notebook are 'Query Phrase',
# 'Result Phrase', 'Intertext: Author', 'Intertext: Work', 'Intertext: Book'
# and 'Intertext: Line Start'.
df = pd.read_csv('../data/datasets/vf_intertext_dataset_1_0.csv')

In [4]:
# Directory prefix for the Tesserae result files; each file is opened as
# f'{tess_path}tesresults{author}.csv'.
tess_path = '../data/tesseraeresults/'
# Author labels: used both to build the file name and as the substring that
# selects dataset rows for that author.
tess_results = ['Lucan', 'Ovid', 'Statius', 'Vergil']

In [5]:
# Per-author accumulators: each receives one entry per author of tess_results,
# appended at the end of every loop iteration.
vfranks_all = []   # ranks of the Tesserae rows that matched a dataset entry
counter_all = []   # number of dataset entries attributed to the author
count_all = []     # number of dataset entries examined
n_all = []         # number of matches found (duplicate hits included)
P1_all = []        # normalized TARGET_TXT strings of the author's Tesserae results

for result_author in tess_results:
    results_file = f'{tess_path}tesresults{result_author}.csv'

    # read Tesserae results (Author vs. VF1) into dataframe
    df_tesserae = pd.read_csv(results_file)

    # Rank rows by SCORE with the highest score as rank 1; tied scores share
    # the lowest rank of their group (method='min'). Ranking the negated
    # scores is correct for any row order and any tie pattern. The previous
    # approach ranked the reversed column and assigned the result positionally,
    # which is only right for sorted scores whose ties happen to be symmetric
    # (e.g. scores [10, 10, 9] were ranked [1, 2, 2] instead of [1, 1, 3]).
    ranks = rankdata(-df_tesserae['SCORE'].to_numpy(), method='min')
    df_tesserae['RANK'] = ranks

    # Parallel per-dataset-row lists, filled further down in this iteration.
    listOfWords=[]    # tokens of the normalized 'Query Phrase'
    listOfWords2=[]   # tokens of the normalized 'Result Phrase'
    listOfBooks = []  # one-element lists holding 'Intertext: Book'
    linenums = []     # one-element lists holding 'Intertext: Line Start'
    ref = []          # "<author> <work> <book>.<line>" reference strings

    # Parser for the standardized VF "Discovered reference" column value.
    # Yields the book number and the line number, each wrapped in a list.
    def vfprocesslinenums(reference):
        """Extract ``([book], [line])`` from a "Discovered reference" string.

        The text is cut five characters after the start of "Part", then again
        just after the next space; what remains is split on "." and its first
        two fields are converted to integers.

        Raises ValueError when "Part" or the following space is missing or a
        field is not an integer, and IndexError when there is no "." separator.
        """
        after_part = reference[reference.index("Part") + 5:]
        locus = after_part[after_part.index(" ") + 1:]
        fields = locus.split(".")
        return [int(fields[0])], [int(fields[1])]
    
    def processlinenums(reference):
        """Parse the first ``book.line`` pair out of a location string.

        Characters are scanned left to right. Digits seen before the first
        "." that follows a digit are collected as the book number; digits
        after that "." are collected as the line number. Scanning stops at
        the first "-", so only the start of a line range is read. All
        non-digit characters other than that separator are ignored.

        Parameters
        ----------
        reference : str
            Location text, e.g. ``"luc. 1.33"`` or ``"aen. 12.5-12.6"``.

        Returns
        -------
        tuple of (list, list)
            ``([book], [line])`` as one-element integer lists, or ``([], [])``
            when either number could not be found.
        """
        # Scanner state: 0 = no digit seen yet, 1 = reading the book number,
        # 2 = reading the line number.
        bookdone = 0
        book = ""
        line = ""
        for char in reference:
            # Values are compared with ==; the previous `is` comparisons only
            # worked through CPython's interning of small ints and short
            # strings and raise SyntaxWarning on Python 3.8+.
            if char == '-':
                break
            elif char == '.' and bookdone == 1:
                bookdone = 2
            elif char.isdigit():
                if bookdone == 0:
                    bookdone = 1
                if bookdone == 1:
                    book += char
                else:
                    line += char
        if not book or not line:
            return [], []
        return [int(book)], [int(line)]
    
    # Normalize every dataset row into the pieces used for matching.
    for _, row in df.iterrows():
        listOfWords.append(replacer.replace(row['Query Phrase'].lower()).split())
        listOfWords2.append(replacer.replace(row['Result Phrase'].lower()).split())
        listOfBooks.append([row['Intertext: Book']])
        linenums.append([row['Intertext: Line Start']])
        ref.append(f"{row['Intertext: Author']} {row['Intertext: Work']} {row['Intertext: Book']}.{row['Intertext: Line Start']}")

    # combined list: one [query words, result words, [book], [line], reference] per dataset row
    listsCombined = [
        list(entry)
        for entry in zip(listOfWords, listOfWords2, listOfBooks, linenums, ref)
    ]

    # generate list of phrases: parallel per-Tesserae-row lists
    P1=[]  # normalized TARGET_TXT
    P2=[]  # normalized SOURCE_TXT
    P3=[]  # [book] parsed from SOURCE_LOC ([] when unparseable)
    P4=[]  # [line] parsed from SOURCE_LOC ([] when unparseable)
    P5=[]  # RANK of the row
    for _, row in df_tesserae.iterrows():
        source_book, source_line = processlinenums(row['SOURCE_LOC'])
        P1.append(replacer.replace(row['TARGET_TXT'].lower()))
        P2.append(replacer.replace(row['SOURCE_TXT'].lower()))
        P3.append(source_book)
        P4.append(source_line)
        P5.append(row['RANK'])

    def _marked_text(phrase):
        """Return the first two **-delimited spans of ``phrase``, concatenated.

        Raises ValueError when the phrase has fewer than two complete
        ``**...**`` spans.
        """
        first_start = phrase.index("**") + 2
        first_end = phrase.index("**", first_start)
        second_start = phrase.index("**", first_end + 2) + 2
        second_end = phrase.index("**", second_start)
        return phrase[first_start:first_end] + phrase[second_start:second_end]

    # The highlighted text depends only on the Tesserae row, so extract it once
    # here rather than once per (dataset entry, Tesserae row) pair.
    target_marked = [_marked_text(phrase) for phrase in P1]
    source_marked = [_marked_text(phrase) for phrase in P2]

    # For each dataset entry attributed to this author, find the Tesserae rows
    # whose highlighted target text contains every query word, whose highlighted
    # source text contains every result word, whose book equals the entry's book
    # and whose line is within one of the entry's line.
    # n counts every hit, duplicates included.
    n = 0

    trackDuplicates = []
    vfranks = []
    counter = 0
    count = 0
    # NOTE: this path is reopened in 'w' mode on every author iteration, so once
    # the outer loop finishes the file holds only the last author's records.
    with open("temp/tesserae_comparison_results.txt", 'w', encoding="utf-8") as newf:
        for entry in listsCombined:
            count += 1
            query_words, result_words, entry_book, entry_line, entry_ref = entry
            if result_author not in entry_ref:
                continue
            counter += 1
            record = "0"  # written as-is when no Tesserae row matches
            for target_text, source_text, books, lines, rank in zip(
                    target_marked, source_marked, P3, P4, P5):
                # Rows whose SOURCE_LOC could not be parsed have no book/line
                # to compare, so they can never match.
                if not books or not lines:
                    continue
                # Compare by value: `is not` on numbers depends on CPython's
                # small-integer cache and fails for NumPy scalars or floats.
                if entry_book[0] != books[0]:
                    continue
                if abs(entry_line[0] - lines[0]) > 1:
                    continue
                if not all(word.lower() in target_text for word in query_words):
                    continue
                if not all(word.lower() in source_text for word in result_words):
                    continue
                n += 1
                record = str(rank)  # the last matching row's rank is the one written
                trackDuplicates.append(entry)
                vfranks.append(rank)
            newf.write(record + "\n")

    vfranks_all.append(vfranks)
    counter_all.append(counter)
    count_all.append(count)
    n_all.append(n)
    P1_all.append(P1)

In [6]:
# Number of Tesserae result rows per author (length of each collected P1 list).
P1_lens = list(map(len, P1_all))

In [7]:
# One (author, matches, dataset entries for author, Tesserae rows) tuple per author.
# Materialised as a list: a bare zip() is a one-shot iterator, so re-running the
# DataFrame cell without re-running this one would silently build an empty table.
results_data = list(zip(tess_results, n_all, counter_all, P1_lens))

In [8]:
# Per-author summary table, indexed by author name.
results_df = pd.DataFrame(
    results_data,
    columns=['Author', 'Recovered (VF)', 'Total (VF)', 'Total Results'],
).set_index('Author')

In [9]:
# recall    = recovered / dataset entries for the author
# precision = recovered / Tesserae result rows for the author
results_df = results_df.assign(
    recall=results_df['Recovered (VF)'] / results_df['Total (VF)'],
    precision=results_df['Recovered (VF)'] / results_df['Total Results'],
)

In [10]:
# Display the per-author table (counts plus recall and precision).
results_df

Unnamed: 0_level_0,Recovered (VF),Total (VF),Total Results,recall,precision
Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Lucan,48,150,6537,0.32,0.007343
Ovid,52,150,9549,0.346667,0.005446
Statius,35,124,7564,0.282258,0.004627
Vergil,185,521,9392,0.355086,0.019698


In [11]:
# Single-row 'Total' table: sum the count columns over all authors, then derive
# recall and precision from those summed counts.
total_df = (
    results_df[['Recovered (VF)', 'Total (VF)', 'Total Results']]
    .sum()
    .to_frame('Total')
    .T
    .assign(
        recall=lambda totals: totals['Recovered (VF)'] / totals['Total (VF)'],
        precision=lambda totals: totals['Recovered (VF)'] / totals['Total Results'],
    )
)
total_df

Unnamed: 0,Recovered (VF),Total (VF),Total Results,recall,precision
Total,320,945,33042,0.338624,0.009685
