In [None]:
#This file measures the performance of the query execution, using the other files and their functionalities as imports.
#Based on PerformanceMeasure, same output format but specialized on the inverted index files.
#@author Thorsten

In [None]:
#import and initialize the query execution
import executeQuery_InvertedIndex as eq
import loadDataRaw
import numpy as np
import time #time measurement for the performance measure
import gc #gc.collect() runs garbage collection manually
import csv
#Load the raw corpus. Column 0 is kept as the document id, column 1 is what gets handed to the index.
documents=np.array(loadDataRaw.readFile('train.docs'))
#Do not hand the document names (strings) to the index, because the returned results would become so big
#that only a few hundred of them fit into memory.
#Instead the index gets the row position of every document, and docIDs serves as the lookup table
#to convert a position back into the document name later on.
docIDs=documents[:,0]
docPositions=np.arange(len(documents))
#alternative with the string ids: eq.initExecution(documents[:,1], documents[:,0])
eq.initExecution(documents[:,1], docPositions)

In [None]:
#for testing, not needed later
#query=('This is a test!')
#query=('how contaminated are our children ?')
#next query: Original from query document, so indexed
#query='statin breast cancer survival nationwide cohort study finland abstract recent studies suggested statins established drug group prevention cardiovascular mortality delay prevent breast cancer recurrence effect disease-specific mortality remains unclear evaluated risk breast cancer death statin users population-based cohort breast cancer patients study cohort included newly diagnosed breast cancer patients finland num num num cases identified finnish cancer registry information statin diagnosis obtained national prescription database cox proportional hazards regression method estimate mortality statin users statin time-dependent variable total num participants statins median follow-up num years diagnosis range num num years num participants died num num due breast cancer adjustment age tumor characteristics treatment selection post-diagnostic pre-diagnostic statin lowered risk breast cancer death hr num num ci num num hr num num ci num num risk decrease post-diagnostic statin affected healthy adherer bias greater likelihood dying cancer patients discontinue statin association dose-dependent observed low-dose/short-term dose time-dependence survival benefit pre-diagnostic statin users suggests causal effect evaluated clinical trial testing statins effect survival breast cancer patients '
#timeBefore=time.perf_counter()
#result=eq.executeQuery(query, sort=False)
#timeAfter=time.perf_counter()
#print(timeAfter-timeBefore)
#print('min: ', np.min(result[1]))
#print('max: ', np.max(result[1]))
#print('avg: ', np.average(result[1]))
#print('len: ', len(result[1]))

In [None]:
#Load the query file; exactly one of the following lines should be active.
#queryTable=np.array(loadDataRaw.readFile('train.nontopic-titles.queries'))
#queryTable=np.array(loadDataRaw.readFile('train.vid-desc.queries'))
#queryTable=np.array(loadDataRaw.readFile('train.vid-titles.queries'))
#queryTable=np.array(loadDataRaw.readFile('train.all.queries'))
queryTable=np.array(loadDataRaw.readFile('train.titles.queries'))

#queryTable=np.array(loadDataRaw.readFile('train.titles.queries')[0:10]) #subset for testing
#split the table column-wise: column 0 -> ids, column 1 -> queries
ids, queries=queryTable[:,0], queryTable[:,1]

In [None]:
#Run every query once and keep its wall-clock runtime together with the returned hits.
queryCount=len(queries)
runtimes=np.empty(queryCount) #seconds per query
resultIDs=np.empty(queryCount, dtype=object) #one entry per query, taken from result[0]
resultSims=np.empty(queryCount, dtype=object) #one entry per query, taken from result[1]
print('Execute ', queryCount, ' queries...')
for pos, queryText in enumerate(queries):
    started=time.perf_counter()
    hits=eq.executeQuery(queryText, sort=False)
    runtimes[pos]=time.perf_counter()-started
    resultIDs[pos]=hits[0]
    resultSims[pos]=hits[1]
    #the garbage collection runs outside of the timed section, it only keeps the memory usage down
    gc.collect()
print('Done!')

In [None]:
#postprocessing
#some statistics; runtimes holds differences of time.perf_counter(), i.e. seconds
print('Complete runtime in s:\t', sum(runtimes))
print('Min runtime in ms:\t', min(runtimes)*1000)
print('Max runtime in ms:\t', max(runtimes)*1000)
print('Avg runtime in ms:\t', np.average(runtimes)*1000)
#the variance carries squared units, so converting s² to ms² needs the factor 1000 twice
print('Variance o² in ms²:\t', np.var(runtimes)*1000*1000)
print('Std o in ms:\t\t', np.std(runtimes)*1000)
#tidy up to avoid running out of memory
#del runtimes
#del generateDTM
#gc.collect() #manual run of garbage collection

In [None]:
#export the short version without the gold standard values
#resultIDs holds positions into docIDs (the index was initialised with positions instead of names),
#so every position is translated back to the document id before it is written to the DOC_ID column.
print("Start writing the short result version to disk.")
#newline='' leaves the line endings to the csv writer, so the '\n' terminator is written unchanged on every platform
with open('Results_short.csv', 'w', encoding='utf-8', newline='') as csvOutput: #change file name as needed
    writer=csv.writer(csvOutput, lineterminator='\n', delimiter='\t')
    #headers
    writer.writerow(['QUERY_ID', 'DOC_ID', 'sim_results'])
    for queryID, hitPositions, hitSims in zip(ids, resultIDs, resultSims): #for every query id...
        for position, sim in zip(hitPositions, hitSims): #...write one row per returned document
            writer.writerow([queryID, docIDs[position], sim])
print("Completed writing the short result version.")

In [None]:
#Load the gold standard (relevance judgements) and drop its second column.
qrelTable=np.array(loadDataRaw.readFile('train.3-2-1.qrel'))
goldStandard=np.delete(qrelTable, 1, axis=1) #now [queryID][docID][relevance]

In [None]:
#Prepare the comparison: build the result matrix.
#One row per relevance level 0..3, columns are: sum, count, min, max.
#min starts at 1 and max at -1 so that the first recorded similarity replaces them
#(presumably the similarities lie within [-1, 1] - TODO confirm).
mat=np.tile([0.0, 0.0, 1.0, -1.0], (4, 1))

In [None]:
#Bring the query results and the gold standard into one common order before they are joined.
def sortHitsByDocID(positions, sims, lookup):
    """Return the hits of one query ordered by their document id.

    positions -- array of positions into lookup, one per hit
    sims      -- array of similarities, parallel to positions
    lookup    -- array that maps a position to its document id

    The sort key is the looked-up document id and not the position itself, because the
    gold standard is ordered by document id and both sides have to agree on the order.
    """
    names=lookup[np.asarray(positions, dtype=np.intp)] #intp cast keeps an empty hit list usable as an index
    order=np.argsort(names, kind='mergesort')
    return positions[order], sims[order]

gc.collect()
#1) everything that is stored per query gets ordered by the query id
idOrder=np.argsort(ids)
ids=ids[idOrder]
resultIDs=resultIDs[idOrder]
resultSims=resultSims[idOrder]
runtimes=runtimes[idOrder]
del idOrder

#2) the hits inside every query get ordered by document id
for i in range(len(ids)):
    resultIDs[i], resultSims[i]=sortHitsByDocID(resultIDs[i], resultSims[i], docIDs)

#3) gold standard: order by document id first, then stable by query id -> ordered by (queryID, docID)
goldStandard=goldStandard[goldStandard[:,1].argsort()]
goldStandard=goldStandard[goldStandard[:,0].argsort(kind='mergesort')]
gc.collect()

In [None]:
#Prepare the rows that are written to the csv file later.
#file collects one list per csv row; the first row holds the column headers.
file=[['queryID', 'queryRuntime', 'docResultID', 'resultSimilarity', 'goldStandardValue']]

In [None]:
#Now join the query results with the gold standard, fill the matrix and the file variable.
#Remember you must use the lookup table docIDs for the resultIDs.
#The join uses a lookup table (queryID, docID) -> relevance instead of walking both lists with
#one shared pointer: a gold standard entry whose document is missing from the result set can
#then no longer hide the gold standard entries that come after it.
goldLookup={(row[0], row[1]): row[2] for row in goldStandard}
matchedKeys=set() #gold standard entries that were found in the results
for i in range(len(ids)): #for every query id...
    for j in range(len(resultIDs[i])): #...and every document returned for it
        docName=docIDs[resultIDs[i][j]]
        sim=resultSims[i][j]
        key=(ids[i], docName)
        relevance=goldLookup.get(key)
        if relevance is None: #search result not in the gold standard, meaning it has importance 0
            relevance='0'
        else: #match
            matchedKeys.add(key)
        level=int(relevance)
        mat[level][0]+=sim
        mat[level][1]+=1
        mat[level][2]=min(mat[level][2], sim)
        mat[level][3]=max(mat[level][3], sim)
        file.append([ids[i], runtimes[i], docName, sim, relevance])
#every gold standard entry that never showed up in the results counts as not found
notFound=sum(1 for row in goldStandard if (row[0], row[1]) not in matchedKeys)

In [None]:
#Report how the similarities are distributed over the relevance levels.
print('Not found from the gold standard (errors): ', notFound)
print('Result performance value matrix:')
print('\t\tsum, count, min, max')
#one output line per relevance level; the average is sum divided by count of that row
levelLabels=('importance 0:\t', 'importance 1:\t', 'importance 2:\t', 'importance 3:\t')
for levelLabel, levelRow in zip(levelLabels, mat):
    print(levelLabel, levelRow, 'Avg: ', levelRow[0]/levelRow[1])

In [None]:
#Write the results to disk
#del runtimes, ids, resultIDs, resultSims, goldStandard

#export
print("Start writing the results to disk.")
#newline='' leaves the line endings to the csv writer, so the '\n' terminator is written unchanged on every platform
with open('Results.csv', 'w', encoding='utf-8', newline='') as csvOutput: #change file name as needed
    writer=csv.writer(csvOutput, lineterminator='\n', delimiter='\t')
    writer.writerows(file) #the first row of file is the header, every further row is one result
print("Completed writing the results.")