In [1]:
import xml.etree.ElementTree as ET
import time
import numpy as np

In [2]:
# timestamp with start time
timestamp_start = time.time()

In [3]:
# parse xml into python element
tree = ET.parse('cora-all-id.xml')
root = tree.getroot()

# threshold for similarity calculation
threshold = 0.35

In [4]:
def getDiff(attributes_1, attributes_2):
    sim = 0
    for att_1 in attributes_1:
        for att_2 in attributes_2:
            if(att_1==att_2):
                sim += 1
    delta = len(attributes_1) - sim 
    return delta

In [5]:
# modified version of hamming distance
def calculateSimilarity(pub_comp, pub):
    diff = 0
    l_pub = len(pub)
    l_pub_comp = len(pub_comp)
    if(l_pub_comp < l_pub):
        diff = getDiff(pub, pub_comp)
    elif(l_pub_comp > l_pub):
        diff = getDiff(pub_comp, pub)
    elif(l_pub_comp == l_pub):
        diff = getDiff(pub_comp, pub)
    return diff

In [6]:
publications = []   # list with the author ids for each publication
list_ids = []       # list with the ids of the publications

# first loop to store the relevant data into lists
for child in root:
    list_local_authors = []
    title = []
    for element in child:
        if(element.tag == 'author'): # store the author ids into the list
            list_local_authors.append(element.attrib['id']) # collect all authors in a local list first 
        elif(element.tag == 'venue'):
            for ven in element:
                for item in ven:
                    list_local_authors.append(item.text)
        elif(element.tag == 'title'):
            title.append(element.text)
    if(title != ""):
        full_title = ''.join(title)
        list_local_authors.append(full_title)
    list_ids.append(child.attrib['id']) # store the id of the current publication in a list
    publications.append(list_local_authors) # append the list with the collected authors to the publications

In [7]:
# GOLD STANDART
#--------------
golden_duplicates = []
# first loop to store the relevant data into lists
pos_comp = 0
for child in root:
    pos = 0
    for child_comp in root:
        if(child_comp.attrib['id'] == child.attrib['id'] and pos != pos_comp):
            golden_duplicates.append((pos_comp,pos))
        pos += 1
    pos_comp += 1

In [8]:
#FIND DUPLICATES
#---------------
duplicates = [] # list with the publication ids of the duplicates

for j in range(0, len(publications)):
    list_buffer = [] # collect all duplicates in the local list first
    for i in range(0, len(publications)):
        diff = calculateSimilarity(publications[j], publications[i])
        if( diff < ( threshold* len(publications[j])) and i != j ):
            duplicates.append((j, i)) # add tuple of pair to evaluation list

In [9]:
true_positive = 0
false_positive = 0
false_negative = 0
tp = False
timestamp2 = time.time() - timestamp_start
for dup in duplicates:
    for i in range(len(golden_duplicates)):
        gold = golden_duplicates[i]
        if gold == dup:
            true_positive += 1
            del golden_duplicates[i]
            tp = True
            break
    if(not tp):
        false_positive += 1
for gold in golden_duplicates:
    if gold not in duplicates:
        false_negative += 1
timestamp2_end = (time.time() - timestamp2) / 1000
print("hat gedauert: " + str(timestamp2_end))

hat gedauert: 1528799.948895261


In [10]:
print("true pos: " + str(true_positive))
print("false pos: " + str(false_positive))
precision = true_positive / (true_positive + false_positive)
recall = true_positive / (true_positive + false_negative)
f1_score = 2*true_positive / (2*true_positive + false_positive + false_negative)

print("Evaluation")
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1 Score: " + str(f1_score))

########################################
# PRINTING
print("#### gold-standart duplicates ###")
for i in range(17):    
    print(golden_duplicates[i])

print("#### retrieved duplicates ###")
for i in range(17):
    print(duplicates[i])

print("#######################################")
print(str(len(golden_duplicates)) + " #duplicates in gold-standart"  )
print(str(len(duplicates)) + " #retrieved duplicates " )
print("#######################################")

# timestamp for termination time
timestamp_end = time.time() - timestamp_start
print("Calculation Finished")
print('Time passed: ' + str(timestamp_end))
print("#######################################")

true pos: 56904
false pos: 0
Evaluation
Precision: 1.0
Recall: 0.3944818024263432
F1 Score: 0.5657754755063285
#### gold-standart duplicates ###
(11, 12)
(11, 13)
(12, 11)
(13, 11)
(15, 17)
(16, 17)
(17, 15)
(17, 16)
(18, 19)
(18, 20)
(18, 21)
(19, 18)
(19, 22)
(20, 18)
(20, 22)
(21, 18)
(21, 22)
#### retrieved duplicates ###
(0, 1)
(0, 2)
(0, 3)
(0, 4)
(0, 5)
(0, 6)
(0, 7)
(1, 0)
(1, 2)
(1, 3)
(1, 4)
(1, 5)
(1, 6)
(1, 7)
(2, 0)
(2, 1)
(2, 3)
#######################################
87346 #duplicates in gold-standart
91012 #retrieved duplicates 
#######################################
Calculation Finished
Time passed: 1563.265881061554
#######################################
