In [1]:
import pandas as pd

# Input CSV files: the raw records and the manually inspected reference
# that the evaluation below regards as ground truth.
rawData = 'raw.csv'
groundTruth = 'groundTruth.csv'

# CSV files holding the program's fixes, without and with the improvement.
unimprovedFixedData = 'unimproved_result.csv'
improvedFixedData = 'improved_result.csv'

# Columns for which precision and recall are reported.
columns = ['city_name', 'street_name']

In [2]:
def getFixedNum(rawCol, fixedCol):
    """Count the positions at which a non-missing raw value was changed.

    Args:
        rawCol: Iterable of original values; entries for which ``pd.isna``
            is true are skipped.
        fixedCol: Index-addressable sequence of values after fixing, looked
            up at the same position as the raw value.

    Returns:
        The number of positions whose raw value is not missing and compares
        unequal to the value at the same position in ``fixedCol``.
    """
    return sum(
        1
        for position, rawValue in enumerate(rawCol)
        # Short-circuit keeps fixedCol from being indexed for missing raw values.
        if not pd.isna(rawValue) and rawValue != fixedCol[position]
    )

def getCorrectlyFixedNum(gtCol, fixedCol, rawCol):
    """Count the positions where the fixed value matches a changed ground truth.

    A position is counted when its ground-truth value is not missing, differs
    from the raw value, and equals the fixed value. Only the ground-truth
    value is checked with ``pd.isna``; the raw value is compared as-is.

    Args:
        gtCol: Iterable of ground-truth values.
        fixedCol: Index-addressable sequence of values after fixing.
        rawCol: Index-addressable sequence of original values.

    Returns:
        The number of positions satisfying the conditions above.
    """
    correctCount = 0
    for position, truthValue in enumerate(gtCol):
        # No ground truth available at this position.
        if pd.isna(truthValue):
            continue
        # The raw value already matched the ground truth; nothing to fix.
        if truthValue == rawCol[position]:
            continue
        if truthValue == fixedCol[position]:
            correctCount += 1
    return correctCount

In [3]:
# Read the four CSV files in the same order as before (raw, improved,
# unimproved, ground truth), each with its first row taken as the header.
raw_DF, improved_DF, unimproved_DF, groundTruth_DF = [
    pd.read_csv(csvPath, header=0)
    for csvPath in (rawData, improvedFixedData, unimprovedFixedData, groundTruth)
]

In [4]:
for column in columns:
    # The raw and ground-truth values are the same for both evaluations, so
    # convert them (and count the ground-truth fixes) once per column.
    rawValues = raw_DF[column].to_list()
    groundTruthValues = groundTruth_DF[column].to_list()
    # Number of all records fixed by manual inspection (regarded as ground truth)
    numRFMI = getFixedNum(rawValues, groundTruthValues)

    # /// Calculate and print the unimproved result, then the improved result. ///
    for label, fixed_DF in (("Unimproved", unimproved_DF), ("Improved", improved_DF)):
        fixedValues = fixed_DF[column].to_list()
        # Number of all records fixed by program
        numRFP = getFixedNum(rawValues, fixedValues)
        # Number of records correctly fixed by program
        numRCFP = getCorrectlyFixedNum(groundTruthValues, fixedValues, rawValues)

        # A ratio is undefined when its denominator is zero (the program or the
        # manual inspection changed nothing); report NaN instead of letting
        # the cell fail with ZeroDivisionError.
        precision = numRCFP / numRFP if numRFP else float('nan')
        recall = numRCFP / numRFMI if numRFMI else float('nan')

        print(column.replace('_', ' ').upper() + " " + label + ":")
        print("Precision: " + str(precision))
        print("Recall: " + str(recall))
        print('\n')

CITY NAME Unimproved:
Precision: 1.0
Recall: 0.1891891891891892


CITY NAME Improved:
Precision: 0.9736842105263158
Recall: 1.0


STREET NAME Unimproved:
Precision: 0.9171597633136095
Recall: 0.9281437125748503


STREET NAME Improved:
Precision: 0.9585798816568047
Recall: 0.9700598802395209


