In [11]:
def read_goldstandard(file_path):
    """Read the gold-standard test data from a CSV file.

    The first record is treated as a header and skipped. Every following
    non-blank row must have exactly three columns: the DBpedia id, the
    Forbes id and a match flag. Fields are parsed with the ``csv`` module,
    so surrounding double quotes are removed, and leading/trailing
    whitespace is stripped from every field.

    Args:
        file_path: Path of the gold-standard CSV file.

    Returns:
        A dict mapping ``(dbpedia_id, forbes_id)`` to ``True`` when the flag
        column reads "true" (in any letter case) and ``False`` otherwise.
        An empty file yields an empty dict.

    Raises:
        ValueError: If a non-blank row does not have exactly three columns.
    """
    import csv  # function-scope import keeps this definition self-contained

    goldstandard = {}
    # newline='' is what the csv module expects so it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            # Blank or whitespace-only lines (e.g. a trailing newline) carry
            # no data and are skipped instead of aborting the whole read.
            if not any(field.strip() for field in row):
                continue
            if len(row) != 3:
                raise ValueError(
                    f"{file_path}, line {reader.line_num}: "
                    f"expected 3 columns, got {len(row)}: {row!r}"
                )
            dbpedia, forbes, match = (field.strip() for field in row)
            # Accept 'True', 'true', 'TRUE', ...; anything else is a non-match.
            goldstandard[(dbpedia, forbes)] = match.lower() == 'true'
    return goldstandard

def read_correspondence(file_path):
    """Read predicted correspondences from a CSV file.

    The file has no header. Every non-blank row must have exactly three
    columns: the DBpedia id, the Forbes id and a third value that is
    ignored here. Rows are parsed with the ``csv`` module so that a quoted
    field containing a comma is kept as a single column.

    Args:
        file_path: Path of the correspondence CSV file.

    Returns:
        A set of ``(dbpedia_id, forbes_id)`` tuples with surrounding
        whitespace and any double-quote characters removed from both ids.

    Raises:
        ValueError: If a non-blank row does not have exactly three columns.
    """
    import csv  # function-scope import keeps this definition self-contained

    correspondence = set()
    # newline='' is what the csv module expects so it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        for row in reader:
            # Blank or whitespace-only lines (e.g. a trailing newline) carry
            # no data and are skipped instead of aborting the whole read.
            if not any(field.strip() for field in row):
                continue
            if len(row) != 3:
                raise ValueError(
                    f"{file_path}, line {reader.line_num}: "
                    f"expected 3 columns, got {len(row)}: {row!r}"
                )
            dbpedia, forbes, _ = row
            # The csv reader already unquotes well-formed fields; the explicit
            # replace also drops stray quotes (e.g. a quote preceded by a
            # space), matching what the ids looked like before.
            dbpedia = dbpedia.strip().replace('"', '')
            forbes = forbes.strip().replace('"', '')
            correspondence.add((dbpedia, forbes))
    return correspondence

def find_false_positives_negatives(goldstandard, correspondence):
    """Compare predicted pairs against the gold standard.

    Args:
        goldstandard: Mapping of ``(dbpedia_id, forbes_id)`` pairs to a
            truthy value for a match and a falsy value for a non-match.
        correspondence: Collection of predicted ``(dbpedia_id, forbes_id)``
            pairs.

    Returns:
        A ``(false_positives, false_negatives)`` tuple of lists.
        False positives are predicted pairs that the gold standard lists as
        non-matches, in the iteration order of ``correspondence``.
        False negatives are gold-standard matches that were not predicted,
        in the iteration order of ``goldstandard``.
        Predicted pairs that do not occur in the gold standard at all are
        not reported in either list.
    """
    # Defaulting to True makes pairs unknown to the gold standard drop out,
    # so only pairs explicitly labelled as non-matches are kept.
    wrongly_predicted = [
        candidate
        for candidate in correspondence
        if not goldstandard.get(candidate, True)
    ]

    missed_matches = [
        candidate
        for candidate, is_match in goldstandard.items()
        if is_match and candidate not in correspondence
    ]

    return wrongly_predicted, missed_matches

In [12]:
# Example usage: load the gold standard and the predicted correspondences,
# then print the pairs on which the prediction disagrees with the gold standard.

# Absolute Windows paths (raw strings, so backslashes are not escapes).
# They point into one user's home directory; adjust them before running elsewhere.
# NOTE(review): "Identiy Resolution" looks like a typo of "Identity Resolution" —
# confirm against the real directory name before changing it.
goldstandard_file_path = r"C:\Users\stefa\WebDataIntegrationProj\Python\Identiy Resolution\goldstandard\dbpedia_forbes_goldstandard_test.csv"
correspondence_file_path = r'C:\Users\stefa\WebDataIntegrationProj\Results\IdentityResolution\dbpedia_forbes_correspondences_ML_Ada.csv'

# goldstandard_data: dict keyed by (dbpedia, forbes) pairs with a boolean match flag.
goldstandard_data = read_goldstandard(goldstandard_file_path)
# correspondence_data: set of predicted (dbpedia, forbes) pairs.
correspondence_data = read_correspondence(correspondence_file_path)
# Both results are lists of (dbpedia, forbes) pairs.
false_positives, false_negatives = find_false_positives_negatives(goldstandard_data, correspondence_data)

# Print each list on one line, prefixed by its label.
print("False Positives:", false_positives)
print("False Negatives:", false_negatives)

False Positives: [('DBPedia_1048', 'Forbes_1257')]
False Negatives: [('DBPedia_477', 'Forbes_925'), ('DBPedia_3577', 'Forbes_500'), ('DBPedia_8927', 'Forbes_1318'), ('DBPedia_5856', 'Forbes_1067'), ('DBPedia_9085', 'Forbes_1489'), ('DBPedia_4402', 'Forbes_1814'), ('DBPedia_4216', 'Forbes_1070'), ('DBPedia_2147', 'Forbes_417'), ('DBPedia_7306', 'Forbes_277'), ('DBPedia_9552', 'Forbes_1404'), ('DBPedia_270', 'Forbes_1342'), ('DBPedia_9450', 'Forbes_1045'), ('DBPedia_9365', 'Forbes_1136')]
