In [20]:
import json

# Decode the two JSON input files used by the comparison below.
def _read_json(path):
    """Return the decoded contents of the UTF-8 encoded JSON file at ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


api_data = _read_json('apiResponse/all_responses.json')
hand_data = _read_json('Samples/sample_siirtokarjalaiset_annotated.json')



In [25]:

from fuzzywuzzy import fuzz

def parse_response(response_str):
    """Parse a newline-separated ``Key: value`` response into a dictionary.

    Each non-blank line is trimmed and split at the first ``': '``. The text
    before it becomes the key and the stripped text after it becomes the
    value; an empty value is stored as ``None``. A line with no ``': '``
    separator is kept as a key mapped to ``None``, with any trailing colon
    removed so that ``"Key:"`` and ``"Key: "`` both produce the key ``"Key"``.

    Args:
        response_str: Raw response text. ``None`` or an empty string yields
            an empty dictionary.

    Returns:
        dict: Maps each key to its stripped value, or to ``None`` when the
        line carried no value. If a key occurs more than once, the last
        occurrence wins.
    """
    parsed_response = {}
    if not response_str:
        return parsed_response

    for raw_line in response_str.split('\n'):
        # Trimming also removes a stray '\r' left behind by Windows line endings.
        line = raw_line.strip()
        if not line:
            # Blank lines would otherwise register a spurious '' key.
            continue

        key, separator, value = line.partition(': ')
        if not separator:
            # "Key:" with nothing after the colon has no ': ' to split on;
            # drop the colon so lookups by the bare key still succeed.
            key = key.rstrip(':')

        value = value.strip()
        parsed_response[key.strip()] = value or None
    return parsed_response

def are_similar(str1, str2, threshold=40):
    """Report whether two strings are fuzzily similar.

    Args:
        str1: First string to compare.
        str2: Second string to compare.
        threshold: Score that must be strictly exceeded for the pair to
            count as similar.

    Returns:
        bool: ``True`` when ``fuzz.token_set_ratio(str1, str2)`` is greater
        than ``threshold``; a score equal to the threshold is not a match.
    """
    similarity_score = fuzz.token_set_ratio(str1, str2)
    return similarity_score > threshold

def _split_items(values):
    """Lower-case a comma-separated string and return its trimmed, non-empty items."""
    if not values:
        return []
    trimmed = (item.strip() for item in values.lower().split(','))
    return [item for item in trimmed if item]


def compare_values(api_values, annotated_values):
    """Compare two comma-separated value strings using fuzzy matching.

    Both inputs are lower-cased and split on commas. Items are trimmed and
    empty items are dropped, so ``"a,b"``, ``"a, b"`` and ``"a ,  b,"`` all
    yield the same two items. Every API item is then compared with every
    annotated item via ``are_similar``.

    Args:
        api_values: Comma-separated string from the API response, or a falsy
            value (``None`` / ``""``) when there is nothing to compare.
        annotated_values: Comma-separated string from the hand annotation,
            or a falsy value.

    Returns:
        tuple[set, set]: ``(matches, mismatches)``. ``matches`` holds the API
        items that were similar to at least one annotated item.
        ``mismatches`` holds every item from either side that took part in
        no similar pair.
    """
    api_list = _split_items(api_values)
    annotated_list = _split_items(annotated_values)

    # Sets guarantee each item is reported at most once.
    matches = set()
    # Everything starts out as a mismatch and is removed once it is paired.
    mismatches = set(api_list) | set(annotated_list)

    for api_val in api_list:
        for ann_val in annotated_list:
            if are_similar(api_val, ann_val):
                # Only the API-side spelling is recorded as the match.
                matches.add(api_val)
                # Both sides of a similar pair stop counting as mismatches.
                mismatches.discard(api_val)
                mismatches.discard(ann_val)

    return matches, mismatches


# api_data and hand_data are already decoded JSON; these aliases only give
# them the names used by the comparison loop.
api_responses = api_data
hand_annotated = hand_data

# Records are paired purely by position, and zip() stops at the shorter
# input, so make any silent truncation visible.
if len(api_responses) != len(hand_annotated):
    print(
        f"Warning: {len(api_responses)} API responses vs "
        f"{len(hand_annotated)} annotated records; unpaired entries are ignored."
    )

# Compare every API response with the hand annotation at the same position.
results = []
total_matches = 0
total_mismatches = 0

for api_resp, hand_ann in zip(api_responses, hand_annotated):
    parsed_api_response = parse_response(api_resp['api_response'])

    comparison_results = {
        "index": hand_ann['index'],
        "person_name": hand_ann['primary_person_name'],
        "spouse_name": hand_ann['spouse_name'],
        "detail": []
    }

    for key in ["person_hobbies", "person_social_orgs", "spouse_hobbies", "spouse_social_orgs"]:
        # The annotation uses snake_case keys while the parsed response is
        # looked up in CamelCase, e.g. "person_social_orgs" -> "PersonSocialOrgs".
        api_key = "".join(word.capitalize() for word in key.split("_"))

        matches, mismatches = compare_values(parsed_api_response.get(api_key, ""), hand_ann[key])

        detail = {
            "type": key,
            "matches": list(matches),
            "mismatches": list(mismatches)
        }
        comparison_results["detail"].append(detail)

        total_matches += len(matches)
        total_mismatches += len(mismatches)

    results.append(comparison_results)

output_json = json.dumps(results, indent=4, ensure_ascii=False)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 (matching how the inputs are read) instead of
# depending on the platform's locale default.
with open("comparison_results.json", "w", encoding="utf-8") as file:
    file.write(output_json)

# Printing total matches and mismatches
print(f"Total Matches: {total_matches}")
print(f"Total Mismatches: {total_mismatches}")

# Calculating and printing the match percentage
total_comparisons = total_matches + total_mismatches
if total_comparisons > 0:  # Prevent division by zero
    match_percentage = (total_matches / total_comparisons) * 100
    print(f"Match Percentage: {match_percentage:.2f}%")
else:
    print("No comparisons were made (Total Comparisons: 0).")

Total Matches: 23
Total Mismatches: 34
Match Percentage: 40.35%
