# In [6]: (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
import json
from datetime import datetime

# Load the JSON records from a file
def load_json(file_path):
    """Read and parse the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; being explicit avoids depending on the platform's
    # locale encoding, which can mis-decode non-ASCII content.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Save the deduplicated JSON records to a file
def save_json(data, file_path):
    """Write ``data`` to ``file_path`` as JSON indented by four spaces.

    Args:
        data: Any JSON-serializable value.
        file_path: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialize.
            In that case the destination file is left untouched.
        OSError: If the file cannot be opened for writing.
    """
    # Serialize first: opening with 'w' truncates the target immediately, so
    # encoding inside the ``with`` block would destroy an existing file (and
    # leave partial JSON behind) whenever serialization fails midway.
    payload = json.dumps(data, indent=4)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(payload)

# Deduplicate the records based on the given rules
def _field_changes(previous, current):
    """List the per-field differences when ``current`` replaces ``previous``."""
    changes = [
        {"field": key, "from": previous.get(key), "to": value}
        for key, value in current.items()
        if value != previous.get(key)
    ]
    # Fields that only the replaced record carried disappear from the output,
    # so report them as changing to None.
    changes.extend(
        {"field": key, "from": value, "to": None}
        for key, value in previous.items()
        if key not in current and value is not None
    )
    return changes


def deduplicate(records):
    """Remove records that share an ``_id`` or an ``email`` with another record.

    Records are processed in list order. An incoming record conflicts with a
    surviving record when they have the same non-empty ``_id`` or the same
    non-empty ``email``; it may therefore conflict with up to two survivors.

    * If any conflicting survivor has a strictly later ``entryDate``, the
      incoming record is discarded (no log entry is written for it).
    * Otherwise (later date, or equal date -- the later list position wins
      ties) the incoming record replaces every conflicting survivor and one
      log entry is written per replaced record.

    A replacement takes the output position of the earliest survivor it
    replaces. Records with neither ``_id`` nor ``email`` can never conflict
    and are always kept.

    Args:
        records: Iterable of dicts, each with an ISO-8601 ``entryDate`` string
            and optional ``_id`` / ``email`` values.

    Returns:
        A ``(deduplicated_records, log)`` tuple. ``log`` is a list of dicts
        with keys ``source_record`` (the replaced record), ``output_record``
        (its replacement) and ``field_changes`` (a list of
        ``{"field", "from", "to"}`` dicts).

    Raises:
        TypeError: If a record has no ``entryDate`` string, or if naive and
            timezone-aware dates end up being compared.
        ValueError: If an ``entryDate`` is not a valid ISO-8601 string.
    """
    survivors = []       # output slots; None marks a slot whose record was merged away
    survivor_dates = []  # parsed entryDate of the record held in each slot
    slot_by_id = {}      # _id -> index into ``survivors``
    slot_by_email = {}   # email -> index into ``survivors``
    log = []

    for record in records:
        record_id = record.get("_id")
        record_email = record.get("email")
        record_date = datetime.fromisoformat(record.get("entryDate"))

        # Ids and emails live in separate namespaces; look each up on its own.
        id_slot = slot_by_id.get(record_id) if record_id else None
        email_slot = slot_by_email.get(record_email) if record_email else None
        conflict_slots = sorted(
            {slot for slot in (id_slot, email_slot) if slot is not None}
        )

        # A strictly newer survivor beats the incoming record. On equal dates
        # the incoming record wins because it appears later in the list.
        if any(survivor_dates[slot] > record_date for slot in conflict_slots):
            continue

        for slot in conflict_slots:
            replaced = survivors[slot]
            log.append({
                "source_record": replaced,
                "output_record": record,
                "field_changes": _field_changes(replaced, record),
            })
            # Release both keys of the replaced record so stale entries can
            # neither resurrect it nor block unrelated records later on.
            slot_by_id.pop(replaced.get("_id"), None)
            slot_by_email.pop(replaced.get("email"), None)
            survivors[slot] = None

        if conflict_slots:
            target = conflict_slots[0]
            survivors[target] = record
            survivor_dates[target] = record_date
        else:
            target = len(survivors)
            survivors.append(record)
            survivor_dates.append(record_date)

        if record_id:
            slot_by_id[record_id] = target
        if record_email:
            slot_by_email[record_email] = target

    # Return the deduplicated records and the log
    deduplicated_records = [kept for kept in survivors if kept is not None]
    return deduplicated_records, log

# Main function to process the file
def main(input_file="/content/leads.json",
         output_file="deduplicated_leads.json",
         log_file="deduplication_log.json"):
    """Deduplicate the leads in ``input_file`` and write the results to disk.

    Args:
        input_file: JSON file holding an object with a ``"leads"`` list; a
            missing ``"leads"`` key is treated as an empty list.
        output_file: Destination for ``{"leads": [...]}`` containing the
            deduplicated records.
        log_file: Destination for the list of replacement log entries.

    The defaults are the paths this script originally had hard-coded, so
    calling ``main()`` with no arguments behaves as before.
    """
    # Load the input records
    data = load_json(input_file)
    records = data.get("leads", [])

    # Deduplicate the records
    deduplicated_records, log = deduplicate(records)

    # Save the deduplicated records and the log
    save_json({"leads": deduplicated_records}, output_file)
    save_json(log, log_file)

# Run the pipeline only when this file is executed directly, so importing it
# (e.g. to reuse deduplicate) does not trigger file I/O.
if __name__ == "__main__":
    main()
