In [None]:
import antigranular as ag

# Open an Antigranular session for the competition. Per the output below, this
# loads the two competition datasets into the remote kernel and registers the
# `%%ag` cell magic used by the following cells.
# NOTE(review): the two positional arguments are redacted here; prefer reading
# credentials from environment variables instead of hardcoding them.
session = ag.login(
    "*****",
    "*****",
    competition="Harvard OpenDP Hackathon",
)

local_host_port: 0440b9cb-a832-4024-a61c-ae068edd15c1
server_hostname: ip-100-100-18-69.eu-west-1.compute.internal
tls_cert_name: ip-100-100-18-69.eu-west-1.compute.internal_0440b9cb-a832-4024-a61c-ae068edd15c1
cert_thumbprint: 84da402963f19d61e7bf9852264428a47d371be9bd2714c99dc30f25f8915a09b8b72232facd6bfe26de52ed83d9cd3e7fb27a8b6da89af59cdd0c3a1361ae75
Dataset "Flight Company Dataset" loaded to the kernel as [92mflight_company_dataset[0m

Dataset "Health Organisation Dataset" loaded to the kernel as [92mhealth_organisation_dataset[0m

Connected to Antigranular server session id: d037997d-3622-4710-b17f-cba8fa784896, the session will time out if idle for 25 minutes
Cell magic '%%ag' registered successfully, use `%%ag` in a notebook cell to execute your python code on Antigranular private python server
🚀 Everything's set up and ready to roll!


In [None]:
%%ag

# Short aliases for the two datasets that the login step loaded into the kernel.
flight = flight_company_dataset
health = health_organisation_dataset

# Let's remove the patient records that did not test positive:
# keep 'positive' in covidtest_result, blank out every other value, then drop
# the rows that now contain a missing value.
# NOTE(review): presumably `where`/`dropna` mirror pandas here; a bare `dropna()`
# would also drop rows with a missing value in any other column - confirm this
# is intended.
health['covidtest_result'] = health['covidtest_result'].where(
    health['covidtest_result'] == 'positive'
)
health = health.dropna()

**Flight Company Dataset**

* The first dataset contains raw data from airline operators about all the flights operated between 1 Jan 2020 and 31 Jan 2020, including the passenger details for each flight.
* Dataset size = 85242 rows × 9 columns
* This dataset seems to have inconsistent text entry conventions for the dates, emails, phone numbers etc. and might need a bit of cleanup.

| Attribute              | Description                                                                                                             | Example(s)                              |
|------------------------|-------------------------------------------------------------------------------------------------------------------------|-----------------------------------------|
| flight_number          | A unique identifier for each flight from the airline company.                                                           | ROM-LON-459, BRA-IND-223, TOK-ROM-950   |
| flight_date            | The date of departure of the flight.                                                                                     | yyyy-mm-dd                              |
| flight_from            | The departure city for the flight, also reflected in the first group of 3 letters of the flight number.                  | -                                       |
| flight_to              | The destination city for the flight, also reflected in the second group of 3 letters of the flight number.               | -                                       |
| passenger_firstname    | First name and sometimes the middle name of the passenger are jointly recorded, sometimes without any space between them.| NokVanu, Nok Vanu, Nok                 |
| passenger_lastname     | The last name of the passenger.                                                                                          | -                                       |
| passenger_date_of_birth| The date of birth of the passenger, in various formats.                                                                  | 01.Sep.1990, 1 September 90, 1-Sept-1990|
| passenger_phone_number | The phone number of the passenger, in multiple formats.                                                                  | +1-343-343-9900, 0091 992 992 9900      |
| passenger_email_address| The email address of the passenger, including some with spelling mistakes.                                               | blue_daisy_345@htmail.co.uk, nok_jam_nok@gamil.com |


**Health Organisation Dataset**

* This dataset contains raw data from a National Health Organization, which includes patient details and test results for all people who took a COVID test.
* Dataset size = 71707 rows × 8 columns
* This dataset seems to have inconsistent text entry conventions for the dates, emails, phone numbers etc. and might need a bit of cleanup.



| Attribute             | Description                                                                                                           | Example(s)                                      |
|-----------------------|-----------------------------------------------------------------------------------------------------------------------|-------------------------------------------------|
| patient_firstname     | First name and sometimes the middle name of the patient are jointly recorded, sometimes without any space between them. | TamNok, Tam Nok, Tam                            |
| patient_lastname      | The last name of the patient.                                                                                         | -                                               |
| patient_date_of_birth | The date of birth of the patient, in various formats.                                                                  | 01.Sep.1990, 1 September 90, 1-Sept-1990        |
| patient_phone_number  | The phone number of the patient, in multiple formats.                                                                  | +44-343-343-9900, 001 992 992 8900              |
| patient_email_address | The email address of the patient, including some with spelling mistakes.                                               | blue.daisy.345@htmail.com, nok_jam_nok@googlemail.com |
| covidtest_date        | The date that the patient took the COVID test.                                                                         | yyyy-mm-dd                                      |
| covidtest_result      | The result of the COVID test; indicates if the patient had COVID at the time of the test.                              | positive, negative                              |
| patient_address       | The address of the patient when they took the COVID test; could be an office or hotel address.                         | -                                               |


In [None]:
%%ag

import re
import datetime
import op_pandas as opd
import op_recordlinkage as rl


# The dates in the dataset are not uniformly formatted, format="mixed" allows for individual analysis of each date entry.
# The parsed values go into new "*_date_of_birth_1" columns (datetime64[ns] in the
# dtypes listing further down); the raw string columns are left untouched.
# NOTE(review): presumably errors="coerce" turns unparseable entries into missing
# values as in pandas.to_datetime - confirm against op_pandas.
flight["passenger_date_of_birth_1"] = opd.to_datetime(flight["passenger_date_of_birth"], errors="coerce", format="mixed")
health["patient_date_of_birth_1"] = opd.to_datetime(health["patient_date_of_birth"], errors="coerce", format="mixed")


def get_month_1(dob: str) -> str:
    """
    Guess the month of a free-form date-of-birth string, preferring day-before-month.

    Resolution order:
      1. If any three-letter English month abbreviation ('jan' ... 'dec') occurs
         anywhere in the lower-cased text, its two-digit number is returned.
      2. Otherwise stand-alone single digits are zero-padded and every stand-alone
         two-digit token in the range 01-12 is collected.
         - exactly two different tokens: the second one is returned
           (i.e. the date is read as day/month);
         - any other non-empty result: the first token is returned.
      3. If nothing qualifies, the sentinel "13" is returned.

    :param dob: Date of birth as raw text, e.g. "01.Sep.1990" or "01/09/1990".
    :return: Two-digit month string "01".."12", or "13" when no month is found.
    """
    # Three-letter abbreviation -> two-digit month number
    month_dict = {
        'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04',
        'may': '05', 'jun': '06', 'jul': '07', 'aug': '08',
        'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12',
    }

    text = dob.lower()  # case-insensitive matching

    # A spelled-out month wins over any numeric guess
    for abbreviation, number in month_dict.items():
        if abbreviation in text:
            return number

    # Zero-pad lone digits so "1/9/1990" is handled like "01/09/1990"
    padded = re.sub(r"\b([1-9])\b", r"0\1", text)

    # Every stand-alone two-digit token that could be a month
    candidates = re.findall(r"\b(0[1-9]|1[0-2])\b", padded)
    if not candidates:
        return "13"  # sentinel: no month recognised

    # Two distinct candidates: treat the text as day/month and take the later one
    if len(candidates) == 2 and candidates[0] != candidates[1]:
        return candidates[1]

    return candidates[0]

def get_month_2(dob: str) -> str:
    """
    Guess the month of a free-form date-of-birth string, preferring month-before-day.

    Works like get_month_1 for spelled-out months. For purely numeric dates it
    always returns the first stand-alone two-digit token in the range 01-12
    (after zero-padding lone digits), i.e. the month is assumed to precede the day.

    :param dob: Date of birth as raw text, e.g. "1990-09-01" or "09/01/1990".
    :return: Two-digit month string "01".."12", or "13" when no month is found.
    """
    month_dict = {
        'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04',
        'may': '05', 'jun': '06', 'jul': '07', 'aug': '08',
        'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12',
    }

    text = dob.lower()
    for abbreviation, number in month_dict.items():
        if abbreviation in text:
            return number

    # Zero-pad lone digits, then collect every month-like two-digit token
    padded = re.sub(r"\b([1-9])\b", r"0\1", dob)
    candidates = re.findall(r"\b(0[1-9]|1[0-2])\b", padded)

    # First candidate is taken as the month; "13" marks "no month recognised"
    return candidates[0] if candidates else "13"

def add_month(df: opd.PrivateDataFrame, person: str)-> None:
    """
    Add both month guesses for a person's date of birth to `df`, in place.

    Reads the "<person>_date_of_birth" column and writes "<person>_month_1"
    (via get_month_1) and "<person>_month_2" (via get_month_2).

    :param df: Frame holding the "<person>_date_of_birth" column.
    :param person: Column prefix, e.g. "passenger" or "patient".
    """
    dob_key = f"{person}_date_of_birth"
    df[f"{person}_month_1"] = df[dob_key].map(get_month_1)
    df[f"{person}_month_2"] = df[dob_key].map(get_month_2)

# Derive the month_1 / month_2 columns for both datasets.
add_month(flight, "passenger")
add_month(health, "patient")

def standardize_email(email: str) -> str:
    """
    Reduce an email address to the words of its local part.

    Everything from the first "@" onwards is discarded, then each run of
    non-word characters and underscores becomes a single space, and the result
    is stripped of surrounding whitespace.

    Examples:
        blue_daisy_345@htmail.co.uk -> blue daisy 345
        nok_jam_nok@gamil.com       -> nok jam nok
        blue.daisy.345@htmail.com   -> blue daisy 345
        nok_jam_nok@googlemail.com  -> nok jam nok

    :param email: Raw email address.
    :return: Space-separated tokens of the part before the "@".
    """
    # Drop the domain part
    local_part = re.sub(r"@.*", "", email)

    # Collapse separators (punctuation, underscores, whitespace) into single spaces
    spaced = re.sub(r"[\W_]+", " ", local_part)

    return spaced.strip()

# Store the cleaned email local part in new "*_email_address_1" columns.
flight["passenger_email_address_1"] = flight[["passenger_email_address"]].applymap(standardize_email)
health["patient_email_address_1"] = health[["patient_email_address"]].applymap(standardize_email)

def standardize_phone_number(phone_number: str) -> str:
    """
    Strip every non-digit character from a phone number.

    Examples:
        +1-343-343-9900   -> 13433439900
        0091 992 992 9900 -> 00919929929900
        +44-343-343-9900  -> 443433439900
        001 992 992 8900  -> 0019929928900

    :param phone_number: Raw phone number in any format.
    :return: The digits of the number, in their original order.
    """
    return re.sub(r"\D", "", phone_number)

# Store the digits-only phone numbers in new "*_phone_number_1" columns.
flight["passenger_phone_number_1"] = flight[["passenger_phone_number"]].applymap(standardize_phone_number)
health["patient_phone_number_1"] = health[["patient_phone_number"]].applymap(standardize_phone_number)

def clean_first_name(name: str) -> str:
    """
    Extracts the first name from a given string.

    Assumes that the first whitespace-separated word holds the first name, and
    that if this word contains more than one capital letter, the first name ends
    right before the second capital letter.
    e.g. "John Smith" -> "John"
    e.g. "JohnSmith" -> "John"
    e.g. "JohnSmithJunior" -> "John"
    e.g. "JohnJunior" -> "John"   (repeated capital letter)
    e.g. "" -> ""                 (empty / whitespace-only input)

    :param name: A string representing a first name, optionally fused or
                 followed by a middle name.
    :return: The extracted first name, or "" if the input has no words.
    """
    # Split the name into words based on whitespace
    words = name.split()

    # Nothing to extract from an empty or whitespace-only value
    if not words:
        return ""

    first_word = words[0]

    # Locate the second capital letter by position. Searching by character
    # (str.find) would hit the first capital again whenever both capitals are
    # the same letter, e.g. "JohnJunior", and yield an empty name.
    capitals_seen = 0
    for index, char in enumerate(first_word):
        if char.isupper():
            capitals_seen += 1
            if capitals_seen == 2:
                return first_word[:index]

    # Zero or one capital letter: the whole word is the first name
    return first_word


# Store the extracted first names in new "*_firstname_1" columns.
# NOTE(review): these are the only applymap calls in this cell that pass eps=0;
# confirm whether the other applymap calls should pass it too.
flight["passenger_firstname_1"] = flight[["passenger_firstname"]].applymap(clean_first_name, eps=0)
health["patient_firstname_1"] = health[["patient_firstname"]].applymap(clean_first_name, eps=0)


def standardize_lastname(lastname: str) -> str:
    """
    Normalise a last name for matching.

    :param lastname: Raw last name.
    :return: The name lower-cased, with leading and trailing whitespace removed.
    """
    return lastname.lower().strip()

# Store the normalised last names in new "*_lastname_1" columns
# (used later as a blocking key).
flight["passenger_lastname_1"] = flight[["passenger_lastname"]].applymap(standardize_lastname)
health["patient_lastname_1"] = health[["patient_lastname"]].applymap(standardize_lastname)

In [None]:
%%ag
# Sanity check: list the health columns and dtypes to confirm the derived "*_1" / month columns exist.
ag_print(health.dtypes)

patient_firstname                  object
patient_lastname                   object
patient_date_of_birth              object
patient_phone_number               object
patient_email_address              object
covidtest_date                     object
covidtest_result                   object
patient_address                    object
patient_firstname_1                object
patient_lastname_1                 object
patient_date_of_birth_1    datetime64[ns]
patient_month_1                    object
patient_month_2                    object
patient_email_address_1            object
patient_phone_number_1             object
dtype: object



In [None]:
%%ag

import op_recordlinkage as rl

# Build the candidate pairs to compare between the flight and health records.
indexer = rl.Index()

# block on derived last_name_1 and month_1
# chose these as these columns wouldn't eliminate strong matches in the indexing stage itself
# First list: key columns on the flight side; second list: the matching columns on the health side.
indexer.block(
    ["passenger_lastname_1", "passenger_month_1"],
    ["patient_lastname_1", "patient_month_1"]
)
candidate_links = indexer.index(flight, health)

In [None]:
%%ag
import datetime

# Configure the per-pair comparisons; each one contributes a feature column named by `label`.
comparer = rl.Compare()

# First name: compared on the RAW columns (not the cleaned "_1" ones) with a
# similarity threshold.
# NOTE(review): the method is spelled "jarowinkler" here but "jaro_winkler" in
# the phone/email comparisons below - confirm both spellings are accepted and
# settle on one.
comparer.string(
    "passenger_firstname",
    "patient_firstname",
    method="jarowinkler",
    threshold=0.7, # sometimes middlename is (not)included
    label="firstname",
)

# Disabled experiment: compare the cleaned first names instead of the raw ones.
# comparer.string(
#     "passenger_firstname_1",
#     "patient_firstname_1",
#     method="jarowinkler",
#     label="firstname_1",
# )

# Phone: string similarity on the digits-only numbers (no threshold given).
comparer.string(
    "passenger_phone_number_1",
    "patient_phone_number_1",
    method="jaro_winkler",
    label="phone"
)


# Email: string similarity on the cleaned local parts (no threshold given).
comparer.string(
    "passenger_email_address_1",
    "patient_email_address_1",
    method="jaro_winkler",
    label="email"
)


# Date of birth: exact equality of the parsed datetime columns.
comparer.exact(
    "passenger_date_of_birth_1",
    "patient_date_of_birth_1",
    label="dob"
)


def cmp(date_str1: str , date_str2: str ) -> int:
    """
    Score how close two "YYYY-MM-DD" dates are to each other.

    :param date_str1: First date, formatted as "%Y-%m-%d".
    :param date_str2: Second date, formatted as "%Y-%m-%d".
    :return: 2 if the dates are fewer than 14 days apart in either direction
             (exactly 14 days apart scores 0), otherwise 0.
    :raises ValueError: If either string does not match "%Y-%m-%d".
    """
    first = datetime.datetime.strptime(date_str1, "%Y-%m-%d")
    second = datetime.datetime.strptime(date_str2, "%Y-%m-%d")

    # Direction does not matter, only the size of the gap in whole days
    gap_in_days = abs((second - first).days)

    return 2 if gap_in_days < 14 else 0

# Register `cmp` as a custom comparison of flight date vs. COVID test date;
# `cmp` returns 2 or 0, so this feature counts double relative to a 0..1 score.
comparer.custom(cmp, "flight_date", "covidtest_date", label="date_cmp")

In [None]:
%%ag
# Evaluate every configured comparison on the blocked candidate pairs.
features = comparer.compute(candidate_links,flight,health)

In [None]:
%%ag
# get an average value using features.sum(axis=1).mean(eps=0.1)
# and then further finetune the threshold value by submitting with nearby values

# 4.6 is the match threshold passed to get_match; it scored best among the
# values recorded in the experiment log at the end of the notebook (4.6, 4.65, 4.7).
linked_df = comparer.get_match(4.6)
# Only the flight number of each linked pair is submitted.
res = linked_df[["l_flight_number"]]
# `x` keeps the return value of the submission; it is not used in the cells shown here.
x = submit_predictions(res)

score: {'leaderboard': 0.9350645603725568, 'logs': {'LIN_EPS': -0.008, 'MCC': 0.9430645603725568}}



In [None]:
# index:last_1, month_1, compare: first(thr:0.7), dob1(exact), email1, phone1, thr: 4.6, score: 0.9350
# index:last_1, month_1, compare: first(thr:0.7), dob1(exact), email1, phone1, thr: 4.65, score: 0.9337
# index:last_1, month_1, compare: first(thr:0.7), dob1(exact), email1, phone1, thr: 4.7, score: 0.9298