In [1]:
import antigranular as ag
session = ag.login(<client_id>,<client_secret>, competition = "Harvard OpenDP Hackathon")

Dataset "Flight Company Dataset" loaded to the kernel as [92mflight_company_dataset[0m

Dataset "Health Organisation Dataset" loaded to the kernel as [92mhealth_organisation_dataset[0m

Connected to Antigranular server session id: 777e08f6-b19c-4446-8f01-0d25c02a5ba6, the session will time out if idle for 25 minutes
Cell magic '%%ag' registered successfully, use `%%ag` in a notebook cell to execute your python code on Antigranular private python server
🚀 Everything's set up and ready to roll!


## Explore

As per the dataset documentation
- flight_company_dataset: 85242 rows × 9 columns
- health_organisation_dataset: 71707 rows × 8 columns

In [2]:
%%ag

flight = flight_company_dataset.copy()
health = health_organisation_dataset.copy()

In [3]:
%%ag

ag_print(f"Flight columns - {flight.columns}")
ag_print(f"Flight dtypes - \n{flight.dtypes}")
ag_print()
ag_print(f"Health columns - {health.columns}")
ag_print(f"Health dtypes - \n{health.dtypes}")

Flight columns - ['flight_number', 'flight_date', 'flight_from', 'flight_to', 'passenger_firstname', 'passenger_lastname', 'passenger_date_of_birth', 'passenger_phone_number', 'passenger_email_address']
Flight dtypes - 
flight_number              object
flight_date                object
flight_from                object
flight_to                  object
passenger_firstname        object
passenger_lastname         object
passenger_date_of_birth    object
passenger_phone_number     object
passenger_email_address    object
dtype: object

Health columns - ['patient_firstname', 'patient_lastname', 'patient_date_of_birth', 'patient_phone_number', 'patient_email_address', 'covidtest_date', 'covidtest_result', 'patient_address']
Health dtypes - 
patient_firstname        object
patient_lastname         object
patient_date_of_birth    object
patient_phone_number     object
patient_email_address    object
covidtest_date           object
covidtest_result         object
patient_address          obj

### Strategy

- All columns are of object type, keep them as objects to minimize epsilon spend.
- 0 epsilon is spent for string manipulations.
- Standardize all dates to YY-MM-DD format (YY allows us to not worry about 1900s vs 2000s date in edge cases)

### Date Cleaning

Convert dates to YY-MM-DD format

In [4]:
%%ag

import re

def is_leap_year(year: int) -> bool:
    """Determine whether a year is a leap year."""
    # https://stackoverflow.com/a/30714165

    return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)

def clean_months(month_val: str) -> str:
    months = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]

    count = 1
    for month in months:
        if month in month_val.lower():
            return str(count).rjust(2, "0")
        count = count + 1

    return "00"


def validate_day(year: str, month: str, day: str) -> bool:
    days_map = {
        1: 31, # Jan
        2: 28, # Feb
        3: 31, # March
        4: 30, # April
        5: 31, # May
        6: 30, # Jun
        7: 31,# July
        8: 31, # Aug
        9: 30, # Sept
        10: 31, # Oct
        11: 30, # Nob
        12: 31 # Dec
    }

    # leap year check
    # We assume dates are only valid between 1901 to 2020 (birthdates only one or two people alive that has lived till 120)
    # very rare that they would be taking a flight
    year_int = 0
    if int(year) == 0:
        year = 2000
    else:
        year = int("20" + year) # Years across 100 yrs are still leap

    if is_leap_year(year):
        days_map.update({2: 29})

    day_valid = int(day) <= days_map[int(month)]

    return day_valid   


def clean_dates(value: str, lower: bool) -> str:
    """
    Convert date value to YYYY-MM-DD format

    No date should exist after 31-Jan-2020

    Months are given alphabetically, so we just need to find years and days

    We assume confusing cases like 06 Jan 2001 being represented as 06 Jan 01
    won't exist or will at least be represented in a differentiating way like 6 Jan 01

    But if they do, we take two combinations based on which number is lower.
    We also assume that years would ideally be between 1920 to 2020, so any years like 06 will be treated as 2006.
    """
    delims = [".", " ", "-"]

    # Handle multiple occurences and replace with only one delim
    value = re.sub(r"[-.\s]+", "-", value)

    # Only keep expected characters
    value_final = ""
    for letter in value:
        if letter.isalnum() or letter == "-":
            value_final = value_final + letter

    value_comps = value_final.strip().split("-")

    year, month, day = None, None, None

    remaining = []

    # Find month and year
    for val in value_comps:
        if val.strip().isalpha():
            # Months are always alphabetical
            month = clean_months(val)
        
        elif val.isnumeric():
            if len(val) == 4:
                year = val[2:]

            elif int(val) > 31:
                # no day can exist after 31
                year = val

            elif val == "00":
                # Assign as 2000
                year = val

            elif len(val) == 1:
                # Years would be 2 digit at the least
                day = val.rjust(2, "0")
        
            else:
                remaining.append(val)


    if month:
        if year:
            if not day:
                # if year and month found
                day = remaining[0]

        else:
            if day:
                year = remaining[0]

            else:
                # Only confusing cases would remain
                remaining2 = sorted(remaining, key=lambda x: int(x))

                # If nothing matches for example: 06 Jan 10, ideally day should be first and year should be last
                if lower:
                    year, day = remaining2
                else:
                    day, year  = remaining2

                # really rare edge cases
                day_valid = validate_day(year=year, month=month, day=day)

                if not day_valid:
                    year, day = day, year # swap them if day is not valid

                day = day.rjust(2, "0")
                year = year.rjust(2, "0")

    elif year:
        # We assume this case won't happen
        # as per the organizers
        remaining2 = sorted(remaining, key=lambda x: int(x))

        if lower:
            month, day = remaining2
        else:
            day, month = remaining2


        month = month.rjust(2, "0")
        day = day.rjust(2, "0")

        if int(month) > 12:
            month, day = day, month # swap
        else:
            # really rare edge cases
            day_valid = validate_day(year=year, month=month, day=day)

            if not day_valid:
                month, day = day, month # swap them if day is not valid
            
    return f"{year}-{month}-{day}"


def clean_dates_lower(value: str, ) -> str:
    return clean_dates(value, True)

def clean_dates_upper(value: str, ) -> str:
    return clean_dates(value, False)

def get_age(dob:str) -> str:
    """
    Give age for a person based on year and dob. A way to cover more ground using DOB and also maybe some errors in upper lower dates
    """
    # Assume dob is cleaned in the format YY-MM-DD. We assume all dates are of 1900's as it won't cause problems with our format
    dob_year_month = dob.split("-")[0] + dob.split("-")[1]

    return dob_year_month


### Name cleaning

In [5]:
%%ag

def get_firstname(mix_name: str) -> str:
    mix_name = re.sub(r"\s+", " ", mix_name.strip())

    splitter = None
    if " " in mix_name:
        splitter = " "
    else:
        # Split on first Upper Case letter
        new_str = ""
        for letter in mix_name:
            if letter.isupper() and letter != mix_name[0]:
                new_str = new_str + f" {letter}"
                splitter = " "
            else:
                new_str = new_str + letter
        mix_name = new_str

    if splitter:
        # mix_name = "".join(mix_name.split(splitter)[:-1]).lower()
        mix_name = mix_name.split(splitter)[0].lower()
    else:
        mix_name = mix_name.lower()

    new_str = ""
    for ch in mix_name:
        if ch.isalpha():
            new_str = new_str + ch

    return new_str.lower()
    
def get_middlename(mix_name: str) -> str:
    mix_name = re.sub(r"\s+", " ", mix_name.strip())

    splitter = None
    if " " in mix_name:
        splitter = " "
    else:
        # Split on first Upper Case letter
        new_str = ""
        for letter in mix_name:
            if letter.isupper() and letter != mix_name[0]:
                new_str = new_str + f" {letter}"
                splitter = " "
            else:
                new_str = new_str + letter
        mix_name = new_str

    if splitter:
        return "".join(mix_name.split(splitter)[-1]).lower()
    else:
        return ""


def get_lastname(mix_name: str) -> str:
    mix_name = re.sub(r"\s+", " ", mix_name.strip())
    return mix_name.lower()


def get_firstname2(mix_name: str) -> str:
    """
    Don't worry about middle names convert every valid char to lowercase
    """
    mix_name = re.sub(r"\s+", " ", mix_name.strip())

    new_str = ""
    for ch in mix_name:
        if ch.isalpha():
            new_str = new_str + ch

    return new_str.lower()

In [6]:
%%ag

def clean_phonenumbers(val: str) -> str:
    """
    Ideally phone numbers should be clean and all should have country codes

    If they don't we can strip all country codes and then compare. This function currently
    removes all chars except numbers.

    We also remove leading 0s from the value.
    """
    ls = []
    for letter in val:
        if letter.isnumeric():
            ls.append(letter)

    value = "".join(ls)

    value = re.sub("^0+", "", value)

    return value

In [7]:
%%ag

def get_username(email_str: str) -> str:
    """
    Match usernames exactly
    """
    email_str = email_str.strip()

    username = email_str.split("@")[0]

    return username

def get_domain(email_str: str) -> str:
    """
    Match domains fuzzily.
    """
    email_str = email_str.replace(".", "").strip()

    domain = email_str.split("@")[1]

    return domain

### Flight EDA & Cleaning

Columns:
- flight_number | ROM-LON-459
- flight_date | Format YYYY-MM-DD
- flight_from | BOM Bombay
- flight_to | LON London
- passenger_firstname | Can have middle names (Clean them)
- passenger_lastname | Fuzzy match
- passenger_date_of_birth | Cleanup dates
- passenger_phone_number | Cleanup numbers
- passenger_email_address | Exact match on usernames and fuzzy match on domains

In [8]:
%%ag

flight["flight_fname"] = flight["passenger_firstname"].map(get_firstname, eps=0)
# flight["flight_fname"] = flight["passenger_firstname"].map(get_firstname2, eps=0)
flight["flight_mname"] = flight["passenger_firstname"].map(get_middlename, eps=0)
flight["flight_lname"] = flight["passenger_lastname"].map(get_lastname, eps=0)

In [9]:
%%ag

# flight["flight_dob"] = flight["passenger_date_of_birth"].map(clean_dates, eps=0)
flight["flight_dob_lower"] = flight["passenger_date_of_birth"].map(clean_dates_lower, eps=0)
flight["flight_dob_upper"] = flight["passenger_date_of_birth"].map(clean_dates_upper, eps=0)

In [10]:
%%ag
flight["flight_phone_num"] = flight["passenger_phone_number"].map(clean_phonenumbers, eps=0)
flight["flight_email_user"] = flight["passenger_email_address"].map(get_username, eps=0)
flight["flight_email_domain"] = flight["passenger_email_address"].map(get_domain, eps=0)

flight["flight_age"] = flight["flight_dob_lower"].map(get_age, eps=0)

### Health EDA & Cleaning

Columns:
- patient_firstname - Clean and match fuzzily
- patient_lastname - Clean and match fuzzily
- patient_date_of_birth - Clean dates
- patient_phone_number - Clean numbers
- patient_email_address - Clean usernames and match exact, match domains fuzzily
- covidtest_date - YYYY-MM-DD
- covidtest_result - positive | negative
- patient_address - Not used

In [11]:
%%ag

# Lets remove those passenger records who tested negative. (Taken from docs)
health['covidtest_result'] = health['covidtest_result'].where(health['covidtest_result'] == 'positive')
health = health.dropna().copy()

In [12]:
%%ag

health["health_fname"] = health["patient_firstname"].map(get_firstname, eps=0)
# health["health_fname"] = health["patient_firstname"].map(get_firstname2, eps=0)
health["health_mname"] = health["patient_firstname"].map(get_middlename, eps=0)
health["health_lname"] = health["patient_lastname"].map(get_lastname, eps=0)

In [13]:
%%ag

# health["health_dob"] = health["patient_date_of_birth"].map(clean_dates, eps=0)
health["health_dob_lower"] = health["patient_date_of_birth"].map(clean_dates_lower, eps=0)
health["health_dob_upper"] = health["patient_date_of_birth"].map(clean_dates_upper, eps=0)

In [14]:
%%ag
health["health_phone_num"] = health["patient_phone_number"].map(clean_phonenumbers, eps=0)
health["health_email_user"] = health["patient_email_address"].map(get_username, eps=0)
health["health_email_domain"] = health["patient_email_address"].map(get_domain, eps=0)

health["health_age"] = health["health_dob_lower"].map(get_age, eps=0)

### Record Linking

In [15]:
%%ag

import op_recordlinkage as rl

In [16]:
%%ag
from op_recordlinkage.preprocessing import clean , phonetic

# Some preprocessing for record linking
# For names, the description does not say anything about misspelling in names still
# we use fuzzy matching for firstnames as well as lastnames.

# flight["passenger_firstname2"] = clean(flight["passenger_firstname"], lowercase=True, remove_brackets=True)
flight["flight_fname"] = clean(flight["flight_fname"], lowercase=True, remove_brackets=True, replace_by_none="^(mrs|dr|mr|ms)")
flight["flight_mname"] = clean(flight["flight_mname"], lowercase=True, remove_brackets=True)
flight["flight_lname"] = clean(flight["flight_lname"], lowercase=True, remove_brackets=True)
flight["flight_email_domain"] = clean(flight["flight_email_domain"], lowercase=True, remove_brackets=True)

# health["patient_firstname2"] = clean(health["patient_firstname"], lowercase=True, remove_brackets=True)
health["health_fname"] = clean(health["health_fname"], lowercase=True, remove_brackets=True, replace_by_none="^(mrs|dr|mr|ms)")
health["health_mname"] = clean(health["health_mname"], lowercase=True, remove_brackets=True)
health["health_lname"] = clean(health["health_lname"], lowercase=True, remove_brackets=True)
health["health_email_domain"] = clean(health["health_email_domain"], lowercase=True, remove_brackets=True)

### Indexing

- The cleanest column available for indexing is date of birth in both datasets.
- Although date of birth itself won't always be consistent, due to time constraint and server loads, the indexing in this notebook was done on `date of birth`.
- A way to improve score would be to index on more columns such as `lastname`, `phone_number`, etc. to get more pairs for comparison. But attempts at adding more columns to the index failed due to timeout / server loads.

In [17]:
%%ag

indexer = rl.Index()
# indexer.block(left_on=["flight_lname"], right_on=["health_lname"])
indexer.block(left_on=["flight_dob_lower"], right_on=["health_dob_lower"])

pairs = indexer.index(flight, health)

### Comparing

In [18]:
%%ag

compare_cl = rl.Compare()

# compare_cl.string("passenger_firstname2", "patient_firstname2", method="jarowinkler", threshold=0.85, label="fname") # don't use exact in case spelling mistakes or middle name was not capitalized
compare_cl.string("flight_fname", "health_fname", method="jarowinkler", threshold=0.85, label="fname") # don't use exact in case spelling mistakes or middle name was not capitalized
compare_cl.string("flight_mname", "health_mname", method="jaro", threshold=0.85, label="mname")
compare_cl.string("flight_lname", "health_lname", method="jarowinkler", threshold=0.85, label="lname")

# compare_cl.exact("flight_dob_lower", "health_dob_lower", label="dob_lower", agree_value=0.5) # Already handled by block
compare_cl.exact("flight_dob_upper", "health_dob_upper", label="dob_upper", agree_value=0.5)
compare_cl.string("flight_phone_num", "health_phone_num", method="jarowinkler", threshold=0.85, label="phone_num") # Fuzzy match to deal with missing country codes maybe?
compare_cl.string("passenger_email_address", "patient_email_address", method="damerau_levenshtein", label="email", threshold=0.85)
# compare_cl.exact("flight_email_user", "health_email_user", label="email_user", agree_value=1.5)

# Using a custom compare rule.
import datetime
def cmp(date_str1: str , date_str2: str) -> int:
    # Convert date strings to datetime objects
    date1 = datetime.datetime.strptime(date_str1, "%Y-%m-%d")
    date2 = datetime.datetime.strptime(date_str2, "%Y-%m-%d")

    # Calculate the absolute difference in days
    days_apart = (date2 - date1).days
    # Check if the dates are within two weeks (14 days) apart
    if days_apart <= 14:
        return 2
    else:
        return 0

compare_cl.custom(cmp,"flight_date","covidtest_date",label="date_cmp")


features = compare_cl.compute(pairs, flight, health)

In [19]:
%%ag
thresh = 5

linked_df = compare_cl.get_match(thresh)

In [20]:
%%ag
# Submitting the column containing the filtered set of airlines we should report regarding a covid passenger.
res = linked_df[["l_flight_number"]]
result = submit_predictions(res)

score: {'leaderboard': 0.9264646441803617, 'logs': {'LIN_EPS': -0.0, 'MCC': 0.9264646441803617}}



In [21]:
session.terminate_session()

{'status': 'ok'}