In [1]:
import antigranular as ag
# session = ag.login(<client_id>,<client_secret>, competition = "Harvard OpenDP Hackathon")
session = ag.login("#############", "############", competition = "Harvard OpenDP Hackathon")

server_hostname: ip-100-100-18-120.eu-west-1.compute.internal
tls_cert_name: ip-100-100-18-120.eu-west-1.compute.internal_d2abb397-de0a-4d0f-8749-121ccf7e20f4
local_host_port: d2abb397-de0a-4d0f-8749-121ccf7e20f4
Loading dataset "Flight Company Dataset" to the kernel...
Output: Dataset "Flight Company Dataset" loaded to the kernel as [92mflight_company_dataset[0m

Dataset "Flight Company Dataset" loaded to the kernel as flight_company_dataset
Loading dataset "Health Organisation Dataset" to the kernel...
Output: Dataset "Health Organisation Dataset" loaded to the kernel as [92mhealth_organisation_dataset[0m

Dataset "Health Organisation Dataset" loaded to the kernel as health_organisation_dataset
Connected to Antigranular server session id: 81d9f47e-fc04-4a05-83be-77ac838445f3, the session will time out if idle for 25 minutes
Cell magic '%%ag' registered successfully, use `%%ag` in a notebook cell to execute your python code on Antigranular private python server
🚀 Everything's set 

In [2]:
%%ag
health = health_organisation_dataset
flight = flight_company_dataset

# 1. Data Cleaning

Remove the passenger records who tested negative for covid-19. We will only consider the passenger records who tested positive for covid-19.

In [3]:
%%ag
# Lets remove those passenger records who tested negative.
health['covidtest_result'] = health['covidtest_result'].where(health['covidtest_result'] == 'positive')
health = health.dropna()

## 1.1. First name and middle name

first name and sometimes the middle name of the patient are jointly recorder, sometimes the first and middle name are recorded without any space between them. To handle this we will consider only the first name, so we eliminate the middle name information

In [4]:
%%ag
def clean_first_name(name: str) -> str:
    """
    Extracts the first name from a given string.
    Assumes that the first word is the first name, and if there are multiple capital letters in the first word,
    the first name ends before the second capital letter.
    e.g. "John Smith" -> "John"
    e.g. "JohnSmith" -> "John"
    e.g. "JohnSmithJunior" -> "John"

    :param name: A string representing a full name.
    :return: The extracted first name.
    """
    # Split the name into words based on spaces
    words = name.split()

    # Check the first word for capital letters
    first_word = words[0]
    capital_letters = [char for char in first_word if char.isupper()]

    # If more than one capital letter is present, assume first name ends before the second capital letter
    if len(capital_letters) > 1:
        second_capital_index = first_word.find(capital_letters[1])
        return first_word[:second_capital_index]

    # If only one capital letter, assume the entire word is the first name
    else:
        return first_word

In [5]:
%%ag
health["patient_firstname"] = health[["patient_firstname"]].applymap(clean_first_name, eps=0)
flight["passenger_firstname"] = flight[["passenger_firstname"]].applymap(clean_first_name, eps=0)

## 1.2. Last name

We standardize the last name by capitalizing the first letter and lowercasing the rest of the letters.

In [6]:
%%ag
def clean_last_name(name: str) -> str:
    """
    Standardizes the last name by capitalizing the first letter and lowercasing the rest of the letters.
    e.g. "SMITH" -> "Smith"

    :param name: A string representing a last name.
    :return: The standardized last name.
    """
    return name.capitalize()

In [7]:
%%ag
health["patient_lastname"] = health[["patient_lastname"]].applymap(clean_first_name, eps=0)
flight["passenger_lastname"] = flight[["passenger_lastname"]].applymap(clean_first_name, eps=0)

## 1.3. Date of birth

Date of birth have different formats, we will convert them to the same format. The type of format are the following: 
    01.Sep.1990
    1 September 90
    1-Sept-1990
We will convert them to the following format: YYYY-MM-DD

In [8]:
%%ag
import datetime 
def standardize_date(date_str: str) -> str:
    """
    Converts dates from various formats to a standardized 'YYYY-MM-DD' format.

    Supported formats:
    - '01.Sep.1990'
    - '1 September 90'
    - '1-Sept-1990'

    :param date_str: A string representing a date.
    :return: The date in 'YYYY-MM-DD' format
    """
    # Replace various separators with a standard one (space)
    for separator in ['.', '/', '-']:
        date_str = date_str.replace(separator, ' ')
    date_parts = date_str.split()  # Splitting the date into parts
    day = date_parts[0]   # day is always first
    month_str = date_parts[1] # month is always second
    year = date_parts[2]  # year is always third
    
    # Correct two-digit year if necessary
    if len(year) == 2:
        year = '20' + year if int(year) <= 23 else '19' + year
    
    # Transform month into a number
    month_str = month_str[:3]
    month = datetime.datetime.strptime(month_str, "%b").month
    month_formatted = f"{month:02d}"  # Ensuring two-digit format
    
    # Adjust day if necessary
    day_formatted = f"{int(day):02d}"  # Ensuring two-digit format
        
    # Reconstruct the date string in the identified format
    output_str = f"{year}-{month_formatted}-{day_formatted}"
    return output_str

In [9]:
%%ag
health["patient_date_of_birth"] = health[["patient_date_of_birth"]].applymap(clean_first_name, eps=0)
flight["passenger_date_of_birth"] = flight[["passenger_date_of_birth"]].applymap(clean_first_name, eps=0)

## Total positive cases

In [10]:
%%ag
ag_print(f"Total covid positive health records : {health['patient_firstname'].count(eps=0.1)}")

Total covid positive health records : 3248


# 2. Setting Indexing Rule

In [13]:
%%ag
import op_recordlinkage as rl
# Create an indexer and block on the date of birth
indexer = rl.Index()
indexer.block('passenger_date_of_birth','patient_date_of_birth')
candidate_links = indexer.index(flight, health)
# total number of links based on this indexing choice.
ag_print("Number of candidate links", candidate_links.count(eps=0.1))

Number of candidate links 589671


The number of candidate is too high, we will reduce the number of candidate links by blocking on and last name as well.

In [14]:
%%ag
import op_recordlinkage as rl
# Create an indexer and block on the date of birth
indexer = rl.Index()
indexer.block(['passenger_date_of_birth', "passenger_lastname"], 
              ['patient_date_of_birth', "patient_lastname"])
candidate_links = indexer.index(flight, health)
# total number of links based on this indexing choice.
ag_print("Number of candidate links", candidate_links.count(eps=0.1))

Number of candidate links 11747


The number of candidate links is reduced by a factor of 10, this is sufficient to continue

# 3. Comparing the records

Once the candidate links are formed , we can set compare rules against them to refine our linking process. In these rules , we can set a weight for each compare rule that we define.
  - Lets fuzzy match the firstnames ( using default value of either 1 or 0 )
  - Allow links on the positive covid result happening within two weeks (14 days before the flight or 14 days after the flight) ( with weight = 2 )

In [15]:
%%ag
import datetime 

comparer = rl.Compare()

# Using inbuilt string linking via fuzzy match. ( keeping threshold for last_name to get stronger links )
comparer.string("passenger_firstname" , "patient_firstname" ,method='jarowinkler', label="firstname")

# Using a custom compare rule.

def cmp(date_str1: str , date_str2: str ) -> int:
    # Convert date strings to datetime objects
    date1 = datetime.datetime.strptime(date_str1, "%Y-%m-%d")
    date2 = datetime.datetime.strptime(date_str2, "%Y-%m-%d")
    
    # Calculate the absolute difference in days
    days_apart = (date2 - date1).days
    # Check if the dates are within two weeks (14 days) apart
    if -14 <= days_apart <= 14:
        return 2
    else:
        return 0

comparer.custom(cmp, "flight_date", "covidtest_date", label="date_cmp")

In [16]:
%%ag
features = comparer.compute(candidate_links,flight,health)
# Lets find out the average matching weights obtained based on the compare rules we set.
ag_print(f"Average weight : {features.sum(axis=1).mean(eps=0.79)}")

Average weight : 1.7149788448174417


# 4. Linking the datasets

In [17]:
%%ag
linked_df = comparer.get_match(1.72)

# 5. Submission

In [18]:
%%ag
# Submitting the column containing the filtered set of airlines we should report regarding a covid passenger.
res = linked_df[["l_flight_number"]]
x = submit_predictions(res)

score: {'leaderboard': 0.584383717161067, 'logs': {'LIN_EPS': -0.0005, 'MCC': 0.5848837171610669}}
