In [1]:
!pip install antigranular



In [2]:
import antigranular as ag
session = ag.login(<client_id>,<client_secret>, competition = "Harvard OpenDP Hackathon")

Dataset "Flight Company Dataset" loaded to the kernel as [92mflight_company_dataset[0m

Dataset "Health Organisation Dataset" loaded to the kernel as [92mhealth_organisation_dataset[0m

Connected to Antigranular server session id: 586a33b1-a3e0-438e-ba5e-01a0d376aa90, the session will time out if idle for 25 minutes
Cell magic '%%ag' registered successfully, use `%%ag` in a notebook cell to execute your python code on Antigranular private python server
🚀 Everything's set up and ready to roll!


In [3]:
%%ag
health = health_organisation_dataset
flight = flight_company_dataset

## Basic metadata analysis
Estimate the size of the datasets , the column names and their metadatas.

In [4]:
%%ag
# Printing the column names using ag_print
ag_print(f"Flight Dataset \n {flight.columns} \n")
ag_print(f"Health Datset \n {health.columns}")

Flight Dataset 
 ['flight_number', 'flight_date', 'flight_from', 'flight_to', 'passenger_firstname', 'passenger_lastname', 'passenger_date_of_birth', 'passenger_phone_number', 'passenger_email_address'] 

Health Datset 
 ['patient_firstname', 'patient_lastname', 'patient_date_of_birth', 'patient_phone_number', 'patient_email_address', 'covidtest_date', 'covidtest_result', 'patient_address']



In [None]:
%%ag
# Getting the differentially private count
ag_print(f"Total health records : {health['patient_firstname'].count(eps=0.1)}")
ag_print(f"Total flight records :  {flight['passenger_firstname'].count(eps=0.1)}")

Total health records : 71705

Total flight records :  85237



To filter out those flights which may contain passenger which was reported covid 19 recently , we will need to link both datasets in an efficient way. One way to do it is by using the `recordlinkage` library.

In [5]:
%%ag
# Lets remove those passenger records who tested negative.
health['covidtest_result'] = health['covidtest_result'].where(health['covidtest_result'] == 'positive')
health = health.dropna()

Sample visualization of how the filtered health data looks
![](https://content.antigranular.com/image/notebook_content/health_dataset_rlsand.png)

In [None]:
%%ag
ag_print(f"Total covid positive health records : {health['patient_firstname'].count(eps=0.1)}")

Total covid positive health records : 3236



In [23]:
%%ag
import re
def extract_month_number(month_str: str) -> str:

    months_mapping = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
        'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
        'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05',
        'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10',
        'November': '11', 'December': '12',
        'Sept': '09', 'Sep': '09',
    }
    if(months_mapping.get(month_str)):
      return ""
    return ""

def parseDate(date_str: str) ->str:
    months = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
        'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
        'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05',
        'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10',
        'November': '11', 'December': '12',
        'Sept': '09', 'Sep': '09'
    }

    separators = [' ', '/', '.', '-']

    for separator in separators:
        parts = date_str.split(separator)
        if len(parts) == 3:
            month = months.get(parts[0])
            if month:
                year = parts[2]
                if len(year) == 2:
                    if int(year) < 21:
                        year = '20' + year
                    else:
                        year = '19' + year
                day = int(parts[1])
                if (1 <= day <= 31 ):
                  return f"{year}-{month}-{parts[1].zfill(2)}"

    for separator in separators:
        parts = date_str.split(separator)
        if len(parts) == 3:
            month = months.get(parts[0])
            if month:
                year = parts[1]
                if len(year) == 2:
                    if int(year) < 21:
                        year = '20' + year
                    else:
                        year = '19' + year
                day = int(parts[2])
                if (1 <= day <= 31 ):
                  return f"{year}-{month}-{parts[2].zfill(2)}"

    for separator in separators:
        parts = date_str.split(separator)
        if len(parts) == 3:
            month = months.get(parts[1])
            if month:
                year = parts[2]
                if len(year) == 2:
                    if int(year) < 21:
                        year = '20' + year
                    else:
                        year = '19' + year
                day = int(parts[0])
                if (1 <= day <= 31 ):
                  return f"{year}-{month}-{parts[0].zfill(2)}"

    for separator in separators:
        parts = date_str.split(separator)
        if len(parts) == 3:
            month = months.get(parts[1])
            if month:
                year = parts[0]
                if len(year) == 2:
                    if int(year) < 21:
                        year = '20' + year
                    else:
                        year = '19' + year
                day = int(parts[2])
                if (1 <= day <= 31 ):
                  return f"{year}-{month}-{parts[2].zfill(2)}"

    for separator in separators:
        parts = date_str.split(separator)
        if len(parts) == 3:
            month = months.get(parts[2])
            if month:
                year = parts[0]
                if len(year) == 2:
                    if int(year) < 21:
                        year = '20' + year
                    else:
                        year = '19' + year
                day = int(parts[1])
                if (1 <= day <= 31 ):
                  return f"{year}-{month}-{parts[1].zfill(2)}"

    for separator in separators:
        parts = date_str.split(separator)
        if len(parts) == 3:
            month = months.get(parts[2])
            if month:
                year = parts[1]
                if len(year) == 2:
                    if int(year) < 21:
                        year = '20' + year
                    else:
                        year = '19' + year
                day = int(parts[0])
                if (1 <= day <= 31 ):
                  return f"{year}-{month}-{parts[0].zfill(2)}"

    for separator in separators:
        parts = date_str.split(separator)
        if len(parts) == 3:
            month = parts[0]
            if 1 <= int(month) <= 12:
                year = parts[2]
                if len(year) == 2:
                    if int(year) < 21:
                        year = '20' + year
                    else:
                        year = '19' + year
                day = int(parts[1])
                if (1 <= day <= 31 ):
                  return f"{year}-{month}-{parts[1].zfill(2)}"

    for separator in separators:
        parts = date_str.split(separator)
        if len(parts) == 3:
            month = parts[0]
            if 1 <= int(month) <= 12:
                year = parts[1]
                if len(year) == 2:
                    if int(year) < 21:
                        year = '20' + year
                    else:
                        year = '19' + year
                day = int(parts[2])
                if (1 <= day <= 31 ):
                  return f"{year}-{month}-{parts[2].zfill(2)}"

    for separator in separators:
        parts = date_str.split(separator)
        if len(parts) == 3:
            month = parts[1]
            if 1 <= int(month) <= 12:
                year = parts[2]
                if len(year) == 2:
                    if int(year) < 21:
                        year = '20' + year
                    else:
                        year = '19' + year
                day = int(parts[0])
                if (1 <= day <= 31 ):
                  return f"{year}-{month}-{parts[0].zfill(2)}"

    for separator in separators:
        parts = date_str.split(separator)
        if len(parts) == 3:
            month = parts[1]
            if 1 <= int(month) <= 12:
                year = parts[0]
                if len(year) == 2:
                    if int(year) < 21:
                        year = '20' + year
                    else:
                        year = '19' + year
                day = int(parts[2])
                if (1 <= day <= 31 ):
                  return f"{year}-{month}-{parts[2].zfill(2)}"

    for separator in separators:
        parts = date_str.split(separator)
        if len(parts) == 3:
            month = parts[2]
            if 1 <= int(month) <= 12:
                year = parts[0]
                if len(year) == 2:
                    if int(year) < 21:
                        year = '20' + year
                    else:
                        year = '19' + year
                day = int(parts[1])
                if (1 <= day <= 31 ):
                  return f"{year}-{month}-{parts[1].zfill(2)}"

    for separator in separators:
        parts = date_str.split(separator)
        if len(parts) == 3:
            month = parts[2]
            if 1 <= int(month) <= 12:
                year = parts[1]
                if len(year) == 2:
                    if int(year) < 21:
                        year = '20' + year
                    else:
                        year = '19' + year
                day = int(parts[0])
                if (1 <= day <= 31 ):
                  return f"{year}-{month}-{parts[0].zfill(2)}"

    return ""

flight['passenger_date_of_birth'] = flight['passenger_date_of_birth'].map(parseDate)
health['patient_date_of_birth'] = health['patient_date_of_birth'].map(parseDate)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df[key] = value._series



### Setting Indexing Rules

> Indented block



When using the `recordlinkage` library , make sure you index both datasets against a column which you will might be the most similar in nature. If you do not index both datasets on a similar column , then the unique MultiIndexes generated can be of very high order.

In [7]:
%%ag
import op_recordlinkage as rl
# A full indexing is a complete cartesian product
indexer  = rl.Index()
indexer.full()




Lets index both datasets based on the date of birth, phone_number and email_address .


In [8]:
%%ag
import op_recordlinkage as rl
indexer = rl.Index()
indexer.block('passenger_date_of_birth','patient_date_of_birth')
candidate_links = indexer.index(flight,health)

In [None]:
%%ag
# total number of links based on this indexing choice.
ag_print(candidate_links.count(eps=0.1))

34310



### Setting Comparison Rules

Once the candidate links are formed , we can set compare rules against them to refine our linking process. In these rules , we can set a weight for each compare rule that we define.
  - Lets fuzzy match the firstnames and lastnames(thresholded) of the passenger. ( using default value of either 1 or 0 )
  - Allow links on the positive covid result happening atleast 14 before flight departure. ( with weight = 2 )

**click [here](https://recordlinkage.readthedocs.io/en/latest/ref-compare.html#recordlinkage.Compare) to learn about more compare rules.**

We currently support:
- String
- Numeric
- Exact
- Geo
- Date
- Custom Compares
    
    

In [12]:
%%ag
compare = rl.Compare()

import datetime

def cmp_date(date_str1:str , date_str2:str)->int: # datetime and regex are pre-imported in isolated environment.
    # Convert date strings to datetime objects
    date1 = datetime.datetime.strptime(date_str1, "%Y-%m-%d")
    date2 = datetime.datetime.strptime(date_str2, "%Y-%m-%d")

    # Calculate the absolute difference in days
    days_apart = (date2 - date1).days
    # Check if the dates are within two weeks (14 days) apart
    if -14 <= days_apart <= 14:
        return 2
    else:
        return 0

def cmp2(date_str1:str , date_str2:str)->int:
    # Convert date strings to datetime objects
    date1 = datetime.datetime.strptime(date_str1, "%Y-%m-%d")
    date2 = datetime.datetime.strptime(date_str2, "%Y-%m-%d")

    if(date1==date2):
      return 1
    return 0

import re
def cmp3(name1:str,name2:str)->int:

  name1 = re.sub(r"([a-z])([A-Z])", r"\1 \2", name1)
  name2 = re.sub(r"([a-z])([A-Z])", r"\1 \2", name2)

  if(name1==name2):
    return 1

  name1_parts = name1.split()
  name2_parts = name2.split()

  first_name1 = name1_parts[0]
  first_name2 = name2_parts[0]

  if len(name1_parts) > 1 and len(name2_parts) > 1:
    second_part_name1 = name1_parts[1]
    second_part_name2 = name2_parts[1]
    if second_part_name1 == second_part_name2 and first_name1==first_name2:
        return 1
    return 0

  if(first_name1==first_name2):
    return 1

  return 0

def cmp4(name1: str, name2: str) -> int:

    if name1==name2:
      return 1
    return 0

compare.custom(cmp_date,"flight_date","covidtest_date",label="date_cmp")
compare.custom(cmp2,"passenger_date_of_birth","patient_date_of_birth",label="dateofbirth")
compare.custom(cmp3,"passenger_firstname","patient_firstname",label="firstname")
compare.custom(cmp4,"passenger_lastname","patient_lastname",label="lastname")

In [10]:
%%ag
features = compare.compute(candidate_links,flight,health)

In [11]:
%%ag
linked_df = compare.get_match(5)

Sample Visualization of the linked PrivateDataFrame
![](https://content.antigranular.com/image/notebook_content/rl_linked_rlsand.png)

In [None]:
%%ag
# Submitting the column containing the filtered set of airlines we should report regarding a covid passenger.
res = linked_df[["l_flight_number"]]
x = submit_predictions(res)

score: {'leaderboard': 0.9479466659121429, 'logs': {'LIN_EPS': -0.0005, 'MCC': 0.9484466659121429}}



In [None]:
session.terminate_session()

{'status': 'ok'}