In [1]:
import ast
import os
from collections import Counter

import pandas as pd

# Widen the console representation so wide frames are not wrapped.
pd.options.display.width = 1000



In [6]:
# Directory that holds the CSV files, and the full path of each file.
root = './data/'
(
    features_path,
    patient_notes_path,
    sample_submission_path,
    test_path,
    train_path,
) = (
    os.path.join(root, csv_name)
    for csv_name in (
        'features.csv',
        'patient_notes.csv',
        'sample_submission.csv',
        'test.csv',
        'train.csv',
    )
)

In [7]:
# Load the feature table; comma separator and first-row header are the
# read_csv defaults, so they are not spelled out.
features = pd.read_csv(features_path)
features

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded
...,...,...,...
138,912,9,Family-history-of-migraines
139,913,9,Female
140,914,9,Photophobia
141,915,9,No-known-illness-contacts


In [8]:
# Load the patient notes table; comma separator and first-row header are the
# read_csv defaults, so they are not spelled out.
patient_notes = pd.read_csv(patient_notes_path)
patient_notes

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...
...,...,...,...
42141,95330,9,Ms. Madden is a 20 yo female presenting w/ the...
42142,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143,95332,9,Ms. Madden is a 20yo female who presents with ...
42144,95333,9,Stephanie madden is a 20 year old woman compla...


In [9]:
# Load the sample submission; comma separator and first-row header are the
# read_csv defaults, so they are not spelled out.
sample_submission = pd.read_csv(sample_submission_path)
sample_submission

Unnamed: 0,id,location
0,00016_000,0 100
1,00016_001,
2,00016_002,200 250;300 400
3,00016_003,
4,00016_004,75 110


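# Intro
- `case_num`: an integer from 0 to 9 identifying a case; each feature and each patient note carries the `case_num` of the group it belongs to. (TODO: confirm what each case represents.)
- `pn_num`: the ID of a patient note in `patient_notes.csv`; the note text itself is stored in the `pn_history` column.
- `feature_num`: the ID of a feature in `features.csv`; the feature description is stored in the `feature_text` column.
- `location`: the character spans (`start end`) within `pn_history` that correspond to the annotation; several spans are separated by `;`, and the value is empty when nothing is annotated.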

In [73]:
import re
def df_string2list_of_ints(df_string: str) -> list:
    """Parse a stringified list of character spans into a list of int tuples.

    Each span is written as whitespace-separated integers (e.g. ``"70 91"``).
    Spans may be separated by ``,`` or ``;`` and may be wrapped in brackets and
    single quotes, as in ``"['70 91', '176 183']"`` or ``"200 250;300 400"``.

    Args:
        df_string: The raw cell value. ``None`` and NaN (what pandas yields for
            an empty CSV cell) are treated as "no spans".

    Returns:
        A list with one tuple of ints per span; empty when there are no spans.

    Raises:
        ValueError: If a span contains a token that is not an integer.
    """
    # NaN is the only float that is not equal to itself.
    if df_string is None or (isinstance(df_string, float) and df_string != df_string):
        return []

    df_string = df_string.strip("[]")
    if df_string == "":
        return []

    ranges = []
    for entry in re.split(",|;", df_string):
        # split() without arguments tolerates repeated whitespace between numbers.
        numbers = entry.strip(" '").split()
        if not numbers:
            # Skip empty entries produced by a trailing or doubled separator.
            continue
        ranges.append(tuple(int(number) for number in numbers))
    return ranges

In [83]:
# Preview the raw training labels before any merging or parsing.
pd.read_csv(train_path)

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724']
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693']
2,00016_002,0,16,2,['chest pressure'],['203 217']
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']"
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258']
...,...,...,...,...,...,...
14295,95333_912,9,95333,912,[],[]
14296,95333_913,9,95333,913,[],[]
14297,95333_914,9,95333,914,['photobia'],['274 282']
14298,95333_915,9,95333,915,['no sick contacts'],['421 437']


In [74]:
# Load the training labels and attach the feature and patient-note columns.
# validate='many_to_one' makes pandas raise a MergeError if a lookup table has
# duplicate join keys, instead of silently duplicating training rows.
train = pd.read_csv(train_path, sep=',', header=0)
train = train.merge(features, on=['feature_num', 'case_num'], how='left', validate='many_to_one')
train = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left', validate='many_to_one')

# Turn the stringified span lists into lists of int tuples.
train["location"] = train["location"].apply(df_string2list_of_ints)

train

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,['dad with recent heart attcak'],"[(696, 724)]",Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"['mom with ""thyroid disease']","[(668, 693)]",Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,['chest pressure'],"[(203, 217)]",Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","[(70, 91), (176, 183)]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,['felt as if he were going to pass out'],"[(222, 258)]",Lightheaded,HPI: 17yo M presents with palpitations. Patien...
...,...,...,...,...,...,...,...,...
14295,95333_912,9,95333,912,[],[],Family-history-of-migraines,Stephanie madden is a 20 year old woman compla...
14296,95333_913,9,95333,913,[],[],Female,Stephanie madden is a 20 year old woman compla...
14297,95333_914,9,95333,914,['photobia'],"[(274, 282)]",Photophobia,Stephanie madden is a 20 year old woman compla...
14298,95333_915,9,95333,915,['no sick contacts'],"[(421, 437)]",No-known-illness-contacts,Stephanie madden is a 20 year old woman compla...


In [89]:
# Display the feature_num column of the training frame.
train["feature_num"]

0          0
1          1
2          2
3          3
4          4
        ... 
14295    912
14296    913
14297    914
14298    915
14299    916
Name: feature_num, Length: 14300, dtype: int64

In [82]:
# Despite its name, this holds the fraction (0..1) of rows whose location list is empty.
num_rows_without_location = train["location"].map(len).eq(0).sum() / len(train)
print(f" Percentage of rows without locations: {round(100 * num_rows_without_location)}%")

# Show the rows that have no location.
train[train["location"].map(len).eq(0)]

 Percentage of rows without locations: 31%


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
5,00016_005,0,16,5,[],[],No-hair-changes-OR-no-nail-changes-OR-no-tempe...,HPI: 17yo M presents with palpitations. Patien...
7,00016_007,0,16,7,[],[],Shortness-of-breath,HPI: 17yo M presents with palpitations. Patien...
8,00016_008,0,16,8,[],[],Caffeine-use,HPI: 17yo M presents with palpitations. Patien...
13,00041_000,0,41,0,[],[],Family-history-of-MI-OR-Family-history-of-myoc...,17 Y/O M CAME TO THE CLINIC C/O HEART POUNDING...
17,00041_004,0,41,4,[],[],Lightheaded,17 Y/O M CAME TO THE CLINIC C/O HEART POUNDING...
...,...,...,...,...,...,...,...,...
14290,95333_907,9,95333,907,[],[],No-rash,Stephanie madden is a 20 year old woman compla...
14292,95333_909,9,95333,909,[],[],viral-symptoms-OR-rhinorrhea-OR-scratchy-throat,Stephanie madden is a 20 year old woman compla...
14294,95333_911,9,95333,911,[],[],Meningococcal-vaccine-status-unknown,Stephanie madden is a 20 year old woman compla...
14295,95333_912,9,95333,912,[],[],Family-history-of-migraines,Stephanie madden is a 20 year old woman compla...


In [None]:
# Rows that have at least one location span. A Series of lists is not a valid
# boolean mask, so the filter is built explicitly from the list lengths.
train[train["location"].map(len) > 0]

In [52]:
# The annotation column holds stringified Python lists. ast.literal_eval only
# accepts literals, whereas eval would execute arbitrary code read from the CSV.
print(train['annotation'][0])
print(ast.literal_eval(train['annotation'][0])[0])

['dad with recent heart attcak']
dad with recent heart attcak


In [61]:
# Locate the first annotation of row 0 inside its patient note and compare the
# resulting character span with the stored location.
# ast.literal_eval parses the stringified list without executing code from the CSV.
first_annotation = ast.literal_eval(train['annotation'][0])[0]
# NOTE: str.find returns -1 when the text is absent, which would make the span meaningless.
start_idx = train['pn_history'][0].find(first_annotation)
end_idx = start_idx + len(first_annotation)
print(start_idx, end_idx)
print(train['location'].head(1))

696 724
0    ['696 724']
Name: location, dtype: object


In [34]:
# Display the patient notes table.
patient_notes

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...
...,...,...,...
42141,95330,9,Ms. Madden is a 20 yo female presenting w/ the...
42142,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143,95332,9,Ms. Madden is a 20yo female who presents with ...
42144,95333,9,Stephanie madden is a 20 year old woman compla...


In [37]:
# Load the test rows and attach the feature and patient-note columns.
# validate='many_to_one' makes pandas raise a MergeError if a lookup table has
# duplicate join keys, instead of silently duplicating test rows.
test = pd.read_csv(test_path, sep=',', header=0)
# (feature_num, case_num) is the composite key that joins test to features.
test = test.merge(features, on=['feature_num', 'case_num'], how='left', validate='many_to_one')
# (case_num, pn_num) is the composite key that joins test to patient_notes.
test = test.merge(patient_notes, on=['case_num', 'pn_num'], how='left', validate='many_to_one')
test

Unnamed: 0,id,case_num,pn_num,feature_num,feature_text,pn_history
0,00016_000,0,16,0,Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,Lightheaded,HPI: 17yo M presents with palpitations. Patien...
