In [2]:
import ast
import os
from collections import Counter

import pandas as pd

# Raise pandas' display width (in characters) so wide frames are not wrapped when printed.
pd.set_option('display.width', 1000)



In [3]:
# Directory that holds the competition CSV files, relative to this notebook.
root = '../../Data/nbme-score-clinical-patient-notes/'

# Full path of each CSV file under `root`, unpacked in the same order as the file names.
(
    features_path,
    patient_notes_path,
    sample_submission_path,
    test_path,
    train_path,
) = (
    os.path.join(root, csv_name)
    for csv_name in (
        'features.csv',
        'patient_notes.csv',
        'sample_submission.csv',
        'test.csv',
        'train.csv',
    )
)

In [4]:
# Load features.csv (comma-separated, first row is the header) and display the frame.
features = pd.read_csv(features_path, sep=',', header=0)
features

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded
...,...,...,...
138,912,9,Family-history-of-migraines
139,913,9,Female
140,914,9,Photophobia
141,915,9,No-known-illness-contacts


In [5]:
# Load patient_notes.csv (comma-separated, first row is the header) and display the frame.
patient_notes = pd.read_csv(patient_notes_path, sep=',', header=0)
patient_notes

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...
...,...,...,...
42141,95330,9,Ms. Madden is a 20 yo female presenting w/ the...
42142,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143,95332,9,Ms. Madden is a 20yo female who presents with ...
42144,95333,9,Stephanie madden is a 20 year old woman compla...


In [6]:
# Load sample_submission.csv (comma-separated, first row is the header) and display the frame.
sample_submission = pd.read_csv(sample_submission_path, sep=',', header=0)
sample_submission

Unnamed: 0,id,location
0,00016_000,0 100
1,00016_001,
2,00016_002,200 250;300 400
3,00016_003,
4,00016_004,75 110


# Intro
- `case_num`: 0–9; identifies the clinical case (group) that a note or a feature belongs to.
- `pn_num`: the ID of a patient note in `patient_notes.csv`; the text of the note is in `pn_history`.
- `feature_num`: the ID of a feature in `features.csv`; the description of the feature is in `feature_text`.
- `location`: the character span(s), written as `start end`, of each `annotation` within `pn_history`.

In [7]:
# Build the training table: the rows of train.csv enriched with the feature
# description (from `features`) and the full note text (from `patient_notes`).
train = pd.read_csv(train_path, sep=',', header=0)
# `validate='many_to_one'` makes pandas raise a MergeError if the lookup table has
# duplicate join keys, instead of silently duplicating training rows in the left join.
train = train.merge(features, on=['feature_num', 'case_num'], how='left', validate='many_to_one')
train = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left', validate='many_to_one')
train

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,['chest pressure'],['203 217'],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],Lightheaded,HPI: 17yo M presents with palpitations. Patien...
...,...,...,...,...,...,...,...,...
14295,95333_912,9,95333,912,[],[],Family-history-of-migraines,Stephanie madden is a 20 year old woman compla...
14296,95333_913,9,95333,913,[],[],Female,Stephanie madden is a 20 year old woman compla...
14297,95333_914,9,95333,914,['photobia'],['274 282'],Photophobia,Stephanie madden is a 20 year old woman compla...
14298,95333_915,9,95333,915,['no sick contacts'],['421 437'],No-known-illness-contacts,Stephanie madden is a 20 year old woman compla...


In [8]:
# `annotation` is stored as the string form of a Python list, so it has to be parsed
# before indexing. `ast.literal_eval` only accepts literals, unlike `eval`, which
# would execute arbitrary code found in the CSV.
print(train['annotation'][0])
print(ast.literal_eval(train['annotation'][0])[0])

['dad with recent heart attcak']
dad with recent heart attcak


In [9]:
# Check that `location` is the character span of the annotation inside the note:
# locate the first annotation of row 0 in its note text and compare with `location`.
# The stringified list is parsed once with `ast.literal_eval` (safe, literals only).
first_annotation = ast.literal_eval(train['annotation'][0])[0]
start_idx = train['pn_history'][0].find(first_annotation)
# `str.find` returns -1 when the text is absent, which would yield a bogus span.
assert start_idx != -1, 'annotation text not found in pn_history'
end_idx = start_idx + len(first_annotation)
print(start_idx, end_idx)
print(train['location'].head(1))

696 724
0    ['696 724']
Name: location, dtype: object


In [10]:
# Display the patient notes frame again for reference.
patient_notes

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...
...,...,...,...
42141,95330,9,Ms. Madden is a 20 yo female presenting w/ the...
42142,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143,95332,9,Ms. Madden is a 20yo female who presents with ...
42144,95333,9,Stephanie madden is a 20 year old woman compla...


# So, (annotation, location) is the target we need to predict
- We need to predict the `annotation` and its `location` from the `feature_text` and the `pn_history`.

In [11]:
# Build the test table: the rows of test.csv enriched with the feature description
# and the full note text.
test = pd.read_csv(test_path, sep=',', header=0)
# Join on the composite key (feature_num, case_num), then on (case_num, pn_num).
# `validate='many_to_one'` makes pandas raise a MergeError if the lookup table has
# duplicate join keys, instead of silently duplicating test rows in the left join.
test = test.merge(features, on=['feature_num', 'case_num'], how='left', validate='many_to_one')
test = test.merge(patient_notes, on=['case_num', 'pn_num'], how='left', validate='many_to_one')
test

Unnamed: 0,id,case_num,pn_num,feature_num,feature_text,pn_history
0,00016_000,0,16,0,Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,Lightheaded,HPI: 17yo M presents with palpitations. Patien...


# 2. Understand Submission.csv
- `case_num`: 0–9; each value identifies one clinical case (group). The cells below count the training rows per case and per feature.

In [26]:
# Group the training rows by clinical case and report how many rows each case has.
train_case_num_group = train.groupby(train['case_num'])

for idx, (case_num, each_case) in enumerate(train_case_num_group):
    # `idx` is the position of the group, `case_num` is its key.
    print(f'idx: {idx} \t case_num: {case_num} \t len: {len(each_case)}')

idx: 0 	 case_num: 0 	 len: 1300
idx: 1 	 case_num: 1 	 len: 1300
idx: 2 	 case_num: 2 	 len: 1700
idx: 3 	 case_num: 3 	 len: 1600
idx: 4 	 case_num: 4 	 len: 1000
idx: 5 	 case_num: 5 	 len: 1800
idx: 6 	 case_num: 6 	 len: 1200
idx: 7 	 case_num: 7 	 len: 900
idx: 8 	 case_num: 8 	 len: 1800
idx: 9 	 case_num: 9 	 len: 1700


In [36]:
# Group the training rows by feature and report, for each feature, how many rows
# it has and its description. No positional index is needed, so the groups are
# iterated directly rather than through `enumerate`.
train_feature_num_group = train.groupby(train['feature_num'])
for feature_num, each_feature in train_feature_num_group:
    # All rows of a group share the same feature, so the first `feature_text` names it.
    feature_name = each_feature['feature_text'].iloc[0]
    print(f'feature_num: {feature_num} \t len: {len(each_feature)} \tName: {feature_name}')

feature_num: 0 	 len: 100 	Name: Family-history-of-MI-OR-Family-history-of-myocardial-infarction
feature_num: 1 	 len: 100 	Name: Family-history-of-thyroid-disorder
feature_num: 2 	 len: 100 	Name: Chest-pressure
feature_num: 3 	 len: 100 	Name: Intermittent-symptoms
feature_num: 4 	 len: 100 	Name: Lightheaded
feature_num: 5 	 len: 100 	Name: No-hair-changes-OR-no-nail-changes-OR-no-temperature-intolerance
feature_num: 6 	 len: 100 	Name: Adderall-use
feature_num: 7 	 len: 100 	Name: Shortness-of-breath
feature_num: 8 	 len: 100 	Name: Caffeine-use
feature_num: 9 	 len: 100 	Name: heart-pounding-OR-heart-racing
feature_num: 10 	 len: 100 	Name: Few-months-duration
feature_num: 11 	 len: 100 	Name: 17-year
feature_num: 12 	 len: 100 	Name: Male
feature_num: 100 	 len: 100 	Name: No-vaginal-discharge
feature_num: 101 	 len: 100 	Name: Weight-loss
feature_num: 102 	 len: 100 	Name: Not-sexually-active
feature_num: 103 	 len: 100 	Name: Prior-episodes-of-diarrhea
feature_num: 104 	 len: 1

# Search for a specific patient

In [47]:
# Note number (`pn_num`) of the patient note to inspect.
PATIENT_IDX = 74087

# Training rows whose note number matches the selected patient note.
patient_df = train.loc[train["pn_num"] == PATIENT_IDX]

# Print the note text taken from the first matching row, then show the rows themselves.
print(f'patient_history: {patient_df["pn_history"].iloc[0]}')
patient_df

patient_history: Angela Tompkins is a 35 yo female presenting w/ abnormal menstraul cycles for the past 6 months. LMP was 2 months ago and she has only had 2 cycles in the past 5 months. She has had heavy flow during recent cycles and will last about 7 days. They used to only last 3-4 days and had regular intervals. She has hardly had any pain w/ her cycles. She used to take oral contraceptives but stopped eleven years ago to try and get pregnant. She has been unsuccessful and not been able to get pregnant despite having sexual intercourse with her boyfriend and trying "for years". She denies trying to become pregnant now and in the recent past. She has not restarted contraception since trying to become pregnant. She had her first pap smear 6 months ago that did not have any abnormal findings. She has regular obgyn follow up. Her aunt has breast cancer and her grandmother had cervical cancer. Family has not done any genetic testing.


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
10791,74087_700,7,74087,700,['female'],['27 33'],Female,Angela Tompkins is a 35 yo female presenting w...
10792,74087_701,7,74087,701,[],[],Weight-Gain,Angela Tompkins is a 35 yo female presenting w...
10793,74087_702,7,74087,702,"['only had 2 cycles in the past 5 months', 'he...","['130 168', '182 192', '218 240', '48 73']",heavy-periods-OR-irregular-periods,Angela Tompkins is a 35 yo female presenting w...
10794,74087_703,7,74087,703,['LMP was 2 months ago'],['97 117'],Last-menstrual-period-2-months-ago,Angela Tompkins is a 35 yo female presenting w...
10795,74087_704,7,74087,704,['has not restarted contraception'],['641 672'],Unprotected-Sex,Angela Tompkins is a 35 yo female presenting w...
10796,74087_705,7,74087,705,[],[],Fatigue,Angela Tompkins is a 35 yo female presenting w...
10797,74087_706,7,74087,706,['not been able to get pregnant'],['465 494'],Infertility-HX-OR-Infertility-history,Angela Tompkins is a 35 yo female presenting w...
10798,74087_707,7,74087,707,['35 yo'],['21 26'],35-year,Angela Tompkins is a 35 yo female presenting w...
10799,74087_708,7,74087,708,['6 months'],['87 95'],symptoms-for-6-months,Angela Tompkins is a 35 yo female presenting w...


In [75]:
# Number of rows selected for this patient note.
len(patient_df['location'])

9

In [51]:
# feature_num to look up. Written as a plain 0: Python accepts a zero-padded integer
# literal only when every digit is zero, so a padded ID such as `001` would be a
# SyntaxError. Use the unpadded integer (0, 1, 708, ...) here.
FEATURE_IDX = 0
features[features['feature_num'] == FEATURE_IDX]

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...


# What does the id in test.csv mean?
- `id`: 00016_000
  - 00016: `pn_num` in `patient_notes.csv`
  - 000: `feature_num` in `features.csv`


For example, if the test id is 74087_708:
- we need to find the annotation '6 months' in the `pn_history` text of the note in `patient_notes.csv` where `pn_num` = 74087
- we also need to take into account feature 708 in `features.csv`, whose `feature_text` is '`symptoms-for-6-months`'

In [65]:
# NOTE(review): TEST_PATIENT_IDX is never read in this cell; the first row of `test`
# is always the one inspected. It is kept in case a later cell refers to it.
TEST_PATIENT_IDX = 1

# Split the id of the first test row on '_' into its note part and its feature part.
test_id = test['id'].iloc[0].split('_')
test_note_id = test_id[0]
test_feature_id = test_id[1]
print(f'test_note_id: {test_note_id} \t test_feature_id: {test_feature_id}')

# find the patient note by patient note id (int() drops the zero padding)
test_patient_note = patient_notes[patient_notes['pn_num'] == int(test_note_id)].iloc[0]
# find the matching feature row(s) by feature id; the variable keeps its original
# (misspelled) name and DataFrame type so existing references still work
test_feature_nane = features[features['feature_num'] == int(test_feature_id)]

# Show both lookups so the cell prints the values it computes.
print(f"test_feature_nane: {test_feature_nane['feature_text'].iloc[0]}")
print(f'test_patient_note: {test_patient_note}')

test_note_id: 00016 	 test_feature_id: 000
test_feature_nane: Family-history-of-MI-OR-Family-history-of-myocardial-infarction
test_patient_note: pn_num                                                       16
case_num                                                      0
pn_history    HPI: 17yo M presents with palpitations. Patien...
Name: 16, dtype: object


In [66]:
import re
def df_string2list_of_ints(df_string: str) -> list:
    """Parse a stringified list of spans into a list of integer tuples.

    The input is the text form of a location list, e.g. "['696 724']",
    "['70 91', '176 183']" or "200 250;300 400". Entries are separated by
    ',' or ';', and each entry holds whitespace-separated integers.

    Args:
        df_string: The text to parse. ``None`` and float NaN (how pandas
            represents an empty CSV cell) are treated as "no spans".

    Returns:
        A list with one tuple of ints per entry, e.g. ``[(696, 724)]``;
        an empty list when there are no entries.

    Raises:
        ValueError: If an entry contains a token that is not an integer.
    """
    # Empty CSV cells arrive as None / NaN rather than as strings.
    if df_string is None or (isinstance(df_string, float) and df_string != df_string):
        return []

    # Drop surrounding whitespace first so the brackets are really at the ends.
    df_string = df_string.strip().strip("[]")

    ranges = []
    for entry in re.split(",|;", df_string):
        # Remove the padding and the quotes left over from the list's string form.
        entry = entry.strip(" \t'\"")
        if not entry:
            # Empty list, whitespace-only input, or a trailing separator.
            continue
        # split() with no argument tolerates repeated spaces between the numbers.
        ranges.append(tuple(int(num_as_str) for num_as_str in entry.split()))
    return ranges

In [67]:
# Sanity check: a single-span location string parses to one (start, end) tuple.
df_string2list_of_ints("['696 724']")


[(696, 724)]

In [72]:
# Exact-match lookup of the feature row(s) whose text is 'Intermittent'.
features[features['feature_text'] == 'Intermittent']

Unnamed: 0,feature_num,case_num,feature_text
55,312,3,Intermittent
