In [1]:
# ====================================================
# Library
# ====================================================
import os
import ast
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
# Raise pandas' display limits so wide/long frames are shown with less truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [2]:
# Mount Google Drive at /content/drive for this Colab runtime.
# NOTE(review): later cells use paths relative to './drive/...', so they presumably
# rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# ====================================================
# Data Loading
# ====================================================
# Folder holding the competition CSV files.
INPUT_DIR = './drive/MyDrive/Colab Notebooks/NBME/input/nbme-score-clinical-patient-notes/'

# `annotation` and `location` are stored in the CSV as Python-literal strings;
# parse them back into Python objects right after loading.
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv')).assign(
    annotation=lambda frame: frame['annotation'].map(ast.literal_eval),
    location=lambda frame: frame['location'].map(ast.literal_eval),
)

# Feature catalogue; patched by preprocess_features further down in this cell.
features = pd.read_csv(os.path.join(INPUT_DIR, 'features.csv'))
def preprocess_features(features):
    """Overwrite the ``feature_text`` value of the row labelled 27.

    The frame is modified in place and the very same object is returned, so
    both ``preprocess_features(df)`` and ``df = preprocess_features(df)`` work.

    Args:
        features: DataFrame holding a ``feature_text`` column. The row is
            addressed by index label (27), not by position.

    Returns:
        The input DataFrame. If a row labelled 27 exists, its ``feature_text``
        is set to ``"Last-Pap-smear-1-year-ago"``; otherwise the frame is
        returned untouched.
    """
    # NOTE(review): presumably this corrects a typo in the raw features.csv — confirm.
    # Assigning through .loc to a label that is absent would append a new,
    # mostly-NaN row ("setting with enlargement") and upcast integer columns,
    # so only patch a row that is really there.
    if 27 in features.index:
        features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
    return features
# Apply the feature-text patch, then read the free-text patient notes.
features = preprocess_features(features)
patient_notes = pd.read_csv(os.path.join(INPUT_DIR, 'patient_notes.csv'))

# For each table: print its dimensions, then show its first rows.
for _label, _frame in (
    ('train', train),
    ('features', features),
    ('patient_notes', patient_notes),
):
    print(f"{_label}.shape: {_frame.shape}")
    display(_frame.head())

train.shape: (14300, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


features.shape: (143, 3)


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


patient_notes.shape: (42146, 3)


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [4]:
# Distinct note ids and distinct feature ids, as arrays.
pn_unique = pd.unique(patient_notes['pn_num'])
feature_unique = pd.unique(features['feature_num'])

In [5]:
# One full copy of the notes table per feature id, each tagged with that id.
# Every note is paired with every feature here, regardless of case_num.
pseudo_list = [
    patient_notes.assign(feature_num=features_idx)
    for features_idx in feature_unique
]

In [6]:
# Stack the per-feature copies row-wise into one frame (row labels are not renumbered),
# then show the frame followed by its shape.
pseudo_df = pd.concat(pseudo_list, axis=0)
display(pseudo_df, pseudo_df.shape)

Unnamed: 0,pn_num,case_num,pn_history,feature_num
0,0,0,"17-year-old male, has come to the student heal...",0
1,1,0,17 yo male with recurrent palpitations for the...,0
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...,0
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...,0
4,4,0,17yo male with no pmh here for evaluation of p...,0
...,...,...,...,...
42141,95330,9,Ms. Madden is a 20 yo female presenting w/ the...,916
42142,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...,916
42143,95332,9,Ms. Madden is a 20yo female who presents with ...,916
42144,95333,9,Stephanie madden is a 20 year old woman compla...,916


(6026878, 4)

In [7]:
# Bring in the columns of `features` for rows whose (feature_num, case_num) pair
# exists there; the left join keeps unmatched rows with missing values instead.
pseudo_df = pd.merge(
    pseudo_df,
    features,
    how='left',
    on=['feature_num', 'case_num'],
)
display(pseudo_df.shape)

(6026878, 5)

In [8]:
# A feature that does not belong to the note's case has a null feature_text
# after the join, so those rows are removed.
# Note: a null in any column removes the row, not only a null feature_text.
pseudo_df = pseudo_df[pseudo_df.notna().all(axis=1)].reset_index(drop=True)
display(pseudo_df.shape)

(626902, 5)

In [9]:
# Row id = pn_num left-padded with zeros to 5 characters + '_' + feature_num
# left-padded to 3 characters (the same layout as the ids shown for train).
pseudo_df['id'] = [
    f"{note_num:0>5}_{feat_num:0>3}"
    for note_num, feat_num in zip(pseudo_df['pn_num'], pseudo_df['feature_num'])
]

In [10]:
# Keep the output columns in a fixed order, sort rows by note, then feature,
# then case, and renumber the index from 0.
pseudo_df = (
    pseudo_df
    .loc[:, ['id', 'case_num', 'pn_num', 'feature_num', 'feature_text', 'pn_history']]
    .sort_values(by=['pn_num', 'feature_num', 'case_num'], ignore_index=True)
)

In [11]:
# Plain Python lists of the ids present in train and in the candidate frame.
train_ids = list(train['id'])
pseudo_ids = list(pseudo_df['id'])

In [12]:
# Remove every (note, feature) pair whose id already appears in train.
# The shape is shown before and after so the drop can be checked against the
# size of train (14300 rows in the recorded run).
display(pseudo_df.shape)
pseudo_df = (
    pseudo_df
    .loc[lambda frame: ~frame['id'].isin(train_ids)]
    .reset_index(drop=True)
)
display(pseudo_df.shape)

(626902, 6)

(612602, 6)

In [13]:
# Show the final unlabeled (note, feature) pairs, then persist them as a pickle.
display(pseudo_df)

# to_pickle does not create missing parent folders and would raise after the
# whole frame has been built, so make sure the output folder exists first.
os.makedirs('./drive/MyDrive/Colab Notebooks/NBME/pseudo', exist_ok=True)
pseudo_df.to_pickle('./drive/MyDrive/Colab Notebooks/NBME/pseudo/pseudo_plain.pkl')

Unnamed: 0,id,case_num,pn_num,feature_num,feature_text,pn_history
0,00000_000,0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...,"17-year-old male, has come to the student heal..."
1,00000_001,0,0,1,Family-history-of-thyroid-disorder,"17-year-old male, has come to the student heal..."
2,00000_002,0,0,2,Chest-pressure,"17-year-old male, has come to the student heal..."
3,00000_003,0,0,3,Intermittent-symptoms,"17-year-old male, has come to the student heal..."
4,00000_004,0,0,4,Lightheaded,"17-year-old male, has come to the student heal..."
...,...,...,...,...,...,...
612597,95334_912,9,95334,912,Family-history-of-migraines,patient is a 20 yo F who presents with a heada...
612598,95334_913,9,95334,913,Female,patient is a 20 yo F who presents with a heada...
612599,95334_914,9,95334,914,Photophobia,patient is a 20 yo F who presents with a heada...
612600,95334_915,9,95334,915,No-known-illness-contacts,patient is a 20 yo F who presents with a heada...
