In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re
from tqdm import tqdm
import string
import torch
import os

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from dataset_processing import FewShotSampler, encode_classes

look at loading mednli dataset

In [2]:
data_dir = "/mnt/sdg/niallt/mimic_iii/processed/HADM_ID_split/"
save_dir = "/mnt/sdg/niallt/mimic_iii/processed/HADM_ID_split/pseudo_classification/"

In [3]:
train_df = pd.read_csv(f"{data_dir}/train_df_notes_interim_preprocessed.csv")
test_df = pd.read_csv(f"{data_dir}/test_df_notes_interim_preprocessed.csv")

  train_df = pd.read_csv(f"{data_dir}/train_df_notes_interim_preprocessed.csv")
  test_df = pd.read_csv(f"{data_dir}/test_df_notes_interim_preprocessed.csv")


In [4]:
train_df.dtypes

ROW_ID           int64
SUBJECT_ID       int64
HADM_ID        float64
CHARTDATE       object
CHARTTIME       object
STORETIME       object
CATEGORY        object
DESCRIPTION     object
CGID           float64
ISERROR        float64
TEXT            object
dtype: object

For the NOTE CATEGORY classification we just want to pull out the CATEGORY and TEXT

In [4]:
train_df_cat = train_df[["TEXT","CATEGORY"]]
test_df_cat =  test_df[["TEXT","CATEGORY"]]

In [6]:
train_df_cat.shape

(1079151, 2)

In [9]:
train_df_cat.CATEGORY.value_counts().index.tolist()

['Nursing/other',
 'Radiology',
 'Nursing',
 'Physician ',
 'Discharge summary',
 'Echo',
 'Respiratory ',
 'Nutrition',
 'General',
 'Rehab Services',
 'Pharmacy',
 'Consult']

In [22]:
train_df_cat.head()

Unnamed: 0,TEXT,CATEGORY,label
0,service: addendum: radiologic studies: radiolo...,Discharge summary,4
1,sex: f service: micu and then to medicine hist...,Discharge summary,4
2,sex: m service: medicine allergies: patient re...,Discharge summary,4
3,sex: f service: neurosurgery allergies: no kno...,Discharge summary,4
4,sex: m service: neurosurgery allergies: no kno...,Discharge summary,4


In [5]:
if not os.path.exists(f"{save_dir}"):
    os.makedirs(f"{save_dir}")   

In [44]:
train_df.head(50000).label.value_counts()

1    38324
2    11676
Name: label, dtype: int64

In [6]:
# take just the top N 
n_classes = 8
classes_to_keep = list(train_df.CATEGORY.value_counts().keys()[:n_classes])

In [18]:
train_df_cat.CATEGORY.value_counts()

Nursing/other        515874
Radiology            237373
Nursing              141452
Physician             89750
Discharge summary     38324
Echo                  21541
Respiratory           20102
Nutrition              5898
General                5344
Rehab Services         3360
Pharmacy                 69
Consult                  64
Name: CATEGORY, dtype: int64

In [40]:
classes_to_keep

['Nursing/other',
 'Radiology',
 'Nursing',
 'Physician ',
 'Discharge summary',
 'Echo',
 'Respiratory ',
 'Nutrition']

In [7]:
# now encode the labels - and sort by the value counts rather than string value - this well help keep ordering when subetting by class frequency

class_list, idx_to_class, class_to_idx = encode_classes(train_df_cat, label_col="CATEGORY", sort_by_value_count=True)

In [7]:
class_list

['Nursing/other',
 'Radiology',
 'Nursing',
 'Physician ',
 'Discharge summary',
 'Echo',
 'Respiratory ',
 'Nutrition',
 'General',
 'Rehab Services',
 'Pharmacy',
 'Consult']

In [8]:
train_df_cat['label'] = train_df_cat['CATEGORY'].map(class_to_idx)
test_df_cat['label'] = test_df_cat['CATEGORY'].map(class_to_idx)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_cat['label'] = train_df_cat['CATEGORY'].map(class_to_idx)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_cat['label'] = test_df_cat['CATEGORY'].map(class_to_idx)


In [9]:
train_df_cat.to_csv(f"{save_dir}/train.csv", index = False)
test_df_cat.to_csv(f"{save_dir}/test.csv", index = False)

In [19]:
classes_to_keep

['Nursing/other',
 'Radiology',
 'Nursing',
 'Physician ',
 'Discharge summary',
 'Echo',
 'Respiratory ',
 'Nutrition']

In [10]:
# get new DFs with only the top N classes in

subset_train_df = train_df_cat[train_df_cat['CATEGORY'].isin(classes_to_keep)]
subset_test_df = test_df_cat[test_df_cat['CATEGORY'].isin(classes_to_keep)]


In [23]:
subset_train_df.CATEGORY.value_counts()

Nursing/other        515874
Radiology            237373
Nursing              141452
Physician             89750
Discharge summary     38324
Echo                  21541
Respiratory           20102
Nutrition              5898
Name: CATEGORY, dtype: int64

In [25]:
subset_test_df.CATEGORY.value_counts()

Nursing/other        306623
Radiology            284906
Nursing               82104
Physician             51874
Echo                  24253
Discharge summary     21328
Respiratory           11637
Nutrition              3520
Name: CATEGORY, dtype: int64

In [11]:
final_train_df = subset_train_df.copy()

#### Creat val/test splits

In [34]:
final_test_df.shape

(393122, 3)

In [35]:
final_train_df.shape

(1070314, 3)

In [12]:
final_test_df, final_val_df = train_test_split(subset_test_df, test_size=0.5)

In [23]:
final_val_df.shape

(393123, 3)

### Create few shot datasets

here we want to create some balanced fewshot datasets - althoughi n reality we can also make just big balanced sub samples

In [17]:
few_shot_n = 500
label_col = "label"

In [18]:
# initialise the sampler
support_sampler = FewShotSampler(num_examples_per_label = few_shot_n, 
                                 also_sample_dev=False, 
                                 label_col = label_col)
# now apply to each dataframe but convert to dictionary in records form first
fewshot_train_df = support_sampler(final_train_df.to_dict(orient="records"), seed = 1)

# do we actually want to resample the val and test sets - probably not? 
fewshot_val_df = support_sampler(final_val_df.to_dict(orient="records"), seed = 1)
fewshot_test_df = support_sampler(final_test_df.to_dict(orient="records"), seed = 1)

In [19]:
fewshot_reduced_class_save_dir = f"{save_dir}/class_reduced_{n_classes}/fewshot_{few_shot_n}"
if not os.path.exists(fewshot_reduced_class_save_dir):
    os.makedirs(f"{fewshot_reduced_class_save_dir}")   

In [20]:
# now write to file
fewshot_train_df.to_csv(f"{fewshot_reduced_class_save_dir}/train.csv", index = False)
fewshot_val_df.to_csv(f"{fewshot_reduced_class_save_dir}/valid.csv", index = False)

fewshot_test_df.to_csv(f"{fewshot_reduced_class_save_dir}/test.csv", index = False)



In [21]:
fewshot_val_df.label.value_counts()

4    500
5    500
7    500
0    500
6    500
1    500
2    500
3    500
Name: label, dtype: int64

In [38]:
fewshot_val_df.head()

Unnamed: 0,TEXT,CATEGORY,label
0,"name: , unit no: date of birth: sex: m service...",Discharge summary,4
1,demographics day of intubation: day of mechani...,Respiratory,6
2,micu nursing progress note 0700-1900 events: t...,Nursing/other,0
3,subjective patient recently extubated- per fam...,Nutrition,7
4,"chief complaint: sdh, resp failure i saw and e...",Physician,3


In [39]:
fewshot_train_df.head()

Unnamed: 0,TEXT,CATEGORY,label
0,"patient remains on mechanical ventilation,went...",Nursing/other,0
1,3:55 am chest (portable ap) clip # reason: int...,Radiology,1
2,airway tube type tracheostomy tube: type: uncu...,Respiratory,6
3,subjective patient unable objective pertinent ...,Nutrition,7
4,"subjective patient somnolent, absent gag objec...",Nutrition,7


: 

## ICD9-Triage task


In [23]:
data_dir = "/mnt/sdg/niallt/mimic3-icd9-data/intermediary-data/triage/"

In [30]:
train_df = pd.read_csv(f"{data_dir}/train.csv")
val_df = pd.read_csv(f"{data_dir}/valid.csv")
test_df = pd.read_csv(f"{data_dir}/test.csv")

In [27]:
train_df.head()

Unnamed: 0,text,label,triage-category
0,: : : Sex: F Service: CARDIOTHORACIC Allergies...,4240,Cardiology
1,: : : Sex: F Service: NEONATOLOGY HISTORY: wee...,V3001,Obstetrics
2,: : : Sex: M Service: CARDIOTHORACIC Allergies...,41041,Cardiology
3,: : : Sex: F Service: MEDICINE Allergies: Peni...,51881,Respiratory
4,: : : Sex: M Service: ADMISSION DIAGNOSIS: . S...,41401,Cardiology


In [32]:
train_df.shape, val_df.shape, test_df.shape

((9559, 3), (3114, 3), (3172, 3))

In [34]:
train_df["triage-category"].value_counts()

Cardiology          4981
Obstetrics          1788
Respiratory         1219
Neurology            744
Gastroenterology     417
AcuteMedicine        213
Oncology             197
Name: triage-category, dtype: int64

In [39]:
val_df["triage-category"].value_counts()

Cardiology          1571
Obstetrics           635
Respiratory          388
Neurology            231
Gastroenterology     148
Oncology              75
AcuteMedicine         66
Name: triage-category, dtype: int64

In [40]:
test_df["triage-category"].value_counts()

Cardiology          1636
Obstetrics           603
Respiratory          396
Neurology            272
Gastroenterology     142
Oncology              63
AcuteMedicine         60
Name: triage-category, dtype: int64

In [35]:
class_list, idx_to_class, class_to_idx = encode_classes(train_df, label_col="triage-category", sort_by_value_count=True)

In [38]:
class_list, idx_to_class, class_to_idx

(['Cardiology',
  'Obstetrics',
  'Respiratory',
  'Neurology',
  'Gastroenterology',
  'AcuteMedicine',
  'Oncology'],
 {0: 'Cardiology',
  1: 'Obstetrics',
  2: 'Respiratory',
  3: 'Neurology',
  4: 'Gastroenterology',
  5: 'AcuteMedicine',
  6: 'Oncology'},
 {'Cardiology': 0,
  'Obstetrics': 1,
  'Respiratory': 2,
  'Neurology': 3,
  'Gastroenterology': 4,
  'AcuteMedicine': 5,
  'Oncology': 6})

In [44]:
# read in the fewshot one

fs_train_df = pd.read_csv(f"/mnt/sdg/niallt/mimic_iii/processed/HADM_ID_split//icd9-triage/TESTING_CODE/train.csv")

In [47]:
fs_train_df.label.value_counts()

3    200
2    200
4    200
5    200
0    200
1    200
6    197
Name: label, dtype: int64

## MIMIC TOP ICD9-50

