<a href="https://colab.research.google.com/github/TheMachineSaysHello/DeepLearning101/blob/main/MajorDiagnoses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Is this a "Medical-Grade" Word Embedding ??
This notebook runs an experiment that does the followings:
Downloads CMS, U.S. Centers for Medicare & Medicaid  DRG Defitions Manual

A few things to know about the data 

+ MDC: Major Diagnoses Category (25 Exisits) high level grouping of diseases
    
    
    MDC exmaple: "Diseases and disorders of the eye"

+ MSDRG: Medicare Severity Diagnosis Related Group  (several hundred exist), can be mapped to only one MDC, often just called DRG  and is a finer grain than MDC
  

    MSDRG example: "DRG 121 - ACUTE MAJOR EYE INFECTIONS WITH CC/MCC" 

Note on DRG:      
    CC = Complication or Comorbidity, 
    MCC = Major Complication or Comorbidity 

+ ICD10cm : Billing Diagnoses Code (US version), (10s of thousands exists)
In this experiement, only icd10cm that be mapped to one or more DRGs, and only one MDC are used. 

 
    ICD10Cm example: "H16.011 Central corneal ulcer, right eye"



In [None]:
!pip install tensorflow
!pip install wget
!pip install scikit-learn
!pip install plotly
!pip install biobert-embedding




In [None]:
from biobert_embedding.embedding import BiobertEmbedding
import pandas as pd
import re
import wget
import os
import numpy as np
from torch import nn
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import plotly.express as px

In [None]:


## Downloads CMS DRG Definitions Manual 
url = "https://www.cms.gov/files/zip/icd-10-ms-drg-definitions-manual-files-v39.zip"
wget.download(url)

## Unzips Downloaded File into working directory 
cwd = os.getcwd()
cmd = F'unzip MSDRGv39.0ICD10_R0_DefinitionsManual_TEXT.zip -d {cwd}'
print(F'running {cmd}')
os.system(cmd)
os.listdir()

running unzip MSDRGv39.0ICD10_R0_DefinitionsManual_TEXT.zip -d /content


['.config',
 'mdcs_22_25.txt',
 'MSDRGv39.0ICD10_R0_DefinitionsManual_TEXT.zip',
 'appendix_C.txt',
 'appendix_A.txt',
 'appendix_B.txt',
 'mdcs_08_11.txt',
 'app.log',
 'appendix_F_J.txt',
 'appendix_D_E.txt',
 'mdcs_00_07.txt',
 'MSDRGv39.0ICD10_R0_DefinitionsManual_TEXT (2).zip',
 'mdcs_12_21.txt',
 'MSDRGv39.0ICD10_R0_DefinitionsManual_TEXT (1).zip',
 'sample_data']

In [None]:

## reads in appended D_E and create a pandas dataframe with MDC  Major Diagnoses Group Defintions
def read_mcd_map(inputs):
    results = []
    for line in inputs:
        mdc_search =  re.search('MDC [0-9]{2,3}.+\n',line)
        drg_search = re.search(' ?[0-9]{3,3}-?[0-9]{0,3}.+\n', line)
        if mdc_search:
            MDC = re.sub('\n', '', mdc_search.group())
        if drg_search:
            DRG =  re.sub('^ ', '', re.sub('\n', '',drg_search.group()))
            if re.search('[0-9]{3,3}-[0-9]{3,3}', DRG):
                #drg_range = list(range(int(DRG[0:4], int(DRG[5:])+1)))
                results.append((MDC, DRG))
    index =[]
    min_vals = []
    max_vals = []
    drg_desc = []
    for v in results:
        index.append(v[0])
        min_vals.append(int(v[1][0:3]))
        drg_desc.append(v[1])
        try:
            max_vals.append(int(v[1][4:7]))
        except ValueError:
            max_vals.append(int(v[1][0:3]))
    df = pd.DataFrame({'mdc_dsc': index, 'drg_group_desc':drg_desc, 'min_drg':  min_vals, 'max_drg':max_vals})
    df.loc[:, 'drg_list'] = [list(range(x,y+1)) for (x,y) in zip(df.loc[:,'min_drg' ], df.loc[:, 'max_drg'])]
    df.loc[:, 'mdc'] = [int(re.search('[0-9]+', v).group()) for v in df.loc[:, 'mdc_dsc']]
    return df.drop(['min_drg', 'max_drg'], axis=1).set_index('mdc')

os.listdir()
path ='appendix_D_E.txt'
with open(path, 'r') as f:
    lines = f.readlines()
inputs = lines[23:230]
drg_to_mdc = read_mcd_map(inputs).reset_index()
mdc_def = drg_to_mdc.groupby(['mdc', 'mdc_dsc']).count().reset_index().drop(['drg_group_desc', 'drg_list'], axis=1)
mdc_def.head()


  val = np.array(val, copy=False)
  return array(a, dtype, copy=False, order=order)


Unnamed: 0,mdc,mdc_dsc
0,0,MDC 00 Pre-MDC
1,1,MDC 01 Diseases and disorders of the nervous s...
2,2,MDC 02 Diseases and disorders of the eye
3,3,"MDC 03 Diseases and disorders of the ear, nose..."
4,4,MDC 04 Diseases and disorders of the respirato...


In [None]:
path ='appendix_A.txt'
with open(path, 'r') as f:
    lines = f.readlines()
inputs = lines[11:]


def read_drgs(inputs):
    results = []
    for line in inputs:
        results.append((line[0:3], line[4:6], line[8:9], line[11:].strip().replace('\n', '')))
    return pd.DataFrame(results, columns=['drg', 'mdc', 'ms', 'desc'])

drg_def = read_drgs(inputs)
drg_def

Unnamed: 0,drg,mdc,ms,desc
0,001,,P,HEART TRANSPLANT OR IMPLANT OF HEART ASSIST SY...
1,002,,P,HEART TRANSPLANT OR IMPLANT OF HEART ASSIST SY...
2,003,,P,ECMO OR TRACHEOSTOMY WITH MV >96 HOURS OR PRIN...
3,004,,P,TRACHEOSTOMY WITH MV >96 HOURS OR PRINCIPAL DI...
4,005,,P,LIVER TRANSPLANT WITH MCC OR INTESTINAL TRANSP...
...,...,...,...,...
762,987,,P,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...
763,988,,P,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...
764,989,,P,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...
765,998,,,PRINCIPAL DIAGNOSIS INVALID AS DISCHARGE DIAGN...


In [None]:
path ='appendix_B.txt'
with open(path, 'r') as f:
    lines = f.readlines()
inputs = lines[9:]
inputs[:10]

def read_dg_icd10cm_map(inputs):
    results =[]
    for line in inputs:
        line = re.sub('  ', ' ', line).strip()
        icd10_search =  re.search('[A-Z][0-9]{2,3}[A-Z0-9]+',line)
        mdc_search = re.search(' [0-9]{2,2} ', line)
        icd10_desc_search =  re.search('[A-Za-z]{2,2}.*', line)
        drg_search = re.search(' [0-9]{3,3}-?[0-9]{0,3}', line)
        
        
        if icd10_search:
            ICD10 = icd10_search.group().strip()
        if mdc_search:
            MDC =  int(mdc_search.group().strip())
        if icd10_desc_search:
            ICD10_DESC = icd10_desc_search.group().strip()
        if drg_search:
            DRG =  drg_search.group().strip()
            results.append((MDC, DRG, ICD10,   ICD10_DESC ))
    min_vals = []
    max_vals = []
    for v in results:
        min_vals.append(int(v[1][0:3]))
        try:
            max_vals.append(int(v[1][4:7]))
        except ValueError:
            max_vals.append(int(v[1][0:3]))
    
    index = [v[2] for v in results]
    idc10_desc =  [v[3] for v in results]
    mdcs = [v[0] for v in results]
    
    df = pd.DataFrame({'icd10': index,
                       'icd10_desc': idc10_desc, 
                       'mdc': mdcs ,  
                       'min_drg':  min_vals, 
                       'max_drg':max_vals})
    df.loc[:, 'drg_list'] = [set(list(range(x,y+1))) for (x,y) in zip(df.loc[:,'min_drg' ], df.loc[:, 'max_drg'])]
    df.loc[:, 'drg_list'] = df.loc[:, 'drg_list'].apply(lambda x: list(x))
    df_g = df.explode('drg_list')
    results = df_g.groupby(['icd10', 'icd10_desc', 'mdc']).agg({'drg_list':'unique'})
    return results.reset_index()
icd10_to_drg_map = read_dg_icd10cm_map(inputs)
icd10_to_drg_map.head()
# Joins the mdc defitions to the icdo10 Codes to create a data sets of text and labels 
df = icd10_to_drg_map.merge(mdc_def, on='mdc', how='inner') 
df.head()

Unnamed: 0,icd10,icd10_desc,mdc,drg_list,mdc_dsc
0,A000,"Cholera due to Vibrio cholerae 01, biovar chol...",6,"[371, 372, 373]",MDC 06 Diseases and disorders of the digestive...
1,A001,"Cholera due to Vibrio cholerae 01, biovar eltor",6,"[371, 372, 373]",MDC 06 Diseases and disorders of the digestive...
2,A009,"Cholera, unspecified",6,"[371, 372, 373]",MDC 06 Diseases and disorders of the digestive...
3,A010,Benign carcinoid tumor of the duodenum,6,"[393, 394, 395]",MDC 06 Diseases and disorders of the digestive...
4,A010,Malignant carcinoid tumor of the duodenum,6,"[376, 374, 375]",MDC 06 Diseases and disorders of the digestive...


#### Encoder Inputs and Outputs 

In [None]:

text = df.loc[:, 'icd10_desc'].apply(lambda x: re.sub('  ', ' ', x).strip()).values
labels = df.loc[:, 'mdc'].values
labels_text =  df.loc[:, 'mdc_dsc'].values

# counts the max number of tokens 
max_sequence_len = np.max([len(doc.split(' ')) for doc in text])
print(F'Max Sequence Len: {max_sequence_len}')
print('Example Data, with MDC as labels :')
print(list(zip(text[0:3], labels[0:3])))

Max Sequence Len: 30
Example Data, with MDC as labels :
[('Cholera due to Vibrio cholerae 01, biovar cholerae', 6), ('Cholera due to Vibrio cholerae 01, biovar eltor', 6), ('Cholera, unspecified', 6)]


In [120]:
biobert = BiobertEmbedding()

def predict(text_array):
  output = np.zeros((len(text_array), 768))
  for i, sentence in enumerate(text_array):
    output[i, :] = biobert.sentence_vector(sentence)
  return output


In [121]:
sample_size = 10000
sample_index = np.random.choice(np.arange(text.shape[0]),sample_size)
labels_text_sampled = labels_text[sample_index]
text_sampled = text[sample_index]
features_sampled = predict(text_sampled)
features_sampled.shape

(10000, 768)

In [122]:

pca_components = PCA(10).fit_transform(features_sampled)
tnse_comps =  pd.DataFrame(TSNE(2).fit_transform(pca_components))
tnse_comps.loc[:, 'mdc'] = labels_text_sampled
tnse_comps.loc[:, 'icd10'] = text_sampled
tnse_comps.head()

Unnamed: 0,0,1,mdc,icd10
0,-4.731775,54.531094,"MDC 21 Injuries, poisonings and toxic effects ...",XS 21 913-914 Unspecified injury of unspecifi...
1,12.063105,3.045809,"MDC 09 Diseases and disorders of the skin, sub...",Other local lupus erythematosus
2,-8.38784,-45.485188,"MDC 21 Injuries, poisonings and toxic effects ...",Laceration of deep palmar arch of unspecified ...
3,-14.040623,-48.646206,"MDC 09 Diseases and disorders of the skin, sub...",Burn of unspecified degree of single right fin...
4,30.611233,-49.141846,MDC 23 Factors influencing health status and o...,"Insect bite (nonvenomous) of breast, left brea..."


In [123]:

fig = px.scatter(tnse_comps, x=0, y=1, color='mdc', hover_data=['icd10'])

fig.show()


#### So What Happens if I train this thing?


In [None]:
bi  