In [212]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urljoin

In [4]:
codes_website = "https://icdcodelookup.com/icd-10/common-codes"

In [1]:
print("Hello")

Hello


In [5]:
def get_medical_specialties(url: str) -> str | None:
    res = requests.get(url)
    if res.status_code in range(200, 300):
        soup = BeautifulSoup(
            res.content,
            "lxml",
            parse_only=SoupStrainer("ul", class_="specialtyList"),
        )
        specialties = soup.findAll("li")
        specialty_dict = {specialty.a.text: specialty.a["href"] for specialty in specialties}
        return specialty_dict
    return None

def get_icd10_codes(url: str) -> str | None:
    res = requests.get(url)
    if res.status_code in range(200, 300):
        soup = BeautifulSoup(
            res.content,
            "lxml",
            parse_only=SoupStrainer("ul", class_="chapterList"),
        )
        codes = soup.findAll("div", class_="code")
        codes_list = [code.span.text for code in codes]
        return codes_list
    else:
        return None


specialty_dict = get_medical_specialties(codes_website)
specialty_codes_dict = {}
for key, value in specialty_dict.items():
    print(key)
    webpage_specialty = urljoin(codes_website, value)
    specialty_codes_dict[key] = get_icd10_codes(webpage_specialty)

df = pd.DataFrame([[key,value] for key, value in specialty_codes_dict.items()], columns=["specialty", "codes"])
df = df.explode("codes").reset_index(drop=True)
df.head()
df.tail()

Anesthesiology
Audiology
Cardiology
Cardiothoracic & Vascular
Chiropractic
Dermatology
Emergency Medicine
Endocrinology
Family Practice
Gastroenterology
General Surgery
Hematology
Infectious Disease
Internal Medicine
Mental & Behavioral Health
Nephrology
Neurology
Obstetrics & Gynaecology
Oncology
Ophthalmology
Orthopedics - Lower
Orthopedics - Upper
Otolaryngology
Pain Management
Pathology
Pediatrics
Physical Therapy
Plastic Surgery
Podiatry
Primary Care
Pulmonology
Radiology
Speech Pathology
Urology


Unnamed: 0,specialty,codes
7371,Urology,R10.32
7372,Urology,R10.33
7373,Urology,R10.84
7374,Urology,R10.9
7375,Urology,R97.2


In [237]:
df_mapping_icd9_icd10 = pd.read_csv("../data/raw/icd_mapping/icd9to10.txt", sep="\s+", header=None, usecols=[0,1], names=["icd9", "icd10"])

print(df_mapping_icd9_icd10.shape)
df_mapping_icd9_icd10.head()


(24860, 2)


Unnamed: 0,icd9,icd10
0,10,A000
1,11,A001
2,19,A009
3,20,A0100
4,21,A011


In [240]:
df_mapping_icd9_icd10["icd9"] = df_mapping_icd9_icd10["icd9"].str[:3]
df_mapping_icd9_icd10["icd10"] = df_mapping_icd9_icd10["icd10"].str[:3]
df_mapping_icd9_icd10 = df_mapping_icd9_icd10[df_mapping_icd9_icd10["icd10"] != "NoD"].copy()

print(df_mapping_icd9_icd10.shape)
df_mapping_icd9_icd10.head()


(24438, 2)


Unnamed: 0,icd9,icd10
0,1,A00
1,1,A00
2,1,A00
3,2,A01
4,2,A01


In [241]:
df_mapping_icd9_icd10[df_mapping_icd9_icd10["icd9"] == "E86"]

Unnamed: 0,icd9,icd10


In [227]:
df_mapping_grouped = df_mapping_icd9_icd10.groupby('icd9', as_index=False).icd10.agg(lambda x: x.mode()[0])
print(df_mapping_grouped.shape)
df_mapping_grouped.head()

(1038, 2)


Unnamed: 0,icd9,icd10
0,1,A00
1,2,A01
2,3,A02
3,4,A03
4,5,A05


In [228]:
df_mapping_grouped[df_mapping_grouped["icd9"] == "365"]
df_mapping_grouped[df_mapping_grouped["icd10"] == "NoD"]

Unnamed: 0,icd9,icd10


In [230]:
df_mimic = pd.read_csv("../data/raw/mimic_iii/DIAGNOSES_ICD.csv")
df_mimic = df_mimic.dropna(subset=["ICD9_CODE"], axis=0)
df_mimic["ICD9_CODE"] = df_mimic["ICD9_CODE"].str[:3]
df_mimic.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,403
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,582
3,1300,109,172335,4.0,585
4,1301,109,172335,5.0,425


In [231]:
df_mimic.shape

(651000, 5)

In [232]:
df_mimic["ROW_ID"].nunique()

651000

In [190]:
df_mapping_grouped.head()

Unnamed: 0,icd9,icd10
0,1,A00
1,2,A01
2,3,A02
3,4,A03
4,5,A05


In [235]:
df_test = pd.merge(df_mimic, df_mapping_grouped, left_on="ICD9_CODE", right_on="icd9", how="left")
print(df_test.shape)
df_test.head()

df_test[df_test["icd10"].isna()]["ICD9_CODE"].unique()

(651000, 7)


array(['E93', 'E85', 'E94', 'E86'], dtype=object)

In [192]:
df_test.icd10.nunique()

815

In [193]:
df_test.dtypes

ROW_ID          int64
SUBJECT_ID      int64
HADM_ID         int64
SEQ_NUM       float64
ICD9_CODE      object
icd9           object
icd10          object
dtype: object

In [194]:
# Apply custom mapping
# merge first part of the dataframe with the rest
# split into two parts
# drop rows for na df
# add +1 to them
# merge again
# combine both dataframes
# check for duplicates in row id
# only take first 3 digits of icd9 code

In [195]:
df_10 = pd.read_csv("../data/interim/icd10_codes_and_des.csv")
df_10.head()

Unnamed: 0,category_codes,category_codes_des,block_codes,block_codes_des,chapter_codes,chapter_codes_des
0,A00,Cholera,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...
1,A01,Typhoid and paratyphoid fevers,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...
2,A02,Other salmonella infections,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...
3,A03,Shigellosis,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...
4,A04,Other bacterial intestinal infections,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...


In [204]:
df_results = pd.merge(df_test, df_10, left_on="icd10", right_on="category_codes", how="left")
df_results.head()

df_results[df_results["block_codes"].isna()]["icd10"].unique()

array(['NoD'], dtype=object)

In [125]:
df_results.shape

(21173212, 13)

In [85]:
df_9 = pd.read_csv("../data/interim/icd9_codes_and_des.csv")
df_9.tail()

Unnamed: 0,category_codes,category_codes_des,block_codes,block_codes_des,chapter_codes,chapter_codes_des
1229,V87,Other specified personal exposures and history...,V87-V87,Other Specified Personal Exposures And History...,V01-V91,Supplementary Classification Of Factors Influe...
1230,V88,Acquired absence of other organs and tissue,V88-V88,Acquired Absence Of Other Organs And Tissue,V01-V91,Supplementary Classification Of Factors Influe...
1231,V89,Other suspected conditions not found,V89-V89,Other Suspected Conditions Not Found,V01-V91,Supplementary Classification Of Factors Influe...
1232,V90,Retained foreign body,V90-V90,Retained Foreign Body,V01-V91,Supplementary Classification Of Factors Influe...
1233,V91,Multiple gestation placenta status,V91-V91,Multiple Gestation Placenta Status,V01-V91,Supplementary Classification Of Factors Influe...


TypeError: 'int' object is not subscriptable