# Imports

In [332]:
import pandas as pd
import requests
import simple_icd_10_cm as cm
from bs4 import BeautifulSoup, SoupStrainer
import lxml
import re
import json

# Make ICD 10 Dataframe

In [261]:
def check_valid(ser, func):
    """Replace every value rejected by ``func`` with its last listed ancestor.

    Parameters
    ----------
    ser : pandas.Series
        Codes to validate. The Series is modified in place.
    func : callable
        Predicate called with each value; a falsy result marks the value
        for replacement.

    Returns
    -------
    pandas.Series
        The same ``ser`` object. Each rejected value has been replaced by the
        last element of ``cm.get_ancestors(value)``.

    Raises
    ------
    IndexError
        If ``cm.get_ancestors`` returns an empty list for a rejected value.
    """
    # Iterate over a snapshot and write back by *position*. Label-based
    # ``ser[index] = ...`` would overwrite every row sharing a duplicated
    # index label, and it mutates the Series while it is being iterated.
    for position, value in enumerate(ser.tolist()):
        if not func(value):
            ser.iloc[position] = cm.get_ancestors(value)[-1]
    return ser

In [262]:
def get_parent_vals(df, child, parent, func):
    """Add a parent-code column and its description column to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``child`` column; it is extended in place.
    child : str
        Name of the existing column whose codes are looked up.
    parent : str
        Name of the column to create from ``cm.get_parent`` of each child.
    func : callable
        Validator handed to ``check_valid`` for the new parent codes.

    Returns
    -------
    pandas.DataFrame
        The same ``df``, now with ``parent`` and ``f"{parent}_des"`` columns.
    """
    # Look up each child's parent, then let check_valid substitute any parent
    # code that the validator rejects.
    parent_codes = check_valid(df[child].apply(cm.get_parent), func)

    df[parent] = parent_codes
    df[f"{parent}_des"] = parent_codes.apply(cm.get_description)
    return df
    

In [349]:
# Fetch every code without dots and keep the ones cm.is_category accepts.
all_codes = cm.get_all_codes(with_dots=False)
category_codes = list(filter(cm.is_category, all_codes))

# One row per category code, with its description alongside.
df = pd.DataFrame({"category_codes": category_codes})
df["category_codes_des"] = df["category_codes"].apply(cm.get_description)

# Derive the block columns from the categories, then the chapter columns
# from the blocks (each call adds a code column and a "_des" column).
df = get_parent_vals(df, "category_codes", "block_codes", cm.is_block)
df = get_parent_vals(df, "block_codes", "chapter_codes", cm.is_chapter)

df.tail()

Unnamed: 0,category_codes,category_codes_des,block_codes,block_codes_des,chapter_codes,chapter_codes_des
1944,Z96,Presence of other functional implants,Z77-Z99,Persons with potential health hazards related ...,21,Factors influencing health status and contact ...
1945,Z97,Presence of other devices,Z77-Z99,Persons with potential health hazards related ...,21,Factors influencing health status and contact ...
1946,Z98,Other postprocedural states,Z77-Z99,Persons with potential health hazards related ...,21,Factors influencing health status and contact ...
1947,Z99,"Dependence on enabling machines and devices, n...",Z77-Z99,Persons with potential health hazards related ...,21,Factors influencing health status and contact ...
1948,U07,Emergency use of U07,U00-U49,Provisional assignment of new diseases of unce...,22,Codes for special purposes (U00-U85)


In [350]:
# Check validity
def check_valid_again(ser, func):
    """Print each value of ``ser`` for which ``func`` returns a falsy result.

    Nothing is returned; an empty printout means every value passed.
    """
    rejected = (item for item in ser if not func(item))
    for item in rejected:
        print(item)

# Run each column through its validator; any code printed here was rejected.
for _column, _validator in (
    ("chapter_codes", cm.is_chapter),
    ("block_codes", cm.is_block),
    ("category_codes", cm.is_category),
):
    check_valid_again(df[_column], _validator)

# Scrape Wikipedia

In [362]:
BASE = "https://en.wikipedia.org/wiki"

# Reuse the symptoms saved by an earlier run when the cache file exists;
# otherwise start from an empty cache.
try:
    with open("../data/interim/symptoms.json") as cache_file:
        symptoms_cache = json.load(cache_file)
except FileNotFoundError:
    symptoms_cache = {}

def get_symptoms_from_wiki(url, timeout=10):
    """Fetch ``url`` and return the text of its infobox "Symptoms" entry.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    str or None
        The text of the cell following the ``Symptoms`` header, with any
        parenthesised or bracketed fragments removed. ``None`` when the
        response status is not 2xx, when no ``Symptoms`` header is found in
        an ``infobox`` table, or when the header has no following ``td`` cell.

    Raises
    ------
    requests.exceptions.RequestException
        Network failures and timeouts are not caught here.
    """
    res = requests.get(url, timeout=timeout)
    if not 200 <= res.status_code < 300:
        return None

    # Parse only <table class="infobox"> elements instead of the whole page.
    soup = BeautifulSoup(
        res.content,
        "lxml",
        parse_only=SoupStrainer("table", class_="infobox"),
    )

    header = soup.find("th", string="Symptoms")
    if header is None:
        # Return None rather than an exception class: the result is stored
        # in a JSON cache by the caller, and a class is not serialisable.
        return None

    cell = header.find_next_sibling("td")
    if cell is None:
        return None

    # Strip "(...)" and "[...]" fragments; raw string keeps the regex escapes
    # from being treated as (invalid) Python string escapes.
    return re.sub(r"[\(\[].*?[\)\]]", "", cell.text)

# Build cache
def get_symptoms(code_des):
    """Return the symptoms text for a description, using the on-disk cache.

    Parameters
    ----------
    code_des : str
        Description to look up. Spaces are replaced by underscores to form
        both the cache key and the last segment of the URL under ``BASE``.

    Returns
    -------
    The cached value for the key: whatever ``get_symptoms_from_wiki``
    returned when the key was first looked up (possibly ``None``; such
    results are cached too and are not retried).

    Notes
    -----
    Reads and updates the module-level ``symptoms_cache`` dict. The cache
    file is rewritten only when a new entry has been added.
    """
    code_des = code_des.replace(" ", "_")
    if code_des not in symptoms_cache:
        symptoms_cache[code_des] = get_symptoms_from_wiki(f"{BASE}/{code_des}")

        # Serialise before opening the file: if the cache cannot be encoded,
        # the existing file is left intact instead of being truncated.
        payload = json.dumps(symptoms_cache)
        with open("../data/interim/symptoms.json", 'w') as fp:
            fp.write(payload)

    return symptoms_cache[code_des]
    

In [360]:
# Keep only the first ten rows (presumably to keep the scraping run short
# while developing). Take an explicit copy so that adding columns to `df`
# later does not write into a slice of the full frame
# (SettingWithCopyWarning on pandas versions without copy-on-write).
df = df.head(10).copy()
df.head()

Unnamed: 0,category_codes,category_codes_des,block_codes,block_codes_des,chapter_codes,chapter_codes_des,symptoms
0,A00,Cholera,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,"Large amounts of watery diarrhea, vomiting, mu..."
1,A01,Typhoid and paratyphoid fevers,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,
2,A02,Other salmonella infections,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,
3,A03,Shigellosis,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,"Diarrhea, fever, abdominal pain"
4,A04,Other bacterial intestinal infections,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,


In [363]:
# Look up the symptoms text for every category description via get_symptoms
# and store the result as a new column.
symptom_column = df["category_codes_des"].apply(get_symptoms)
df["symptoms"] = symptom_column
df

Unnamed: 0,category_codes,category_codes_des,block_codes,block_codes_des,chapter_codes,chapter_codes_des,symptoms
0,A00,Cholera,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,"Large amounts of watery diarrhea, vomiting, mu..."
1,A01,Typhoid and paratyphoid fevers,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,
2,A02,Other salmonella infections,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,
3,A03,Shigellosis,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,"Diarrhea, fever, abdominal pain"
4,A04,Other bacterial intestinal infections,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,
5,A05,"Other bacterial foodborne intoxications, not e...",A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,
6,A06,Amebiasis,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,"Bloody diarrhea, abdominal pain"
7,A07,Other protozoal intestinal diseases,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,
8,A08,Viral and other specified intestinal infections,A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,
9,A09,"Infectious gastroenteritis and colitis, unspec...",A00-A09,Intestinal infectious diseases (A00-A09),1,Certain infectious and parasitic diseases (A00...,


In [354]:
# Write the frame to the interim data folder, leaving out the row index.
df.to_csv(path_or_buf="../data/interim/icd10_symptoms.csv", index=False)

In [300]:
# https://www.cdc.gov/nocardiosis/symptoms/index.html

# TODO: if no symptoms are found, then go one level deeper.
# TODO: if an IndexError occurs, then fall back to another website.

Unnamed: 0,category_codes,category_codes_des,block_codes,block_codes_des,chapter_codes,chapter_codes_des
732,J30,Vasomotor and allergic rhinitis,J30-J39,Other diseases of upper respiratory tract (J30...,10,Diseases of the respiratory system (J00-J99)


In [366]:
# Scratch check: show what kind of object cm.is_block is (the cell output
# reports `function`).
type(cm.is_block)

function