In [12]:
import pandas as pd
import numpy as np
import os

RAW_DATA_PATH = '../data/raw/diabetic_data.csv'
ID_MAP_PATH = '../data/raw/IDs_mapping.csv'
CLEANED_DATA_PATH = '../data/processed/diabetic_data_clean.csv'

## Phase 2: Data Enrichment — Target Identification

### Top 20 Primary Diagnosis Codes (`diag_1`)

To efficiently enrich ICD-9 codes, we first identified the 20 most frequent primary diagnoses in the dataset. These codes will serve as our scraping targets.

**Top 20 ICD-9 Codes (for scraping):**



In [13]:
import pandas as pd

# Load cleaned dataset
df = pd.read_csv(CLEANED_DATA_PATH , low_memory=False)

# ---------------- 1️ Identify Top 20 diag_1 Codes ---------------- #
top_20_diag_counts = df['diag_1'].value_counts().head(20)
top_20_diag_codes = top_20_diag_counts.index.tolist()

# Display the top 20 codes as a list (for scraping targets)
print("Top 20 ICD-9 Codes (for scraping):")
print(top_20_diag_codes)
print("\n")

# ---------------- 2️ Display Frequency of Each Code ---------------- #
print("Top 20 ICD-9 Codes with Frequency:")
for code, count in top_20_diag_counts.items():
    print(f"{code:8} → {count} occurrences")


Top 20 ICD-9 Codes (for scraping):
['428', '414', '786', '410', '486', '427', '491', '715', '682', '780', '434', '996', '276', '250.8', '599', '38', '584', 'V57', '250.6', '820']


Top 20 ICD-9 Codes with Frequency:
428      → 6735 occurrences
414      → 6555 occurrences
786      → 4016 occurrences
410      → 3477 occurrences
486      → 3413 occurrences
427      → 2729 occurrences
491      → 2252 occurrences
715      → 2147 occurrences
682      → 2030 occurrences
780      → 2012 occurrences
434      → 1958 occurrences
996      → 1944 occurrences
276      → 1861 occurrences
250.8    → 1667 occurrences
599      → 1581 occurrences
38       → 1522 occurrences
584      → 1482 occurrences
V57      → 1204 occurrences
250.6    → 1178 occurrences
820      → 1075 occurrences


In [14]:
import requests
from bs4 import BeautifulSoup
import time

# Dictionary to store code → description mapping
icd9_description_map = {}

def fetch_icd9_description(code):
    """
    Fetch ICD-9 long description from icd9.chrisendres.com
    """
    url = f"http://icd9.chrisendres.com/index.php?srchtext={code}&Submit=Search"
    
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # The first <td> with class 'description' contains the disease name
        desc_cell = soup.find('td', class_='description')
        
        if desc_cell:
            return desc_cell.text.strip()
        else:
            return "Description not found"
    
    except Exception as e:
        print(f"Error fetching code {code}: {e}")
        return "Error"


In [15]:
for code in top_20_diag_codes:
    print(f"Scraping ICD-9 code: {code}")
    description = fetch_icd9_description(code)
    icd9_description_map[code] = description
    time.sleep(1)  # Wait 1 second between requests


Scraping ICD-9 code: 428
Scraping ICD-9 code: 414
Scraping ICD-9 code: 786
Scraping ICD-9 code: 410
Scraping ICD-9 code: 486
Scraping ICD-9 code: 427
Scraping ICD-9 code: 491
Scraping ICD-9 code: 715
Scraping ICD-9 code: 682
Scraping ICD-9 code: 780
Scraping ICD-9 code: 434
Scraping ICD-9 code: 996
Scraping ICD-9 code: 276
Scraping ICD-9 code: 250.8
Scraping ICD-9 code: 599
Scraping ICD-9 code: 38
Scraping ICD-9 code: 584
Scraping ICD-9 code: V57
Scraping ICD-9 code: 250.6
Scraping ICD-9 code: 820


In [16]:
df[['diag_1', 'Primary_Diagnosis_Desc']].head(10)


KeyError: "['Primary_Diagnosis_Desc'] not in index"

In [None]:
import pandas as pd
pd.DataFrame(list(icd9_description_map.items()), columns=['ICD9_Code', 'Description']).to_csv('icd9_mapping_top20.csv', index=False)
