In [0]:
import os
from datetime import datetime

import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DateType, BooleanType

# Initialize Spark session
spark = SparkSession.builder.appName("ICD Code Extract").getOrCreate()

# Current date for metadata fields (stamped on every extracted record)
current_date = datetime.now().date()

# WHO API authentication details.
# The client credentials are read from the environment so that secrets never
# live in the notebook source, its outputs, or its version history.
client_id = os.environ.get('WHO_ICD_CLIENT_ID')
client_secret = os.environ.get('WHO_ICD_CLIENT_SECRET')
if not client_id or not client_secret:
    raise Exception(
        "Missing WHO ICD API credentials: set the WHO_ICD_CLIENT_ID and "
        "WHO_ICD_CLIENT_SECRET environment variables before running this notebook."
    )
auth_url = 'https://icdaccessmanagement.who.int/connect/token'
base_url = 'https://id.who.int/icd/'

# Obtain access token via the OAuth2 client-credentials grant.
# The timeout keeps a stalled connection from hanging the notebook forever.
auth_response = requests.post(auth_url, data={
    'client_id': client_id,
    'client_secret': client_secret,
    'grant_type': 'client_credentials'
}, timeout=30)

if auth_response.status_code != 200:
    raise Exception(f"Auth failed: {auth_response.status_code} - {auth_response.text}")

access_token = auth_response.json().get('access_token')
if not access_token:
    # Without this check a missing token would silently become "Bearer None"
    # and only surface later as an opaque failure on the first data request.
    raise Exception("Auth failed: token response did not contain an access_token")

# Prepare headers for API calls (shared by every request made below)
headers = {
    'Authorization': f'Bearer {access_token}',
    'API-Version': 'v2',
    'Accept-Language': 'en',
}

# Helper that performs a single authenticated GET against the ICD API
def fetch_icd_codes(url, timeout=30):
    """Fetch one ICD API resource and return its decoded JSON body.

    Args:
        url: Address of the resource to fetch. An ``http://`` address is
            upgraded to ``https://`` before the request is sent.
        timeout: Seconds to wait for the server before the request is
            abandoned. Defaults to 30.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        Exception: If the server answers with any status other than 200.
            Connection problems and timeouts propagate as the exceptions
            raised by ``requests``.
    """
    # Links handed to this function may use the plain http scheme; upgrading
    # them ensures the bearer token in `headers` is never sent in clear text.
    if url.startswith('http://'):
        url = 'https://' + url[len('http://'):]

    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code} - {response.text}")
    return response.json()

def extract_codes(url):
    """Collect code records for every leaf entity reachable from ``url``.

    The entity at ``url`` is fetched with ``fetch_icd_codes``. An entity that
    has a ``'child'`` key is only expanded (its children are visited in the
    order they are listed) and is never emitted itself. An entity without a
    ``'child'`` key yields one record, provided it carries both ``'code'`` and
    ``'title'``; otherwise it is skipped.

    Args:
        url: Address of the entity at which the walk starts.

    Returns:
        A list of dicts with the keys ``icd_code``, ``icd_code_type``,
        ``code_description``, ``inserted_date``, ``updated_date`` and
        ``is_current_flag``, in depth-first order.
    """
    records = []
    pending = [url]  # explicit stack: the last element is visited next

    while pending:
        entity = fetch_icd_codes(pending.pop())

        if 'child' in entity:
            # Push in reverse so the first listed child is popped first,
            # which keeps the visit order depth-first, left to right.
            pending.extend(reversed(list(entity['child'])))
            continue

        if 'code' not in entity or 'title' not in entity:
            continue

        description = entity['title'].get('@value', '')
        records.append({
            'icd_code': entity['code'],
            'icd_code_type': 'ICD-10',
            'code_description': description,
            'inserted_date': current_date,
            'updated_date': current_date,
            'is_current_flag': True
        })

    return records

# Entry point of the crawl (example chapter/category)
root_url = 'https://id.who.int/icd/release/10/2019/A00-A09'
icd_codes = extract_codes(root_url)

# Schema for the DataFrame: one (name, type) pair per column, all nullable
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("icd_code", StringType()),
        ("icd_code_type", StringType()),
        ("code_description", StringType()),
        ("inserted_date", DateType()),
        ("updated_date", DateType()),
        ("is_current_flag", BooleanType()),
    )
])

# Build the DataFrame from the extracted records and replace whatever is
# currently stored at the target location with a fresh Parquet copy
df = spark.createDataFrame(icd_codes, schema=schema)
df.write.mode("overwrite").format("parquet").save("/mnt/bronze/icd_codes/")


In [0]:
# Show the DataFrame `df` built in the previous cell, so that cell must have
# run first. `display` is not defined in this notebook; presumably it is the
# notebook runtime's built-in rendering helper — confirm for your environment.
display(df)