In [0]:
import os
import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from datetime import datetime
from pyspark.sql.types import *

In [0]:
# WHO ICD API OAuth2 client credentials.
# SECURITY: credentials must never be committed inside the notebook. They are read
# from the environment (ICD_CLIENT_ID / ICD_CLIENT_SECRET); any secret that was
# previously hard-coded in this cell should be treated as leaked and rotated.
ClientId = os.environ.get('ICD_CLIENT_ID')
ClientSecret = os.environ.get('ICD_CLIENT_SECRET')
if not ClientId or not ClientSecret:
    raise Exception(
        "ICD API credentials are not configured: "
        "set the ICD_CLIENT_ID and ICD_CLIENT_SECRET environment variables"
    )
baseURL='https://id.who.int/icd'

authURL='https://icdaccessmanagement.who.int/connect/token'
# A timeout keeps the notebook from hanging forever if the token endpoint stalls.
auth_response = requests.post(
    authURL,
    data={
        'grant_type': 'client_credentials',
        'client_id': ClientId,
        'client_secret': ClientSecret,
    },
    timeout=30,
)

# Get the OAuth access token from the auth URL response
if auth_response.status_code==200:
    print('Authentication successful')
    access_token=auth_response.json()['access_token']
else:
    raise Exception(f"Failed to obtain access token: {auth_response.status_code} - {auth_response.text}")


In [0]:
# Headers sent with every ICD API request: the bearer token obtained during
# authentication, the API version and the language of the returned titles.
headers = {
    'Authorization': f'Bearer {access_token}',
    'API-Version': 'v2',
    'Accept-Language': 'en',
}

def fetch_icd_response(url, timeout=30):
    """Fetch one ICD API resource and return its decoded JSON body.

    The request is sent with the module-level ``headers`` (bearer token,
    API version and language).

    Args:
        url: Full URL of the ICD API resource to retrieve.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole
            recursive crawl indefinitely.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        Exception: If the server answers with any status other than 200
            (the message contains the status code and response text), or
            if the request itself fails / times out (``requests`` raises
            its own ``Exception`` subclasses in that case).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data: {response.status_code} - {response.text}")
    return response.json()

def extract_codes(url, current_date=None):
    """Recursively collect ICD code records reachable from ``url``.

    The resource at ``url`` is fetched with ``fetch_icd_response``. If it has
    a ``'child'`` entry, every child URL is crawled in turn and only the
    descendants' records are returned (the parent itself is not emitted).
    Otherwise, if it carries both ``'code'`` and ``'title'``, a single record
    is produced for it.

    Args:
        url: URL of the ICD API resource to start from.
        current_date: Date stamped on ``inserted_date`` / ``updated_date`` of
            every record. Defaults to today's date, computed once at the top
            of the crawl and passed down, so that all records of one run
            share the same date even if the crawl spans midnight.

    Returns:
        A list of dicts with the keys ``icd_code``, ``icd_code_type``,
        ``code_description``, ``inserted_date``, ``updated_date`` and
        ``is_current_flag``; empty if nothing matching was found.
    """
    if current_date is None:
        current_date = datetime.now().date()

    data = fetch_icd_response(url)

    if 'child' in data:
        codes = []
        # Child entries are URLs of nested resources, so descend recursively
        # with the same snapshot date.
        for child_url in data['child']:
            codes.extend(extract_codes(child_url, current_date))
        return codes

    if 'code' in data and 'title' in data:
        return [{
            'icd_code': data['code'],
            'icd_code_type': 'ICD-10',
            'code_description': data['title']['@value'],
            'inserted_date': current_date,
            'updated_date': current_date,
            'is_current_flag': True
        }]

    return []

In [0]:
# Starting point of the crawl: the A00-A09 block of the ICD-10 2019 release.
root_url = 'https://id.who.int/icd/release/10/2019/A00-A09'
# Walk the hierarchy below root_url and collect one record per leaf code.
icd_codes = extract_codes(root_url)

In [0]:
# Explicit schema for the ICD code records: (column name, Spark type) pairs,
# listed in output column order. Every column is nullable.
_icd_columns = [
    ("icd_code", StringType()),
    ("icd_code_type", StringType()),
    ("code_description", StringType()),
    ("inserted_date", DateType()),
    ("updated_date", DateType()),
    ("is_current_flag", BooleanType()),
]
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in _icd_columns
])

In [0]:
# Sanity check: number of ICD code records collected by the crawl
len(icd_codes)

In [0]:
# Build a DataFrame from the collected records using the explicit schema.
df = spark.createDataFrame(icd_codes, schema=schema)

# Reduce to a single partition, then append the rows as parquet to the bronze path.
(
    df.coalesce(1)
    .write
    .format("parquet")
    .mode("append")
    .save("/mnt/bronze/icd_codes/")
)