<a href="https://colab.research.google.com/github/SatwikMidya/MedMap/blob/main/MedMap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import json

In [None]:
# Read the ICD-10 hierarchy export and parse it into nested Python objects.
with open("/content/icd-10-2025-hierarchy.json", encoding="utf-8") as f:
    data = json.loads(f.read())


In [None]:
def extract_codes(node, results):
    """Collect every coded entry of a hierarchy subtree into ``results``.

    Visits ``node`` and all of its descendants (reached through the
    ``'children'`` key) in pre-order: a parent is recorded before its
    children, and siblings keep their listed order. Every visited node that
    has both an ``'id'`` and a ``'description'`` key contributes one
    ``{'code': ..., 'description': ...}`` dict. Nodes missing either key are
    not recorded, but their children are still visited.

    Args:
        node: Root of the subtree to walk.
        results: List that receives the extracted records; mutated in place.

    Returns:
        None. The output is delivered through ``results``.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if 'id' in current and 'description' in current:
            results.append({'code': current['id'], 'description': current['description']})
        if 'children' in current:
            # Push the children reversed so the first child is popped first,
            # reproducing the visiting order of a recursive depth-first walk.
            pending.extend(list(current['children'])[::-1])

In [None]:
# Flatten the nested hierarchy into a flat list of code/description records.
results = list()
extract_codes(data["tree"], results)

In [None]:
# Tabulate the extracted records and discard rows that are exact repeats
# (same code and same description).
df = pd.DataFrame(results).drop_duplicates()

In [None]:
# Order the reference table by code and renumber rows 0..n-1 so that a row's
# position and its index label coincide.
df = df.sort_values(by="code", ignore_index=True)

In [None]:
# Persist the cleaned reference table, then show its first rows as a sanity check.
df.to_csv("icd10_reference.csv", index=False)
print(df.head(n=5))

    code                                        description
0    A00                                            Cholera
1  A00.0  Cholera due to Vibrio cholerae 01, biovar chol...
2  A00.1    Cholera due to Vibrio cholerae 01, biovar eltor
3  A00.9                               Cholera, unspecified
4    A01                     Typhoid and paratyphoid fevers


In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used to compare diagnosis text with ICD descriptions.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every ICD description once, in the row order of `df`, so that
# position i of the embedding tensor corresponds to row i of `df`.
icd_embeddings = model.encode(
    df["description"].to_list(),
    convert_to_tensor=True,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Load the free-text diagnoses that need to be mapped to ICD-10 codes.
diagnoses_df = pd.read_csv("/content/Diagnoses_list - Sheet1.csv")

In [None]:
# Map each diagnosis to its most similar ICD-10 description.
#
# For every row of diagnoses_df['Diagnoses_list'] exactly one entry is appended
# to each of the five result lists below, so the lists stay aligned with the
# frame's rows and can be attached to it as columns afterwards.
#   - score >= CONFIDENCE_THRESHOLD: the best match is accepted ("High").
#   - otherwise: no code is assigned, the row is flagged for manual review and
#     the best TOP_K_ALTERNATIVES candidates are listed.
#   - missing / blank diagnosis text: flagged for manual review without
#     querying the model.

CONFIDENCE_THRESHOLD = 0.7  # adjust based on testing
TOP_K_ALTERNATIVES = 3  # how many candidate codes to list for low-confidence rows

mapped_codes = []
justifications = []
confidences = []
manual_review = []
alternatives_list = []

# Positional lookups built once instead of materialising a row Series via
# df.iloc[...] for every hit. Position i must correspond to icd_embeddings[i],
# which holds as long as the embeddings were computed from df['description']
# in the current row order.
icd_codes = df['code'].tolist()
icd_descriptions = df['description'].tolist()

for diag in diagnoses_df['Diagnoses_list']:
    # A blank CSV cell is read by pandas as NaN (a float), which is not valid
    # text for the encoder and would abort the whole run. Flag such rows for
    # review instead, so one bad row does not lose the others.
    if not isinstance(diag, str) or not diag.strip():
        mapped_codes.append((None, None))
        justifications.append(
            "No diagnosis text provided. Manual review recommended."
        )
        confidences.append("Low")
        manual_review.append("Yes")
        alternatives_list.append("")
        continue

    diag_embedding = model.encode(diag, convert_to_tensor=True)
    # Cosine similarity of this diagnosis against every ICD description;
    # [0] selects the single query row of the similarity matrix.
    scores = util.pytorch_cos_sim(diag_embedding, icd_embeddings)[0]

    top_score = scores.max().item()
    top_idx = scores.argmax().item()

    if top_score >= CONFIDENCE_THRESHOLD:
        # High confidence match
        code = icd_codes[top_idx]
        desc = icd_descriptions[top_idx]

        mapped_codes.append((code, desc))
        justifications.append(
            f"'{diag}' semantically matches '{desc}' (ICD: {code}) with high confidence (score = {top_score:.2f})."
        )
        confidences.append("High")
        manual_review.append("No")
        alternatives_list.append("")

    else:
        # Low confidence — return the top alternatives. topk yields the k
        # largest scores in descending order without sorting every score;
        # k is capped in case fewer ICD entries than requested exist.
        k = min(TOP_K_ALTERNATIVES, scores.numel())
        top_values, top_indices = scores.topk(k)
        alt_matches = [
            f"{icd_codes[i]} ({icd_descriptions[i]}, score={alt_score:.2f})"
            for i, alt_score in zip(top_indices.tolist(), top_values.tolist())
        ]

        mapped_codes.append((None, None))
        justifications.append(
            f"Low confidence match for '{diag}' (top score = {top_score:.2f}). Manual review recommended."
        )
        confidences.append("Low")
        manual_review.append("Yes")
        alternatives_list.append("; ".join(alt_matches))


In [None]:
# Attach the mapping results to the diagnoses table, one column per result
# list. mapped_codes holds (code, description) pairs, split here into two columns.
diagnoses_df = diagnoses_df.assign(
    ICD_Code=[pair[0] for pair in mapped_codes],
    ICD_Description=[pair[1] for pair in mapped_codes],
    Confidence=confidences,
    Manual_Review=manual_review,
    Justification=justifications,
    Alternatives=alternatives_list,
)

# Write the annotated table out for downstream review.
diagnoses_df.to_csv("bert_mapped_with_ambiguity.csv", index=False)

