In [None]:
import pandas as pd

In [None]:
df=pd.read_csv('Formatted_Diagnoses.csv')

In [None]:
df.head()

Unnamed: 0,Diagnosed
0,Diabetes mellitus without mention of complicat...
1,"Long-term (current) use of anticoagulants, Obe..."
2,"Acute respiratory failure, Hypopotassemia, Per..."
3,Antineoplastic and immunosuppressive drugs cau...
4,Personal history of malignant neoplasm of tong...


In [None]:
df['Diagnosed'][0]

'Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled, Pure hypercholesterolemia, Unspecified acquired hypothyroidism, Tobacco use disorder, Personal history of malignant melanoma of skin, Hypotension, unspecified, Acute appendicitis with generalized peritonitis, Long-term (current) use of aspirin, Acquired absence of organ, genital organs, Unspecified essential hypertension'

In [None]:
df.shape

(100, 1)

In [None]:
df_code=pd.read_csv('/content/output.csv')

In [None]:
df_code.head()

Unnamed: 0,code,cause
0,I,Certain infectious and parasitic diseases
1,A00-A09,Intestinal infectious diseases
2,A00,Cholera
3,A00.0,"Cholera due to Vibrio cholerae 01, biovar chol..."
4,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor"


In [None]:
!pip install langchain langchain_community langchain_chroma langchain_google_genai langchain_core


Collecting langchain
  Downloading langchain-0.3.4-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.3-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-0.1.4-py3-none-any.whl.metadata (1.6 kB)
Collecting langchain_google_genai
  Downloading langchain_google_genai-2.0.1-py3-none-any.whl.metadata (3.9 kB)
Collecting langchain_core
  Downloading langchain_core-0.3.12-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.136-py3-none-any.whl.metadata (13 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_se

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
type(df_code['cause'][0])

str

In [None]:
import os

# Take the Gemini API key from the GEMINI_API_KEY environment variable so the
# real secret never has to be typed into (and saved with) the notebook.
# When the variable is unset this falls back to the placeholder string the
# cell previously hard-coded, so existing behaviour is unchanged.
key = os.environ.get("GEMINI_API_KEY", "GEMINI_API_KEY")

In [None]:
from langchain.text_splitter import NLTKTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Google Generative AI embedding model, authenticated with `key`.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=key,
)

# RecursiveCharacterTextSplitter configured with chunk_size=500 and
# chunk_overlap=100; used to cut each text into chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=500,
)

# Collects the Document objects that will be handed to the vector store.
documents = list()

# Function to split text and create Document objects
def process_and_store_documents(cause_text, index, code=None):
    """Split one description into chunks and append them to ``documents``.

    Args:
        cause_text: Text of the ``cause`` column for one row of ``df_code``.
        index: Label of that row; stored in each chunk's metadata as
            ``row_index``.
        code: Optional value of the row's ``code`` column. When it is a
            string it is stored in each chunk's metadata as ``icd_code`` so
            the code stays attached to whatever chunk is later retrieved.
            Omitting it keeps the previous metadata layout.

    Returns:
        None. The module-level ``documents`` list is extended in place.
        Rows whose ``cause_text`` is not a string (e.g. a missing value) are
        skipped, because the splitter can only work on text.
    """
    if not isinstance(cause_text, str):
        return

    chunks = text_splitter.split_text(cause_text)  # Split the text into chunks

    for i, chunk in enumerate(chunks):
        metadata = {"original_text": cause_text, "chunk_index": i, "row_index": index}
        # Without the code in the stored document, a retrieved chunk only
        # carries the description and nothing ties it back to its ICD code.
        if isinstance(code, str):
            metadata["icd_code"] = code
        documents.append(Document(page_content=chunk, metadata=metadata))

# Walk every row of df_code and register its description together with its code
for index, row in df_code.iterrows():
    process_and_store_documents(row['cause'], index, row['code'])

# Build the Chroma vector store from the collected documents, embedding them
# with `embeddings` and persisting under ./chroma_db.
vectorstore = Chroma.from_documents(
    documents,
    embedding=embeddings,
    persist_directory="./chroma_db",
)


In [None]:
from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate

# Two-message chat prompt: a fixed system instruction followed by a human
# message template with {context} and {question} placeholders.
chat_template = ChatPromptTemplate.from_messages([
    # System message: static text (no placeholders) that sets the ICD-10 mapping task and the JSON answer format
SystemMessage(content="""You are an advanced AI specialized in assisting with medical diagnoses and retrieving ICD codes.
                  Your primary task is to accurately map each provided diagnosis to its corresponding ICD-10 code based on the context given.
                  Ensure that your response is both concise and precise, strictly relying on the information provided to avoid inaccuracies.

                  Guidelines for Response:
                  - Prioritize matching the diagnosis to the most relevant ICD-10 code, considering specificity and clinical relevance.
                  - If an exact ICD-10 code is not available, provide the closest relevant code related to the diagnosis.
                  - If no response can be generated, do not respond blank, strive to find any relevant information that can be associated with the diagnosis to enhance the response.
                  - Include a clear description for each diagnosis along with its associated ICD-10 code.
                  - Format your response as a JSON object with two fields: "ICD-10 code" and "description".
                  - Maintain clarity and avoid unnecessary information to ensure the user receives a direct and useful response.
                  """),



    # Human message: template filled per call with the "context" and "question" values
    HumanMessagePromptTemplate.from_template("""Answer the question based on the given context.
    Context: {context}
    Question: {question}
    Answer: """)
])


In [None]:
from langchain_core.output_parsers import StrOutputParser

output_parser = StrOutputParser()

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model ("gemini-1.5-flash"), authenticated with `key`.
chat_model = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    google_api_key=key,
)

In [None]:
# Retriever over the vector store: similarity search with k=1.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1},
)


In [None]:
from langchain_core.runnables import RunnablePassthrough


In [None]:
# Pipeline: the invoked value goes to the retriever (as "context") and is also
# passed through unchanged (as "question"); the filled prompt is then sent to
# the chat model and the reply is run through output_parser.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | chat_template | chat_model | output_parser
)

In [None]:
df["Diagnosed"][13]

'Syncope and collapse, Backache, unspecified, Acute kidney failure, unspecified, Diabetes with ketoacidosis, type II or unspecified type, uncontrolled, Percutaneous transluminal coronary angioplasty status, Unspecified essential hypertension, Other and unspecified hyperlipidemia, Intestinal infection due to other organism, not elsewhere classified, Dehydration, Coronary atherosclerosis of unspecified type of vessel, native or graft'

In [None]:
response = rag_chain.invoke(df["Diagnosed"][13])
response


'```json\n{\n  "ICD-10 code": "N18.9",\n  "description": "Acute kidney failure, unspecified"\n}\n```'

# **For making the final Dataset**

In [None]:
df_final=pd.read_csv('/content/data.csv')

In [None]:
df_final.head()


Unnamed: 0,Diagnosed,ICD-10 code,description
0,Diabetes mellitus without mention of complicat...,E11.9,Diabetes mellitus without mention of complicat...
1,"Long-term (current) use of anticoagulants, Obe...",I80.9,Phlebitis and thrombophlebitis of lower extrem...
2,"Acute respiratory failure, Hypopotassemia, Per...",I13.9,"Hypertensive heart disease, unspecified"
3,Antineoplastic and immunosuppressive drugs cau...,I13.9,"Hypertensive heart and renal disease, unspecified"
4,Personal history of malignant neoplasm of tong...,Z85.3,Personal history of malignant neoplasm of tongue


In [None]:
import pandas as pd

# Assume `response` contains the output from RAG in JSON format for each diagnosis
def get_icd_code_and_description(diagnosis, chain=None):
    """Look up one diagnosis and return its ``(ICD-10 code, description)`` pair.

    The chain is invoked once with ``diagnosis`` and its reply is parsed as
    the JSON object requested by the prompt (fields "ICD-10 code" and
    "description").

    Args:
        diagnosis: Diagnosis text to send to the chain.
        chain: Object exposing ``invoke(text)``; defaults to the
            notebook-level ``rag_chain``.

    Returns:
        ``(icd_code, description)``. A field missing from the reply yields
        ``"Unknown"`` / ``"Description not available"``; a reply that holds
        no parseable JSON object yields ``("Error", "Invalid JSON")``.
    """
    import json  # local import: this cell only imports pandas

    if chain is None:
        chain = rag_chain

    reply = str(chain.invoke(diagnosis))

    # Replies usually arrive wrapped in a Markdown code fence; keep only the
    # outermost {...} span so the wrapper does not break JSON parsing.
    start, end = reply.find("{"), reply.rfind("}")
    if start == -1 or end < start:
        return "Error", "Invalid JSON"

    try:
        payload = json.loads(reply[start:end + 1])
    except json.JSONDecodeError:
        return "Error", "Invalid JSON"

    icd_code = payload.get("ICD-10 code", "Unknown")
    description = payload.get("description", "Description not available")
    return icd_code, description

# Run the lookup on the first 30 entries of 'Diagnosed' and write the two
# returned values into the 'ICD-10 code' and 'description' columns of df
df[['ICD-10 code', 'description']] = (
    df['Diagnosed']
    .iloc[:30]
    .apply(lambda diag: pd.Series(get_icd_code_and_description(diag)))
)

# Show the first rows of the updated frame
print(df.head())


In [None]:
# Remove leading/trailing whitespace from every column label of df_final
stripped_labels = df_final.columns.str.strip()
df_final.columns = stripped_labels

# Print the resulting labels as a plain list to confirm the clean-up
print(df_final.columns.tolist())


['Diagnosed', 'ICD-10 code', 'description']


In [None]:
import pandas as pd
import json

# Function to extract ICD-10 code and description from the RAG model's response
def get_icd_code_and_description(diagnosis, chain=None):
    """Look up one diagnosis and return its ``(ICD-10 code, description)`` pair.

    Args:
        diagnosis: Diagnosis text to send to the chain.
        chain: Object exposing ``invoke(text)``; defaults to the
            notebook-level ``rag_chain``.

    Returns:
        ``(icd_code, description)``. A field missing from the reply yields
        ``"Unknown"`` / ``"Description not available"``; a reply that holds
        no parseable JSON object yields ``("Error", "Invalid JSON")``.
    """
    if chain is None:
        chain = rag_chain

    response = str(chain.invoke(diagnosis))

    # Keep only the outermost {...} span. This copes with any fence variant
    # (```json, bare ```, different line endings) and with prose around the
    # object, instead of relying on one exact "```json\n" prefix.
    start, end = response.find("{"), response.rfind("}")
    if start == -1 or end < start:
        return "Error", "Invalid JSON"

    try:
        response_dict = json.loads(response[start:end + 1])
    except json.JSONDecodeError:
        return "Error", "Invalid JSON"

    icd_code = response_dict.get("ICD-10 code", "Unknown")
    description = response_dict.get("description", "Description not available")
    return icd_code, description

# Values that get_icd_code_and_description leaves in 'ICD-10 code' when a
# lookup did not produce a usable code: "Unknown" (field missing from the
# reply) and "Error" (reply was not valid JSON). Both need a retry.
FAILED_CODES = ["Unknown", "Error"]

# Upper bound on how many failed rows are retried per run of this cell.
MAX_RETRIES_PER_RUN = 10

# Identify rows whose previous lookup failed
error_rows = df_final[df_final["ICD-10 code"].isin(FAILED_CODES)]

# Retry the first MAX_RETRIES_PER_RUN failed diagnoses and overwrite their results
for index, row in error_rows.head(MAX_RETRIES_PER_RUN).iterrows():
    icd_code, description = get_icd_code_and_description(row['Diagnosed'])
    df_final.at[index, 'ICD-10 code'] = icd_code  # Update the ICD-10 code
    df_final.at[index, 'description'] = description  # Update the description

# Display the rows that still carry a failure value after this pass
print(df_final[df_final["ICD-10 code"].isin(FAILED_CODES)])


Empty DataFrame
Columns: [Diagnosed, ICD-10 code, description]
Index: []


In [None]:
df_final.to_csv('final.csv', index=False)