In [1]:
!pip install transformers langchain faiss-cpu datasets sentence-transformers



In [3]:
!pip install langchain-community


Collecting langchain-community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.13 (from langchain-community)
  Downloading langchain-0.3.13-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.27 (from langchain-community)
  Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.0-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.2-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [5]:
import os
import pandas as pd
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Filesystem locations used by this notebook (all relative to the working directory)
DATASET_PATH = "Disease_Info.csv"  # input CSV with the disease descriptions
FAISS_INDEX_PATH = "faiss_index"   # directory the vector index is written to
LLM_MODEL_PATH = "fine_tuned_t5"   # directory the model and tokenizer are written to

# Step 1: Load Disease Dataset
# The CSV must provide the columns "Disease" and "Description"; both are read below.
data = pd.read_csv(filepath_or_buffer=DATASET_PATH)

# Step 2: Prepare Data for Retrieval
def prepare_retrieval_data(df: pd.DataFrame):
    """Extract parallel lists of description texts and disease names.

    Rows whose "Description" is missing are skipped: pandas represents them as
    float NaN, which is not text and must not be handed to an embedding model.
    The two returned lists always have the same length and stay row-aligned.

    Args:
        df: DataFrame containing at least the columns "Description" and
            "Disease". It is not modified.

    Returns:
        A ``(descriptions, diseases)`` tuple of two lists. ``descriptions``
        holds ``str`` values; ``diseases`` holds the matching "Disease" values.

    Raises:
        KeyError: If either required column is absent from ``df``.
    """
    required_columns = ("Description", "Disease")
    missing = [column for column in required_columns if column not in df.columns]
    if missing:
        raise KeyError(f"Dataset is missing required column(s): {missing}")

    # Filter on the frame (not on each column separately) to keep both lists aligned.
    usable_rows = df.dropna(subset=["Description"])
    descriptions = usable_rows["Description"].astype(str).tolist()
    diseases = usable_rows["Disease"].tolist()
    return descriptions, diseases

descriptions, diseases = prepare_retrieval_data(data)

# Step 3: Create and Save FAISS Index
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Build FAISS index.
# Each description is stored together with its disease name as document
# metadata, so a retrieved passage can be traced back to the condition it
# describes via `doc.metadata["disease"]`.
vectorstore = FAISS.from_texts(
    descriptions,
    embedding_model,
    metadatas=[{"disease": disease} for disease in diseases],
)

# Save FAISS index
os.makedirs(FAISS_INDEX_PATH, exist_ok=True)
vectorstore.save_local(FAISS_INDEX_PATH)
print(f"FAISS index saved to {FAISS_INDEX_PATH}")

# Step 4: Load Pre-Trained LLM and Save It
# NOTE: no training happens in this cell; what gets written to LLM_MODEL_PATH
# is the stock checkpoint named below, whatever the directory is called.
model_name = "t5-small"  # You can replace this with "t5-large" or another model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Save the model and tokenizer
model.save_pretrained(LLM_MODEL_PATH)
tokenizer.save_pretrained(LLM_MODEL_PATH)
print(f"LLM and tokenizer saved to {LLM_MODEL_PATH}")

# Upper bound on the length of a generated answer. Without an explicit limit
# the library's default generation length applies, which is very short
# (20 tokens per the transformers generation docs) for a description.
MAX_NEW_TOKENS = 128

# Wrap the model in a pipeline for LangChain
generator_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=MAX_NEW_TOKENS,
)
llm = HuggingFacePipeline(pipeline=generator_pipeline)

# Step 5: Define Retrieval-Augmented Generation (RAG) Pipeline
# return_source_documents=True makes the chain return the retrieved documents
# alongside the generated answer.
retrieval_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    return_source_documents=True
)

def query_rag(query):
    """Run ``query`` through the RAG chain, print the retrieved passages, return the answer.

    Args:
        query: Text passed to ``retrieval_chain`` under the ``"query"`` key.

    Returns:
        The value stored under ``"result"`` in the chain's response.
    """
    # invoke() hands back a dict; that is needed here because the chain
    # produces two outputs (the answer and the source documents).
    chain_output = retrieval_chain.invoke({"query": query})
    answer, retrieved_docs = chain_output["result"], chain_output["source_documents"]

    print("\nRetrieved Sources:")
    for position, source_doc in enumerate(retrieved_docs, start=1):
        print(f"Source {position}: {source_doc.page_content}")

    return answer


# Example Query: run a single lookup end-to-end and show the generated text
query = "diabetes insipidus"
result = query_rag(query=query)
print(f"Generated Description for {query}: {result}")


FAISS index saved to faiss_index


Device set to use cuda:0


LLM and tokenizer saved to fine_tuned_t5

Retrieved Sources:
Source 1: Diabetes insipidus is a condition in which the kidneys cannot conserve water, leading to excessive urination and dehydration. It is not the same as diabetes mellitus, which is a disorder of blood sugar regulation.
Source 2: Diabetes mellitus is a group of metabolic diseases characterized by high blood sugar levels over a prolonged period. Symptoms of high blood sugar include frequent urination, increased thirst, and increased hunger. If left untreated, diabetes can lead to serious health complications such as heart disease
Source 3: Diabetic peripheral neuropathy is a condition that can develop in people who have diabetes. It occurs when high blood glucose levels (hyperglycemia) damage the nerves that send signals from the brain to other parts of the body. The condition can affect different areas of the body, including
Source 4: Diabetic kidney disease is a kidney disease that develops in people with diabetes. It ca