In [None]:
# Import necessary libraries
import pandas as pd
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA, LLMChain
from langchain.agents import Tool, initialize_agent, AgentType
from transformers import LlamaTokenizer, LlamaForCausalLM
import torch
from langchain import PromptTemplate
# Step 1: Load Data
hospitals_df = pd.read_csv('hospitals.csv')
patients_df = pd.read_csv('patients.csv')
payers_df = pd.read_csv('payers.csv')
physicians_df = pd.read_csv('physicians.csv')
reviews_df = pd.read_csv('reviews.csv')
visits_df = pd.read_csv('visits.csv')

# Handle missing values function
def handle_missing_values(df, strategy='fill'):
    if strategy == 'fill':
        return df.fillna('')
    elif strategy == 'drop':
        return df.dropna()
    else:
        raise ValueError("Strategy not recognized. Use 'fill' or 'drop'.")

# Prepare data (your code adapted)
# 1. Prepare texts for each dataset separately
dataset_texts = {
    'hospitals': [],
    'payers': [],
    'physicians': [],
    'patient_reviews': [],
    'visits': [],
    'physician_reviews': [],
    'patient_data': []
}

# Hospitals
hospitals_df['combined_text'] = (
    "Hospital Name: " + hospitals_df['hospital_name'] +
    ", State: " + hospitals_df['hospital_state']
)
dataset_texts['hospitals'].extend(hospitals_df['combined_text'].dropna().tolist())

# Payers
payers_df['combined_text'] = "Payer Name: " + payers_df['payer_name']
dataset_texts['payers'].extend(payers_df['combined_text'].dropna().tolist())

# Physicians
physicians_df['combined_text'] = (
    "Physician Name: Dr. " + physicians_df['physician_name'] +
    ", Medical School: " + physicians_df['medical_school'] +
    ", Graduation Year: " + physicians_df['physician_grad_year'].astype(str) +
    ", Salary: $" + physicians_df['salary'].astype(str)
)
dataset_texts['physicians'].extend(physicians_df['combined_text'].dropna().tolist())

# Patient Reviews
dataset_texts['patient_reviews'].extend(reviews_df['review'].dropna().tolist())

# Visits
visits_df['combined_text'] = (
    "Chief Complaint: " + visits_df['chief_complaint'] +
    ", Treatment Description: " + visits_df['treatment_description'] +
    ", Primary Diagnosis: " + visits_df['primary_diagnosis'] +
    ", Billing Amount: $" + visits_df['billing_amount'].astype(str)
)
dataset_texts['visits'].extend(visits_df['combined_text'].dropna().tolist())

# Physician Reviews
physician_reviews_df = reviews_df.groupby('physician_name')['review'].apply(lambda x: ' '.join(x)).reset_index()
for idx, row in physician_reviews_df.iterrows():
    physician_name = row['physician_name']
    reviews_text = row['review']
    combined_text = f"Physician Name: Dr. {physician_name}, Reviews: {reviews_text}"
    dataset_texts['physician_reviews'].append(combined_text)

# Patient Data
# Prepare patient data as per your code
patient_reviews = reviews_df.merge(patients_df[['patient_id', 'patient_name']], on='patient_name', how='left')
patient_reviews_grouped = patient_reviews.groupby(['patient_id', 'patient_name'])['review'].apply(lambda x: ' '.join(x)).reset_index()
patient_costs = visits_df.groupby('patient_id')['billing_amount'].agg(['sum', 'mean']).reset_index()
patient_costs.rename(columns={'sum': 'total_cost', 'mean': 'average_cost'}, inplace=True)
patient_costs = patient_costs.merge(patients_df[['patient_id', 'patient_name']], on='patient_id', how='left')
patient_visits_info = visits_df.groupby('patient_id').agg({
    'chief_complaint': lambda x: ' | '.join(x.dropna().unique()),
    'treatment_description': lambda x: ' | '.join(x.dropna().unique())
}).reset_index()
patient_visits_info = patient_visits_info.merge(patients_df[['patient_id', 'patient_name']], on='patient_id', how='left')
patient_data = patient_reviews_grouped.merge(patient_costs, on=['patient_id', 'patient_name'], how='outer')
patient_data = patient_data.merge(patient_visits_info, on=['patient_id', 'patient_name'], how='outer')
patient_data = handle_missing_values(patient_data, strategy='fill')
patient_data['chief_complaint'] = patient_data['chief_complaint'].replace("", "No record")
patient_data['treatment_description'] = patient_data['treatment_description'].replace("", "No record")
# Initialize a list to hold patient texts
patient_texts = []
for idx, row in patient_data.iterrows():
    patient_name = row['patient_name']
    reviews_text = row['review']
    total_cost = row['total_cost']
    average_cost = row['average_cost']
    chief_complaints = row['chief_complaint']
    treatments = row['treatment_description']
    # Create a combined text for the patient
    combined_text = f"Patient Name: {patient_name}, "
    combined_text += f"Total Cost: {total_cost}, Average Cost per Visit: {average_cost}, "
    combined_text += f"Chief Complaints: {chief_complaints}, "
    combined_text += f"Treatments Received: {treatments}, "
    combined_text += f"Reviews: {reviews_text}"
    patient_texts.append(combined_text)
dataset_texts['patient_data'].extend(patient_texts)


# Step 2: Chunk Texts with Metadata
def chunk_text_with_metadata(texts, dataset_name, max_length=500, overlap=50):
    chunks = []
    for text in texts:
        words = text.split()
        start = 0
        while start < len(words):
            end = min(start + max_length, len(words))
            chunk = ' '.join(words[start:end])
            chunks.append({
                'text': chunk,
                'dataset': dataset_name
            })
            start += max_length - overlap
    return chunks

all_chunks = []
for dataset_name, texts in dataset_texts.items():
    # Adjust chunking parameters as needed
    dataset_chunks = chunk_text_with_metadata(texts, dataset_name, max_length=500, overlap=100)
    all_chunks.extend(dataset_chunks)

chunked_texts = [chunk['text'] for chunk in all_chunks]
chunked_metadatas = [{'dataset': chunk['dataset']} for chunk in all_chunks]



# Step 3: Create the Chroma Vector Store with Embeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vector_store = Chroma.from_texts(
    texts=chunked_texts,
    embedding=embedding_model,
    metadatas=chunked_metadatas,
    persist_directory="chroma_db"
)

vector_store.persist()

# Initialize the retriever
retriever = vector_store.as_retriever()
def retrieve_top_k(query, k=3):
    docs = retriever.get_relevant_documents(query)[:k]
    return docs

# Step 4: Initialize the Local LLaMA Model with CUDA Support

# Load the tokenizer and model
from transformers import LlamaTokenizer, LlamaForCausalLM

# Replace 'YourModelName' with the actual model you want to use
# For example, 'decapoda-research/llama-7b-hf' (if you have access)
tokenizer = LlamaTokenizer.from_pretrained('NousResearch/Llama-2-7b-hf')
model = LlamaForCausalLM.from_pretrained(
    'NousResearch/Llama-2-7b-hf',
    torch_dtype=torch.float16,
    device_map=None
)

# Move the model to CUDA if available
if torch.cuda.is_available():
    device = torch.device('cuda')
    model.to(device)
    print("Using CUDA device")
else:
    device = torch.device('cpu')
    print("Using CPU device")

# Define the text generation function
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.5,
        top_p=0.95,
        repetition_penalty=1.15
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Create a custom LLM class
from langchain.llms.base import LLM
from typing import Optional, List

class CustomLLM(LLM):
    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        return generate_text(prompt)
    
    @property
    def _llm_type(self) -> str:
        return "custom"

# Initialize the custom LLM
llm = CustomLLM()

# Step 5: Create a Custom Prompt Template
template = """
You are an intelligent assistant that provides detailed and accurate answers based on the provided context.

Context:
{context}

Question: {question}

Answer:
"""

prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

# Step 6: Create a Custom LLM Chain with the Custom Prompt
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    verbose=True
)

# Step 7: Create the RetrievalQA Chain with the Custom LLM Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm_chain,
    chain_type="stuff",  # You can explore other chain types like "map_reduce" if needed
    retriever=retriever,
    return_source_documents=True,
    verbose=True
    max_iterations=3,

)


# Step 6: Define Custom Functions (Tools)
# [Your custom functions remain the same]

import time
import random

def get_current_wait_time(hospital_name: str) -> str:
    """Simulate retrieving current wait time for a hospital."""
    if hospital_name not in hospitals_df['hospital_name'].values:
        return f"Hospital '{hospital_name}' does not exist."
    time.sleep(1)  # Simulate API call delay
    wait_time = random.randint(10, 120)  # Simulate wait time in minutes
    return f"The current wait time at {hospital_name} is approximately {wait_time} minutes."

def get_physician_rating(physician_name: str) -> str:
    """Calculate average rating for a physician."""
    physician_reviews = reviews_df[reviews_df['physician_name'] == physician_name]
    if physician_reviews.empty:
        return f"No reviews found for Dr. {physician_name}."
    # Ensure 'rating' column exists; if not, create a simulated one
    if 'rating' not in physician_reviews.columns:
        physician_reviews['rating'] = [random.randint(1, 5) for _ in range(len(physician_reviews))]
    average_rating = physician_reviews['rating'].mean()
    return f"The average rating for Dr. {physician_name} is {average_rating:.1f} out of 5."

# Step 7: Wrap Functions as Tools
tools = [
    Tool(
        name="Get Current Wait Time",
        func=get_current_wait_time,
        description="Use this function to get the current wait time at a hospital. Input should be the hospital name."
    ),
    Tool(
        name="Get Physician Rating",
        func=get_physician_rating,
        description="Use this function to get the average rating of a physician based on patient reviews. Input should be the physician's name."
    )
]

# Step 8: Initialize the Agent
# memory = ConversationBufferMemory(memory_key="chat_history", return_messages=False)
# def preload_memory(memory, query):
#     """
#     Preload the memory with the initial query to set context.
#     """
#     memory.buffer.append({"role": "user", "content": query})


agent = initialize_agent(
    tools=tools,
    llm=qa_chain,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    # memory=memory,
    retriever=retriever,
    max_iterations=3,
)

# Step 9: Test the Agent
def handle_query(query):
    try:
        response = agent.run(query)
    except Exception as e:
        response = "I'm sorry, I encountered an error while processing your request."
        print(f"Error: {str(e)}")
    return response


# Example Usage
if __name__ == "__main__":
    # Test Query 1
    query = "What are the salary and review for Dr. Smith?"
    answer = handle_query(query)
    print("\nQuery 1:")
    print(f"Question: {query}")
    print(f"Answer: {answer}")
    
    # Test Query 2
    query = "What is the current wait time at Mercy Hospital?"
    print("\nQuery 2:")
    print(f"Question: {query}")
    print(f"Answer: {answer}")
    



Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s]


Using CUDA device


[1m> Entering new AgentExecutor chain...[0m


[1m> Entering new RetrievalQA chain...[0m
Error: 'StringPromptValue' object has no attribute 'replace'

Query 1:
Question: What are the salary and review for Dr. Smith?
Answer: I'm sorry, I encountered an error while processing your request.

Query 2:
Question: What is the current wait time at Mercy Hospital?
Answer: I'm sorry, I encountered an error while processing your request.


In [6]:
import torch

print(f"CUDA Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"Device Name: {torch.cuda.get_device_name(0)}")


CUDA Available: True
Device Name: NVIDIA GeForce RTX 3070


In [7]:
import torch
print(torch.version.cuda)


12.1


In [8]:
# !pip install pandas langchain transformers sentencepiece chromadb
# !pip install -U langchain-community
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# !pip install sentence-transformers