In [None]:
# import necessary libraries
import pandas as pd
import numpy as np
import math
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.agents import Tool, initialize_agent, AgentType
from transformers import LlamaTokenizer, LlamaForCausalLM
import torch
from langchain.llms.base import LLM
from typing import Optional, List
from transformers import LlamaTokenizer, LlamaForCausalLM


# Load every source table from its CSV file (paths are relative to the
# current working directory). The files are read in the order listed.
(
    hospitals_df,
    patients_df,
    payers_df,
    physicians_df,
    reviews_df,
    visits_df,
) = map(
    pd.read_csv,
    (
        'hospitals.csv',
        'patients.csv',
        'payers.csv',
        'physicians.csv',
        'reviews.csv',
        'visits.csv',
    ),
)

# missing values
def handle_missing_values(df, strategy='fill'):
    """Return ``df`` with its missing values either filled or dropped.

    Args:
        df: pandas object to clean (a DataFrame everywhere in this file).
        strategy: ``'fill'`` replaces every missing value with an empty
            string; ``'drop'`` removes every row that contains a missing
            value.

    Returns:
        The result of ``df.fillna('')`` or ``df.dropna()``.

    Raises:
        ValueError: If ``strategy`` is neither ``'fill'`` nor ``'drop'``.
    """
    # Reject unknown strategies up front so the happy path is a single return.
    if strategy not in ('fill', 'drop'):
        raise ValueError("Strategy not recognized. Use 'fill' or 'drop'.")
    return df.fillna('') if strategy == 'fill' else df.dropna()

# One bucket of text snippets per logical dataset. Each key gets its own
# empty list; the key order here is the iteration order of the dict.
dataset_texts = {
    name: []
    for name in (
        'hospitals',
        'payers',
        'physicians',
        'patient_reviews',
        'visits',
        'physician_reviews',
        'patient_data',
    )
}

# hospitals: one sentence per hospital. A missing value in either string
# column makes the concatenated sentence missing too, and dropna() then keeps
# that row out of the corpus.
hospital_sentences = (
    "Hospital Name: " + hospitals_df['hospital_name']
    + ", State: " + hospitals_df['hospital_state']
)
hospitals_df['combined_text'] = hospital_sentences
dataset_texts['hospitals'] += hospital_sentences.dropna().tolist()

# payers: one sentence per payer.
payer_sentences = "Payer Name: " + payers_df['payer_name']
payers_df['combined_text'] = payer_sentences
dataset_texts['payers'] += payer_sentences.dropna().tolist()

# physicians: graduation year and salary are converted with astype(str)
# before concatenation so they can be appended to the text.
physician_sentences = (
    "Physician Name: Dr. " + physicians_df['physician_name']
    + ", Medical School: " + physicians_df['medical_school']
    + ", Graduation Year: " + physicians_df['physician_grad_year'].astype(str)
    + ", Salary: $" + physicians_df['salary'].astype(str)
)
physicians_df['combined_text'] = physician_sentences
dataset_texts['physicians'] += physician_sentences.dropna().tolist()

# patient reviews: the raw review text, skipping missing entries.
dataset_texts['patient_reviews'] += reviews_df['review'].dropna().tolist()

# visits: complaint, treatment, diagnosis and billing amount in one sentence.
visit_sentences = (
    "Chief Complaint: " + visits_df['chief_complaint']
    + ", Treatment Description: " + visits_df['treatment_description']
    + ", Primary Diagnosis: " + visits_df['primary_diagnosis']
    + ", Billing Amount: $" + visits_df['billing_amount'].astype(str)
)
visits_df['combined_text'] = visit_sentences
dataset_texts['visits'] += visit_sentences.dropna().tolist()

# physician reviews: one document per physician containing all of that
# physician's reviews joined by single spaces.
# Rows without review text are removed before grouping because ' '.join()
# raises TypeError on the float NaN pandas uses for empty cells; astype(str)
# guards against any other non-string value in the column.
physician_reviews_df = (
    reviews_df.dropna(subset=['review'])
    .groupby('physician_name')['review']
    .apply(lambda reviews: ' '.join(reviews.astype(str)))
    .reset_index()
)
dataset_texts['physician_reviews'].extend(
    f"Physician Name: Dr. {physician_name}, Reviews: {reviews_text}"
    for physician_name, reviews_text in zip(
        physician_reviews_df['physician_name'],
        physician_reviews_df['review'],
    )
)

# patient data: one document per patient combining their reviews, visit
# costs, chief complaints and treatments.
# NOTE(review): reviews are matched to patients by name only, so patients who
# share a name would be credited with each other's reviews — confirm that
# patient names are unique in patients.csv.
patient_reviews = reviews_df.merge(patients_df[['patient_id', 'patient_name']], on='patient_name', how='left')
# Rows without review text are removed before grouping because ' '.join()
# raises TypeError on the float NaN pandas uses for empty cells.
patient_reviews_grouped = (
    patient_reviews.dropna(subset=['review'])
    .groupby(['patient_id', 'patient_name'])['review']
    .apply(lambda reviews: ' '.join(reviews.astype(str)))
    .reset_index()
)

# Total and mean billing amount per patient.
patient_costs = visits_df.groupby('patient_id')['billing_amount'].agg(['sum', 'mean']).reset_index()
patient_costs.rename(columns={'sum': 'total_cost', 'mean': 'average_cost'}, inplace=True)
patient_costs = patient_costs.merge(patients_df[['patient_id', 'patient_name']], on='patient_id', how='left')

# Distinct non-missing complaints and treatments per patient, ' | ' separated.
patient_visits_info = visits_df.groupby('patient_id').agg({
    'chief_complaint': lambda x: ' | '.join(x.dropna().unique()),
    'treatment_description': lambda x: ' | '.join(x.dropna().unique())
}).reset_index()
patient_visits_info = patient_visits_info.merge(patients_df[['patient_id', 'patient_name']], on='patient_id', how='left')

# Outer merges keep patients that appear in only some of the three tables;
# the gaps this creates are blanked out by handle_missing_values below.
patient_data = patient_reviews_grouped.merge(patient_costs, on=['patient_id', 'patient_name'], how='outer')
patient_data = patient_data.merge(patient_visits_info, on=['patient_id', 'patient_name'], how='outer')
patient_data = handle_missing_values(patient_data, strategy='fill')
patient_data['chief_complaint'] = patient_data['chief_complaint'].replace("", "No record")
patient_data['treatment_description'] = patient_data['treatment_description'].replace("", "No record")

patient_texts = [
    f"Patient Name: {row.patient_name}, "
    f"Total Cost: {row.total_cost}, Average Cost per Visit: {row.average_cost}, "
    f"Chief Complaints: {row.chief_complaint}, "
    f"Treatments Received: {row.treatment_description}, "
    f"Reviews: {row.review}"
    for row in patient_data.itertuples(index=False)
]
dataset_texts['patient_data'].extend(patient_texts)

# chunk texts
def chunk_text_with_metadata(texts, dataset_name, max_length=500, overlap=50):
    """Split texts into overlapping word-based chunks tagged with a dataset.

    Each text is split on whitespace and cut into windows of at most
    ``max_length`` words; consecutive windows of the same text share
    ``overlap`` words. Chunking stops as soon as a window reaches the end of
    the text, so no trailing chunk consists solely of words already present
    in the previous chunk.

    Args:
        texts: Iterable of strings to chunk.
        dataset_name: Value stored under the ``'dataset'`` key of every chunk.
        max_length: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_length``.

    Returns:
        A list of ``{'text': str, 'dataset': dataset_name}`` dicts, in input
        order. Texts without any words contribute no chunks.

    Raises:
        ValueError: If ``max_length`` or ``overlap`` is out of range. A
            non-positive step would otherwise never advance through the text.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer.")
    if not 0 <= overlap < max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length.")

    step = max_length - overlap
    chunks = []
    for text in texts:
        words = text.split()
        for start in range(0, len(words), step):
            end = start + max_length
            chunks.append({
                'text': ' '.join(words[start:end]),
                'dataset': dataset_name
            })
            # This window already covers the tail of the text; a further
            # window would only repeat words from the overlap region.
            if end >= len(words):
                break
    return chunks

# Chunk every dataset and flatten the results into one list, following the
# iteration order of dataset_texts.
all_chunks = [
    chunk
    for dataset_name, texts in dataset_texts.items()
    for chunk in chunk_text_with_metadata(texts, dataset_name, max_length=500, overlap=100)
]

# Parallel lists: chunked_texts[i] is described by chunked_metadatas[i].
chunked_texts = []
chunked_metadatas = []
for chunk in all_chunks:
    chunked_texts.append(chunk['text'])
    chunked_metadatas.append({'dataset': chunk['dataset']})


# chroma vector
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Give every chunk a deterministic id derived from its dataset and position.
# Without explicit ids each run gets fresh random ids, so re-running this
# cell against the persisted "chroma_db" directory would store the whole
# corpus again and retrieval would start returning duplicates. With stable
# ids a re-run is expected to overwrite the existing entries instead
# (NOTE(review): relies on the Chroma wrapper upserting by id — confirm for
# the installed LangChain/Chroma versions).
chunk_ids = [
    f"{metadata['dataset']}-{position}"
    for position, metadata in enumerate(chunked_metadatas)
]

vector_store = Chroma.from_texts(
    texts=chunked_texts,
    embedding=embedding_model,
    metadatas=chunked_metadatas,
    ids=chunk_ids,
    persist_directory="chroma_db"
)
vector_store.persist()

# Default retriever over the store; used by the RetrievalQA chain.
retriever = vector_store.as_retriever()

def retrieve_top_k(query, k=3):
    """Return the ``k`` documents most similar to ``query``.

    The vector store is queried with ``k`` directly. Slicing the output of
    the default retriever instead would silently cap the result at the
    retriever's own default result count, however large ``k`` is.

    Args:
        query: Free-text search string.
        k: Maximum number of documents to return.

    Returns:
        A list of at most ``k`` documents, most similar first; an empty list
        when ``k`` is not positive.
    """
    if k <= 0:
        return []
    return vector_store.similarity_search(query, k=k)



# model
# Choose the device before loading so the weights can be created in a dtype
# that suits it: half precision on GPU, full precision on CPU (half-precision
# kernels are missing or very slow on CPU in many PyTorch builds).
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
model_dtype = torch.float16 if use_cuda else torch.float32

tokenizer = LlamaTokenizer.from_pretrained('NousResearch/Llama-2-7b-hf')
model = LlamaForCausalLM.from_pretrained(
    'NousResearch/Llama-2-7b-hf',
    torch_dtype=model_dtype,
    device_map=None
)
model.to(device)

print("Using CUDA device" if use_cuda else "Using CPU device")

def generate_text(prompt):
    """Generate a completion for ``prompt`` with the module-level model.

    Only the newly generated text is returned. The sequence produced by
    ``model.generate`` starts with the prompt tokens, and decoding all of it
    would echo the prompt back to the caller; downstream parsers (such as the
    agent's output parser) would then read the prompt's own format
    instructions as if they were model output.

    Args:
        prompt: Text to condition the generation on.

    Returns:
        The decoded continuation (at most 512 new tokens), without the prompt
        and without special tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_token_count = inputs["input_ids"].shape[1]
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.5,
        top_p=0.95,
        repetition_penalty=1.15
    )
    # Skip the leading prompt tokens so only the continuation is decoded.
    completion_ids = outputs[0][prompt_token_count:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)

class CustomLLM(LLM):
    """LangChain LLM adapter that delegates generation to ``generate_text``."""

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Return the model's completion for ``prompt``, honoring ``stop``.

        Args:
            prompt: Full prompt text supplied by the chain or agent.
            stop: Optional stop sequences. The completion is cut at the
                earliest occurrence of any of them, so an agent that asks
                generation to stop before a given marker does not receive
                text the model invented past that marker.

        Returns:
            The completion text without an echoed prompt and without
            anything from the first stop sequence onward.
        """
        text = generate_text(prompt)

        # Drop an echoed prompt if the generator returned prompt + completion;
        # otherwise stop sequences that also occur inside the prompt would
        # truncate the text before the completion even starts.
        if text.startswith(prompt):
            text = text[len(prompt):]

        if stop:
            cut_positions = [text.find(sequence) for sequence in stop if sequence]
            cut_positions = [position for position in cut_positions if position != -1]
            if cut_positions:
                text = text[:min(cut_positions)]
        return text

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses to label this LLM implementation."""
        return "custom"

# Single LLM instance shared by the QA chain below and by the agent.
llm = CustomLLM()

# Retrieval QA over the vector store: documents fetched by the retriever are
# combined with the "stuff" chain type, and the source documents are returned
# alongside the answer.
qa_chain_options = {
    "llm": llm,
    "chain_type": "stuff",
    "retriever": retriever,
    "return_source_documents": True,
}
qa_chain = RetrievalQA.from_chain_type(**qa_chain_options)

import time
import random

def get_current_wait_time(hospital_name: str) -> str:
    """Return a sentence with the (simulated) wait time at a hospital.

    The wait time is a random integer between 10 and 120 minutes; it is not
    read from any data source. The hospital lookup ignores surrounding
    whitespace, surrounding quotes and letter case, because agent-generated
    tool input frequently carries such noise and an exact comparison would
    reject an otherwise valid name.

    Args:
        hospital_name: Name of the hospital to look up.

    Returns:
        A sentence with the wait time, or a message saying the hospital does
        not exist when no name in ``hospitals_df`` matches.
    """
    requested = hospital_name.strip().strip('"\'').strip()
    known_names = hospitals_df['hospital_name'].dropna().astype(str)
    matches = known_names[known_names.str.strip().str.casefold() == requested.casefold()]
    if matches.empty:
        return f"Hospital '{requested}' does not exist."
    # Report the name as spelled in the data rather than as typed by the caller.
    canonical_name = matches.iloc[0]
    wait_time = random.randint(10, 120)
    return f"The current wait time at {canonical_name} is approximately {wait_time} minutes."

def get_physician_rating(physician_name: str) -> str:
    """Return a sentence with the average rating of a physician.

    The physician lookup ignores surrounding whitespace, surrounding quotes
    and letter case so that noisy agent-generated tool input still matches.
    When ``reviews_df`` has no ``'rating'`` column, a random 1-5 rating is
    drawn per review as a placeholder; the result is then NOT based on real
    data.

    Args:
        physician_name: Name of the physician as stored in ``reviews_df``.

    Returns:
        A sentence with the average rating (one decimal place), or a message
        saying no reviews were found.
    """
    requested = physician_name.strip().strip('"\'').strip()
    if not requested:
        return f"No reviews found for Dr. {requested}."

    stored_names = reviews_df['physician_name'].fillna('').astype(str)
    physician_reviews = reviews_df[stored_names.str.strip().str.casefold() == requested.casefold()]
    if physician_reviews.empty:
        return f"No reviews found for Dr. {requested}."

    if 'rating' in physician_reviews.columns:
        average_rating = physician_reviews['rating'].mean()
    else:
        # Keep the placeholder ratings in a local list instead of assigning a
        # new column on the filtered slice, which pandas flags with
        # SettingWithCopyWarning.
        placeholder_ratings = [random.randint(1, 5) for _ in range(len(physician_reviews))]
        average_rating = sum(placeholder_ratings) / len(placeholder_ratings)
    return f"The average rating for Dr. {requested} is {average_rating:.1f} out of 5."

def answer_from_records(question: str) -> str:
    """Answer a free-text question using the retrieval QA chain.

    The chain returns a dict (answer plus source documents); only the answer
    text is handed back so it can be used as a tool observation.
    """
    return qa_chain({"query": question})["result"]


# Tools the agent may call. The record-search tool is the agent's only way to
# reach the indexed hospital data; the other two are standalone helper
# functions.
tools = [
    Tool(
        name="Get Current Wait Time",
        func=get_current_wait_time,
        description="Use this function to get the current wait time at a hospital. Input should be the hospital name."
    ),
    Tool(
        name="Get Physician Rating",
        func=get_physician_rating,
        description="Use this function to get the average rating of a physician based on patient reviews. Input should be the physician's name."
    ),
    Tool(
        name="Search Hospital Records",
        func=answer_from_records,
        description="Use this function to answer questions about hospitals, payers, physicians (including salary), patients, visits and reviews from the indexed records. Input should be the full question."
    )
]

# `retriever` is not an agent parameter, so retrieval is exposed through the
# "Search Hospital Records" tool instead. handle_parsing_errors=True feeds
# malformed LLM output back to the agent for another attempt rather than
# aborting the run with an output-parsing exception.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    handle_parsing_errors=True
)

def handle_query(query):
    """Run ``query`` through the agent and return its answer.

    Any exception raised by the agent is printed and replaced by a generic
    apology string, so this function itself does not raise for agent errors.

    Args:
        query: Question to pass to ``agent.run``.

    Returns:
        The agent's response, or the apology message on failure.
    """
    try:
        return agent.run(query)
    except Exception as e:
        # Top-level boundary: report the problem and degrade gracefully.
        print(f"Error: {str(e)}")
        return "I'm sorry, I encountered an error while processing your request."

if __name__ == "__main__":
    # Demo run: each entry pairs the heading to print with the question asked.
    demo_runs = (
        ("\nQuery 1:", "What are the salary and review for Dr. Smith?"),
        ("\nQuery 2:", "What is the current wait time at Mercy Hospital?"),
    )
    for heading, query in demo_runs:
        answer = handle_query(query)
        print(heading)
        print(f"Question: {query}")
        print(f"Answer: {answer}")

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.93it/s]


Using CUDA device


  agent = initialize_agent(
  response = agent.run(query)




[1m> Entering new AgentExecutor chain...[0m
Error: An output parsing error occurred. In order to pass this error back to the agent and have it try again, pass `handle_parsing_errors=True` to the AgentExecutor. This is the error: Parsing LLM output produced both a final answer and a parse-able action:: Answer the following questions as best you can. You have access to the following tools:

Get Current Wait Time(hospital_name: str) -> str - Use this function to get the current wait time at a hospital. Input should be the hospital name.
Get Physician Rating(physician_name: str) -> str - Use this function to get the average rating of a physician based on patient reviews. Input should be the physician's name.

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [Get Current Wait Time, Get Physician Rating]
Action Input: the input to the action
Observation: the result of the