# Scraping of data
#### This notebook has two parts: the first scrapes a website and saves the scraped text to a file; the second answers a set of Q&A questions through a vector index built with LlamaIndex.

# Part I

In [1]:
#imports
from llama_index import download_loader
import os
import json
import openai
from langchain.embeddings import OpenAIEmbeddings
from llama_index.llms import AzureOpenAI
from llama_index.embeddings import LangchainEmbedding
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
import logging
import sys
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import set_global_service_context 




In [2]:
# Azure OpenAI client and embedding configuration.

# Load environment variables (python-dotenv) before reading the Azure settings.
load_dotenv()

client = AzureOpenAI(
    engine='gpt-4-1106-preview',
    api_key=os.environ.get('AZURE_OPENAI_API_KEY'),
    azure_endpoint=os.environ.get('AZURE_OPENAI_ENDPOINT'),
    # api_type='azure',
    api_version='2023-05-15',  # this may change in the future
    timeout=20 * 60,  # 20 minutes
)

# The Azure client doubles as the LLM handed to the service context.
llm = client

# Embeddings come from a HuggingFace model wrapped for llama_index, one text per batch.
hf_embeddings = HuggingFaceEmbeddings(model_name="intfloat/e5-base-v2")
embedding_llm = LangchainEmbedding(hf_embeddings, embed_batch_size=1)

service_context = ServiceContext.from_defaults(llm=llm, embed_model=embedding_llm)

# Register the context globally so later cells do not have to pass it explicitly.
set_global_service_context(service_context)

## Getting all URLs linked from the page using the `href` attribute of its `<a>` tags

In [8]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
 
 
# Collect every same-site link found on the homepage.
url = 'https://www.tenethealth.com/'

# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# us from silently parsing an HTTP error page as if it were the homepage.
reqs = requests.get(url, timeout=30)
reqs.raise_for_status()
soup = BeautifulSoup(reqs.text, 'html5lib') # lxml -> good for speed, html5lib -> good for parsing broken html

urls = set() #set to avoid duplicate urls
for link in soup.find_all('a'):
    href = link.get('href')
    # <a> tags without an href yield None; skip them instead of crashing.
    if not href:
        continue
    # Keep site-relative paths only. Protocol-relative links ("//host/...")
    # also start with "/" but point to other hosts, so exclude them.
    if href.startswith('/') and not href.startswith('//'):
        full_url = urljoin(url, href)
        urls.add(full_url)

print(f"total urls: {len(urls)}")
for saved_url in urls:
    print(saved_url)

total urls: 26
https://www.tenethealth.com/covid-19-information-and-resources
https://www.tenethealth.com/legal
https://www.tenethealth.com/our-stories
https://www.tenethealth.com/about
https://www.tenethealth.com/
https://www.tenethealth.com/careers/life-at-tenet
https://www.tenethealth.com/locations
https://www.tenethealth.com/home/who-we-are
https://www.tenethealth.com/careers
https://www.tenethealth.com/about/our-history
https://www.tenethealth.com/about/environmental-sustainability
https://www.tenethealth.com/contact
https://www.tenethealth.com/home/giving-back
https://www.tenethealth.com/about/what-we-do
https://www.tenethealth.com/about/ethics-compliance
https://www.tenethealth.com/sitemap
https://www.tenethealth.com/about/our-leadership
https://www.tenethealth.com/notice-of-privacy-practices
https://www.tenethealth.com/home/our-locations
https://www.tenethealth.com/privacy
https://www.tenethealth.com/accessibility
https://www.tenethealth.com/careers/programs-benefits
https://ww

## Scraping and downloading the URLs collected above

In [16]:
# Web crawler: scrape the text of each collected URL

from llama_index.readers.web import BeautifulSoupWebReader # change scraper to trafilatura here
from urllib.parse import urlparse
import os


# Scrape every collected URL and append its text to a single file.
reader = BeautifulSoupWebReader() # change scraper to trafilatura here
filename = "TenetHealth_BeautifulSoup.txt"
directory = "manual_data"
filepath = os.path.join(directory, filename)

# Create the output directory on a fresh checkout; open() would fail otherwise.
os.makedirs(directory, exist_ok=True)

# Explicit UTF-8 so scraped pages containing non-ASCII characters can always
# be written, whatever the platform's default encoding is.
with open(filepath, "w", encoding="utf-8") as f:
    # `page_url` (not `url`) so the base `url` defined in the link-collection
    # cell is not overwritten by this loop.
    for counter, page_url in enumerate(urls, start=1):
        print(counter, "/", len(urls), "Reading from", page_url)
        documents = reader.load_data(urls=[page_url])
        if not documents:
            print("Failed to read", page_url)
            continue
        document = documents[0]

        # Each page is stored as a "Source:" header followed by its text.
        f.write("Source: " + page_url + '\n\n')
        f.write(document.text + '\n\n')

0 / 26 Reading from https://www.tenethealth.com/covid-19-information-and-resources
1 / 26 Reading from https://www.tenethealth.com/legal
2 / 26 Reading from https://www.tenethealth.com/our-stories
3 / 26 Reading from https://www.tenethealth.com/about
4 / 26 Reading from https://www.tenethealth.com/
5 / 26 Reading from https://www.tenethealth.com/careers/life-at-tenet
6 / 26 Reading from https://www.tenethealth.com/locations
7 / 26 Reading from https://www.tenethealth.com/home/who-we-are
8 / 26 Reading from https://www.tenethealth.com/careers
9 / 26 Reading from https://www.tenethealth.com/about/our-history
10 / 26 Reading from https://www.tenethealth.com/about/environmental-sustainability
11 / 26 Reading from https://www.tenethealth.com/contact
12 / 26 Reading from https://www.tenethealth.com/home/giving-back
13 / 26 Reading from https://www.tenethealth.com/about/what-we-do
14 / 26 Reading from https://www.tenethealth.com/about/ethics-compliance
15 / 26 Reading from https://www.tenethe

## After scraping the above URLs, we can validate the text files for completeness using the notebook "ScrapingComp"

# Part II

## Building a vector index

In [41]:
# LlamaHub connector: load the scraped text file as documents and build a
# vector index over them.
# NOTE(review): the scraping cell writes "manual_data/TenetHealth_BeautifulSoup.txt",
# but this cell reads "manual_scraping/TenetHealth.txt" — confirm this is the intended file.
# Note: `reader` is rebound here; earlier it held the web reader used for scraping.
reader = SimpleDirectoryReader(input_files=["manual_scraping/TenetHealth.txt"])
documents = reader.load_data()
index = VectorStoreIndex.from_documents(documents)

In [42]:
def collect_answers(question):
    """Answer ``question`` by querying the notebook-level vector ``index``.

    Args:
        question: The query text to send to the query engine.

    Returns:
        Whatever the query engine's ``query`` call returns for ``question``.
    """
    # The engine is created on every call rather than cached, so the query
    # always runs against the newest ``index``.
    return index.as_query_engine().query(question)

## Importing the QnA data set (10_qa.json)
This contains questions and ground truths, given either as context from the target website or as hand-typed answers.

In [12]:
import json

# Load the question / ground-truth pairs used to exercise the RAG pipeline.
dataset = '10_qa.json'

# JSON is UTF-8 by specification; decode explicitly so non-ASCII characters
# in the answers load the same way on every platform.
with open(dataset, 'r', encoding='utf-8') as file:
    question_answer = json.load(file)

# Quick sanity check: number of pairs plus the first question and answer.
print("Number of question and answer pairs: ", len(question_answer['QA']))
print("\n")
print("Example question: ", question_answer['QA'][0]['question'])
print("\n")
print("Example Answer: ", question_answer['QA'][0]['answer'])

Number of question and answer pairs:  10


Example question:  How to make an appointment?


Example Answer:   To book an appointment, you would typically need to contact the healthcare facility directly. Look for contact information on their official website or any official correspondence you have received from them. If you have a specific department or service in mind, their direct contact details may also be available on the website. If you are a new patient, you might need to provide some personal and medical information during the booking process. If you are an existing patient, you may be able to book an appointment through a patient portal if the facility offers one.


## Generating LLM responses using the index we built 

In [43]:

# Parallel accumulators: one entry per QA pair, in dataset order.
question_list, answer_list, gtruth_list = [], [], []

for i, qa_pair in enumerate(question_answer['QA']):
    print("_______________________________________")
    print("iteration: ", i)

    question = qa_pair['question']
    question_list.append(question)
    print(f"Query send: {question}\n")

    ground_truth = qa_pair['answer']
    gtruth_list.append(ground_truth)

    # Calling the RAG application
    gen_response = collect_answers(question)
    print(f"Generated Response: {gen_response}\n")
    # Keep the string form of the response alongside the question.
    answer_list.append(str(gen_response))

    print(f"\nGround truth: {ground_truth}\n")
    print("_______________________________________")


_______________________________________
iteration:  0
Query send: How to make an appointment?

Generated Response: To make an appointment, you can use the online forms provided on the website. These forms may include appointment requests among other options. By filling out the necessary information on the appointment request form, such as your name, address, phone number, email address, and any other required details, you can submit your request to schedule an appointment. If the appointment is for inpatient or outpatient services, you may also have the option to pre-register through an affiliate’s or third party’s website.


Ground truth:  To book an appointment, you would typically need to contact the healthcare facility directly. Look for contact information on their official website or any official correspondence you have received from them. If you have a specific department or service in mind, their direct contact details may also be available on the website. If you are a new pati

## Saving the result in JSON format so we can evaluate it using the "RAG_Eval" notebook

In [44]:
# Bundle each question with its generated answer and its ground truth.
qa_list = [
    {"question": asked, "answer": generated, "ground_truth": expected}
    for asked, generated, expected in zip(question_list, answer_list, gtruth_list)
]

# Top-level layout: a single "QA" key holding the list of records.
data = {"QA": qa_list}

#Change the name of the file to the name of the dataset
with open('Manual_Response.json', 'w') as f:
    json.dump(data, f, indent=4)
