In [19]:
! pip install requests beautifulsoup4 langchain chromadb gpt4all

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
Downloading soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.12.3 soupsieve-2.6


In [1]:
import os

os.environ["LANGCHAIN_TRACING_V2"] = 'True'
os.environ["LANGCHAIN_ENDPOINT"] = 'https://api.smith.langchain.com'
os.environ["LANGCHAIN_API_KEY"] = "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"

In [2]:
local_llm = 'llama2'

In [7]:
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings

# Define the URLs you want to scrape
urls = [
    "https://www.medscape.com/radiology",
    "https://pubmed.ncbi.nlm.nih.gov/",
    "https://radiopaedia.org/",
    "https://www.myesr.org/",
    "https://www.bmj.com/specialties/radiology"
]

# Function to scrape the content of a given URL
def scrape_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Extract text content from the page
        text = soup.get_text()
        return Document(page_content=text, metadata={"url": url})
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content from {url}: {e}")
        return None

# Scrape documents from the URLs
docs = [scrape_content(url) for url in urls if scrape_content(url) is not None]

# Split documents
text_spliter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250,
    chunk_overlap=0
)

docs_list = text_spliter.split_documents(docs)

# Filter out complex metadata and ensure proper document formatting
filtered_docs = []
for doc in docs_list:
    if isinstance(doc, Document) and hasattr(doc, 'metadata'):
        clean_metadata = {k: v for k, v in doc.metadata.items() if isinstance(v, (str, int, float, bool))}
        filtered_docs.append(Document(page_content=doc.page_content, metadata=clean_metadata))

# Add to vector DB
vectorstore = Chroma.from_documents(
    documents=filtered_docs,
    collection_name="rag-chroma",
    embedding=GPT4AllEmbeddings()
)

retriever = vectorstore.as_retriever()


In [8]:
from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser

#LLM
llm = ChatOllama(model=local_llm, format="json", temperature=0)

prompt = PromptTemplate(
    template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a grader assessing relavance
    of a retrived document to a user question. If the document contains keywords realted to the user question,
    grade it as relevent. It dose not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question. \n
    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Here is the retireved document: \n\n {document} \n\n
    Here is the user question: {question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variable=["question","document"]
)

retrival_grader = prompt | llm | JsonOutputParser()
question = "what is radiology"
docs = retriever.invoke(question)
doc_txt = docs[1].page_content
print(retrival_grader.invoke({"question":question, "document":doc_txt}))

{'relevance': 'yes', 'reasoning': "The document contains keywords related to the user question, such as 'radiology', 'membership', and 'benefits'. These keywords suggest that the document is relevant to the user's query about radiology."}
