## Import Libraries


In [1]:
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
import warnings
# Silence every Python warning for the rest of the session so cell output stays
# readable. This is a blanket filter: it also hides warnings that may matter,
# so narrow or remove it when debugging.
warnings.filterwarnings('ignore')
from IPython.display import display, Markdown
import os
# Ask protobuf to use its pure-Python implementation.
# NOTE(review): presumably a workaround for a protobuf version mismatch in one
# of the dependencies — confirm it is still needed. The variable is expected to
# take effect only if it is set before protobuf is first imported.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

## Load website content

In [2]:
# Landing pages to index; edit this list to point the pipeline at other sites.
urls = [
    "https://www.uchicago.edu/",
    "https://www.washington.edu/",
    "https://www.stanford.edu/",
    "https://und.edu/"
]

# Always bind `data` so the splitting cell receives a (possibly empty) list
# instead of failing with a NameError when no URLs are configured.
data = []
if urls:
    loader = UnstructuredURLLoader(urls=urls)
    data = loader.load()
    print(f"Website content loaded successfully from {len(urls)} URLs.")

    # By default the loader skips URLs it cannot fetch or parse rather than
    # raising, so compare the sources that actually came back against the
    # requested list and surface anything that is missing from the index.
    loaded_sources = {doc.metadata.get("source") for doc in data}
    missing_urls = [url for url in urls if url not in loaded_sources]
    if missing_urls:
        print(f"Warning: no content was loaded from: {', '.join(missing_urls)}")
else:
    print("No URLs provided.")

Website content loaded successfully from 4 URLs.


## Split text into chunks

In [3]:
# Cut the loaded pages into 1000-character chunks; consecutive chunks share
# 200 characters so text near a boundary appears in both neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(data)
print(f"Text split into {len(chunks)} chunks")

Text split into 13 chunks


## Create vector database

In [4]:
# Embed each chunk with the Ollama "nomic-embed-text" model and store the
# vectors in a Chroma collection named "website-rag".
embedding_model = OllamaEmbeddings(model="nomic-embed-text")
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="website-rag",
)
print("Vector database created successfully")

Vector database created successfully


## Set up LLM and Retrieval

In [5]:
# Name of the Ollama chat model. The same `llm` instance is used both to
# rewrite the user's question for retrieval and to generate the final answer.
# NOTE(review): assumes this model has already been pulled into the local
# Ollama instance — confirm before running.
local_model = "llama3.2"
llm = ChatOllama(model=local_model)

In [6]:
# Instruction that asks the LLM to rephrase the user's question into two
# alternative wordings, one per line, to broaden the similarity search.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate 2
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

# Wrap the vector store's plain retriever so that retrieval is driven by the
# LLM-generated rewordings produced from QUERY_PROMPT.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)

## Create chain

In [7]:
# Answering prompt: the model is told to rely solely on the retrieved context.
# The placeholders {context} and {question} are filled in by the chain.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template=template)

In [8]:
def _format_docs(docs):
    """Render retrieved documents as plain text for the prompt's {context} slot.

    Without this step the prompt receives the Python repr of a list of
    Document objects (escaped newlines, metadata dicts, constructor syntax),
    which is noisy input for the model.

    Args:
        docs: Iterable of retrieved documents exposing ``page_content`` and a
            ``metadata`` dict.

    Returns:
        One string in which each document is a ``Source: <url>`` line followed
        by its text, with documents separated by a blank line. The source is
        kept so the model can still attribute content to a site.
    """
    return "\n\n".join(
        f"Source: {doc.metadata.get('source', 'unknown')}\n{doc.page_content}"
        for doc in docs
    )


# RAG pipeline: the incoming question is sent to the retriever (whose results
# are flattened to text) and passed through unchanged; both fill the prompt,
# which the LLM answers, and the reply is reduced to a plain string.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

## Chat with the websites

In [9]:
def chat_with_website(question):
    """Answer a question from the indexed website content and render it.

    Args:
        question: Natural-language question passed to the RAG chain.

    Returns:
        Whatever ``IPython.display.display`` returns for the rendered
        Markdown answer; the answer itself is shown as cell output.
    """
    answer = chain.invoke(question)
    rendered = Markdown(answer)
    return display(rendered)

In [10]:
# Example query spanning all indexed sites; the answer is rendered as Markdown.
chat_with_website("Summarize the key points from the websites.")

Based on the provided context, here is a summary of the key points from the websites:

1. Stanford University:
* Founded almost 150 years ago with a societal mission to contribute to the world through education, research, and innovation.
* Seven schools where students can pursue their passions: Medicine, Law, Humanities & Sciences, Engineering, Education, Sustainability, and Business.
* Emphasizes leadership, contribution, and integrity in its academic programs.
* Has made significant advancements in health care, including precision medicine and innovative treatments.

2. University of North Dakota:
* Consistently ranks among the best for educational quality, affordability, and career outcomes (Best Value College).
* Offers a wide range of undergraduate and graduate programs, including data science, aerospace engineering, and music education.
* Emphasizes diversity, inclusion, and accessibility, with a strong commitment to supporting students with disabilities and veterans.

3. Stanford Medicine:
* Dedicated to advancing human health through biomedical research, education, and clinical enterprises.
* Has made significant advancements in precision health, leveraging expertise and advanced technology to deliver unparalleled care for patients.

4. University of North Dakota's student stories:
* Features profiles of students who have overcome personal challenges or pursued their passions, highlighting the university's commitment to supporting students' success (e.g., Chase Garber, a social work student who returned to school after facing personal challenges).

Overall, both universities emphasize the importance of education, research, and innovation in driving positive change in the world.

## Clean up (optional)

In [16]:
# Delete the Chroma collection backing `vector_db`.
# NOTE(review): after this runs, the retriever and chain built on `vector_db`
# presumably can no longer be queried; re-run the notebook from the
# "Create vector database" cell before asking further questions.
vector_db.delete_collection()
print("Vector database deleted successfully")

Vector database deleted successfully
