In [None]:
pip install langchain langchain-community langchain-text-splitters langchain-huggingface langchain-groq chromadb sentence-transformers unstructured


Collecting unstructured
  Downloading unstructured-0.18.14-py3-none-any.whl.metadata (24 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2025.2.18-py3-none-any.whl.metadata (14 kB)
Collecting langdetect (from unstructured)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rapidfuzz (from unstructured)
  Downloading rapidfuzz-3.14.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting unstructured-client (from unstructure

In [None]:
import hashlib
import os
import re

from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA


# ---- Configuration ----
# Take the Groq key from the GROQ_API_KEY environment variable so the secret is
# never typed into (and saved with) the notebook. The placeholder fallback is
# kept only for backward compatibility and is not a usable key.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your_api_key")
# Directory passed to Chroma as its persist_directory.
PERSIST_DIR = "./chroma_db"

# ---- Hardcode the URL you want to index ----
URL = "https://en.wikipedia.org/wiki/San_Francisco"

# 1. Load and split
loader = UnstructuredURLLoader(urls=[URL])
docs = loader.load()
# NOTE(review): with its default settings the loader is expected to log a failed
# fetch/parse and carry on (verify `continue_on_failure`), which would leave
# `docs` empty. Stop here rather than build an index with nothing in it.
if not docs:
    raise RuntimeError(f"No content could be loaded from {URL}")

# Split into chunks of at most 500 characters, overlapping by 50 so text that
# straddles a boundary is still present in one piece in a neighbouring chunk.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(docs)

# 2. Embeddings + VectorStore
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectordb = Chroma(persist_directory=PERSIST_DIR, embedding_function=embeddings)

# Key each chunk by a digest of its source and text. Without explicit ids every
# re-run of this cell appends the same chunks again under new random ids, so the
# on-disk store fills with duplicates that crowd the top-k results. Stable ids
# let a re-run overwrite the existing entries instead; identical chunks within
# one run collapse to a single entry, which also keeps the id list unique.
# Entries written by earlier runs under random ids are not removed by this.
chunks_by_id = {}
for chunk in chunks:
    fingerprint = f"{chunk.metadata.get('source', '')}\n{chunk.page_content}"
    chunk_id = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
    chunks_by_id.setdefault(chunk_id, chunk)
vectordb.add_documents(list(chunks_by_id.values()), ids=list(chunks_by_id))
# No explicit vectordb.persist(): the method is deprecated, and with Chroma
# 0.4+ a store created with persist_directory is written to disk automatically.

# 3. Groq chat model + retriever, wired together as a RetrievalQA chain
# Model id can be swapped for any other Groq-hosted chat model.
llm = ChatGroq(model="qwen/qwen3-32b", temperature=0, api_key=GROQ_API_KEY)

# Fetch the 3 best-matching chunks from the vector store for each question.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# "stuff" places all retrieved chunks into a single prompt; the retrieved
# documents are returned alongside the answer so they can be listed later.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

# 4. Ask questions
query = "history of San Francisco"
# Call the chain through .invoke(); calling the chain object directly
# (qa({...})) is the deprecated style.
result = qa.invoke({"query": query})

# The model's reply can begin with a <think>...</think> reasoning block; strip
# it so only the actual answer is shown. Text without such a block is unchanged
# apart from surrounding whitespace.
answer = re.sub(r"<think>.*?</think>", "", result["result"], flags=re.DOTALL).strip()

print("\n=== ANSWER ===\n")
print(answer)

# One line per retrieved chunk: the "source" recorded in its metadata.
print("\n=== SOURCES ===\n")
for doc in result["source_documents"]:
    print(doc.metadata.get("source", "unknown"))



=== ANSWER ===

<think>
Okay, the user asked about the history of San Francisco. Let me start by recalling what I know. San Francisco is a city in California, USA. It was founded in the 18th century, right? I think the Spanish established a mission there. Maybe Mission San Francisco de Asís? That was in 1776. Then it became part of Mexico after Mexico's independence from Spain. The Mexican-American War led to the US taking over, so maybe the city became part of the US in the mid-19th century.

The Gold Rush in 1849 brought a lot of people, which made the city grow quickly. The population exploded. Then there was the 1906 earthquake and fire, which destroyed much of the city. After that, they rebuilt. The city hosted the Panama-Pacific Exposition in 1915 to celebrate the opening of the Panama Canal. That's a big event.

In the 20th century, San Francisco became known for the counterculture movement in the 1960s, like the Summer of Love. The LGBTQ+ community also found a home there, esp