In [None]:
pip install langchain langchain-community langchain-text-splitters langchain-huggingface langchain-groq chromadb sentence-transformers unstructured bs4


Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [None]:
import requests
from bs4 import BeautifulSoup

import os
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from sentence_transformers import CrossEncoder

from langchain.chains import RetrievalQA


# ----------- STEP 1: SCRAPE & CLEAN WITH BEAUTIFULSOUP -----------
url = "https://en.wikipedia.org/wiki/San_Francisco"
headers = {"User-Agent": "Mozilla/5.0 (compatible; MyRAGBot/1.0)"}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the pipeline from indexing an HTTP error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
html = response.text

soup = BeautifulSoup(html, "html.parser")

# Extract only paragraph text from the main content
paragraphs = [p.get_text() for p in soup.select("div.mw-parser-output p")]
raw_text = "\n".join(paragraphs)

# Fail early with a clear message instead of building an index from nothing
# (e.g. if the page layout changed and the selector no longer matches).
if not raw_text.strip():
    raise ValueError(f"No paragraph text could be extracted from {url}")


# ----------- STEP 2: SPLIT INTO CHUNKS -----------
# 500-character chunks with a 50-character overlap between neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = splitter.create_documents([raw_text])


# ----------- STEP 3: EMBEDDINGS + VECTOR DB (CHROMA) -----------
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# NOTE(review): the store is persisted to ./chroma_db; re-running this cell
# presumably adds the same chunks to the existing collection again, which
# would surface as duplicate retrieval hits -- confirm and clear the
# directory between runs if so.
db = Chroma.from_documents(docs, embedding=embeddings, persist_directory="./chroma_db")

# Retrieve the 5 nearest chunks per query; they are reranked afterwards.
retriever = db.as_retriever(search_kwargs={"k": 5})


# ----------- STEP 4: CROSS-ENCODER RERANKER -----------
# Cross-encoder model used to score (query, chunk) pairs during reranking.
RERANKER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
reranker = CrossEncoder(RERANKER_MODEL_NAME)

def rerank(query, docs):
    """Order retrieved documents by cross-encoder relevance to *query*.

    Args:
        query: The user question as a string.
        docs: An iterable of documents exposing a ``page_content`` attribute.

    Returns:
        A list of ``(document, score)`` tuples sorted by score, highest
        first. An empty list is returned when ``docs`` is empty.
    """
    # Materialize once: docs is walked twice below (to build the pairs and to
    # zip with the scores), which would silently break for one-shot iterators.
    docs = list(docs)
    if not docs:
        # Nothing to score; skip the model call, which is not guaranteed to
        # accept an empty batch.
        return []

    pairs = [(query, d.page_content) for d in docs]
    scores = reranker.predict(pairs)
    return sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)


# ----------- STEP 5: RAG PIPELINE -----------
# Prefer the GROQ_API_KEY environment variable so a real key never has to be
# pasted into (and committed with) the notebook; the original placeholder is
# kept only as a fallback for backward compatibility.
llm = ChatGroq(
    model="qwen/qwen3-32b",
    api_key=os.environ.get("GROQ_API_KEY", "your_api_key"),
)

def rag_query(query, top_n=3):
    """Answer *query* via retrieve -> rerank -> generate.

    Prints a preview of every retrieved chunk and of the best reranked
    chunks as a debugging aid, then asks the LLM to answer using only the
    best chunks as context.

    Args:
        query: The user question.
        top_n: How many of the highest-scoring reranked chunks to preview
            and to pass to the LLM as context. Defaults to 3.

    Returns:
        The content of the LLM's reply to the context-grounded prompt.
    """
    # Step 1: Retrieve docs
    # invoke() is the replacement for the deprecated get_relevant_documents().
    retrieved_docs = retriever.invoke(query)
    print("\n--- Retrieved Docs Before Rerank ---")
    for d in retrieved_docs:
        print("CHUNK PREVIEW:", d.page_content[:150], "...\n")

    # Step 2: Rerank (skipped when nothing was retrieved, so the scorer is
    # never called with an empty batch)
    reranked = rerank(query, retrieved_docs) if retrieved_docs else []
    top_docs = reranked[:top_n]
    print("\n--- Top Docs After Rerank ---")
    for d, score in top_docs:
        print(f"Score: {score:.4f}\nCHUNK PREVIEW: {d.page_content[:150]} ...\n")

    # Step 3: Build context for answer
    top_context = "\n".join([d.page_content for d, _ in top_docs])
    prompt = (
        f"Answer the following question using only the context:\n\nContext:\n{top_context}\n\nQuestion: {query}\n\nAnswer:"
    )

    # invoke() is the replacement for the deprecated predict(); it returns a
    # message object whose .content carries the generated text.
    final_answer = llm.invoke(prompt).content

    return final_answer


# ----------- TEST QUERY -----------
# Run the pipeline first: rag_query() prints its own retrieval/rerank traces,
# so emitting the header afterwards keeps it directly above the answer
# instead of above the debug output.
answer = rag_query("What is the climate of San Francisco?")
print("\n=== FINAL ANSWER ===")
print(answer)



=== FINAL ANSWER ===

--- Retrieved Docs Before Rerank ---
CHUNK PREVIEW: San Francisco has a warm-summer Mediterranean climate (Köppen: Csb, Trewartha: Csll), characteristic of California's coast, with moist winters and dry ...

CHUNK PREVIEW: As a coastal city, San Francisco will be heavily affected by climate change. As of 2021[update], sea levels are projected to rise by as much as 5 feet ...

CHUNK PREVIEW: Among major U.S. cities, San Francisco has the coolest daily mean, maximum, and minimum temperatures for June, July, and August.[135] During the summe ...

CHUNK PREVIEW: During an average year between 1991 and 2020, San Francisco recorded a warmest night at 64 °F (18 °C) and a coldest day at 49 °F (9 °C).[137] The cold ...

CHUNK PREVIEW: On average, there are 73 rainy days a year, and annual precipitation averages 23.65 inches (601 mm).[137] Variation in precipitation from year to year ...


--- Top Docs After Rerank ---
Score: 9.1307
CHUNK PREVIEW: San Francisco has a warm-

  final_answer = llm.predict(


<think>
Okay, let's tackle this question. The user wants to know the climate of San Francisco based on the provided context. First, I need to look through the context given and extract the relevant information.

The context starts by mentioning that San Francisco has a warm-summer Mediterranean climate, classified as Köppen Csb and Trewartha Csll. That's a key point. Then it says it's characteristic of California's coast with moist winters and dry summers. So I should note that it's a Mediterranean climate with those specific seasons.

Next, the context explains that the Pacific Ocean and San Francisco Bay influence the weather, moderating temperature swings and leading to a mild year-round climate with little seasonal variation. That's important for describing the overall mildness.

Then there are some temperature statistics: the warmest night is 64°F and the coldest day is 49°F during 1991-2020. Also, the coldest daytime high since 1945 was 37°F in 1972. These numbers give specific t