Web Lens - Mini Project

weblens/
├── scraper.py
├── chunker.py
├── extractor.py
├── llm_client.py
├── formatter.py
└── main.py


In [31]:
import requests
from bs4 import BeautifulSoup 
from IPython.display import Markdown, display
from openai import OpenAI
import json

In [32]:
#Scraper
def scrape_page(url:str) -> str:
    response = requests.get(url,timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    for tag in soup(['script','style','nav','footer','header']):
        tag.decompose()
    
    text = soup.get_text(separator=" ")
    text_output= " ".join(text.split())
    return ((text_output))

In [33]:
scrape_page("https://www.myscheme.gov.in/")

'myScheme Something went wrong. Please try again later. Ok Are you sure you want to sign out? Cancel Sign Out #GOVERNMENTSCHEMES / #SCHEMESFORYOU Find Schemes For You Categories States/UTs Central Ministries Find schemes based on categories How it works Easy steps to apply for Government Schemes Enter Details Start by entering your basic details! Search Our search engine will find the relevant schemes ! Select & Apply Select and apply for the best suited scheme About myScheme is a National Platform that aims to offer one-stop search and discovery of the Government schemes. It provides an innovative, technology-based solution to discover scheme information based upon the eligibility of the citizen. The platform helps the citizen to find the right Government schemes for them. It also guides on how to apply for different Government schemes. Thus no need to visit multiple Government websites. View More Frequently Asked Questions Checkout our knowledge base for some of your answers! What is

In [34]:
#validation
def validate_raw_text(text: str):
    if not isinstance(text, str):
        raise TypeError("raw_text must be a string")

    if len(text.split()) < 200:
        return False, "Page content too thin to analyze"

    return True, None


In [35]:
#detect prompt type
def detect_page_type(text: str) -> str:
    t = text.lower()

    if any(x in t for x in ["eligibility", "apply", "benefits"]):
        return "policy"

    if any(x in t for x in ["requirements", "responsibilities"]):
        return "job"

    if any(x in t for x in ["installation", "usage", "api"]):
        return "documentation"

    return "general"


In [48]:
#Chunker
def chunk_text(text:str, max_words:int=5000)->list[str]:
    text_st = str(text)
    words = text_st.split()
    chunks=[]

    for i in range(0,len(words),max_words):
        chunks.append(" ".join(words[i:i + max_words]))
    return chunks

In [37]:
ollama = OpenAI(base_url="http://localhost:11434/v1/",api_key='Ollama')
system_prompt = """You are an strict information normaliser. 
Rules:
-Don't add opinions or invent facts.
Only use information in the source text.
Output must be valid json
Do not use markdown
Do not wrap output in '''json
"""

#llm_client
def call_llm(system_prompt: str, user_prompt: str)-> str:
    response = ollama.chat.completions.create(
        model="dolphin3:8b",
        messages =[
            {
                "role":"system", "content":system_prompt
            },
            {
                "role":"user","content":user_prompt
            }
        ],
        temperature=0.2,
    )

    return response.choices[0].message.content

In [38]:


#Extractor
def extract_structure(text_chunk:str,page_type:str)->dict:

    user_prompt = f"""Normalize the following webpage text into this JSON schema:

{{
  "page_type": "{page_type}",
  "summary": "",
  "key_points": [],
  "constraints": [],
  "not_covered": []
}}

Instructions:
- Use ONLY the provided text.
- Do NOT add external knowledge.
- Leave fields empty or "Not stated in the source" if unsupported.

Text:
\"\"\"{text_chunk}\"\"\"
"""
    response = call_llm (system_prompt, user_prompt)

    try:
        return json.loads(response)
    except json.JSONDecodeError:
        return {
            "error":"Invalid JSON returned by LLM", "raw_output": response
        }
   



In [39]:
def merge_outputs (extracted_chunks: list[str])->str:
    return "\n\n".join(extracted_chunks)

In [40]:
#formatter
def run(url: str):
    raw_text = scrape_page(url)

    is_valid, error= validate_raw_text(raw_text)
    if not is_valid:
        return {"error":error}
    
    page_type =detect_page_type(raw_text)

    chunks = chunk_text(raw_text)
   
    extracted = []
    for chunk in chunks:
        extracted.append(extract_structure(chunk,page_type))
    
    return{
        "page_type":page_type,
        "chunk_processed":len(chunks),
        "results":extracted

    }
    #return Markdown(merge_outputs(extracted))
    

In [47]:
run("https://up.gov.in/en")

{'page_type': 'general',
 'chunk_processed': 2,
 'results': [{'error': 'Invalid JSON returned by LLM',
   'raw_output': '```json\n{\n  "page_type": "general",\n  "summary": "",\n  "key_points": [\n    "Uttar Pradesh is a state in India with an area of 2,43,286 square km.",\n    "The population of Uttar Pradesh is 24 crores (year 2011).",\n    "There are 75 districts in the state.",\n    "E-sandesh is a news platform that provides updates on various events and projects in Uttar Pradesh."\n  ],\n  "constraints": [],\n  "not_covered": []\n}\n```'},
  {'error': 'Invalid JSON returned by LLM',
   'raw_output': '```json\n{\n  "page_type": "general",\n  "summary": "",\n  "key_points": [],\n  "constraints": [],\n  "not_covered": []\n}\n```'}]}

In [49]:
run("https://www.msn.com/en-in/health/other/russian-cardiologist-shares-5-normal-things-people-do-daily-that-secretly-damage-heart-and-body-too-much-sitting/ar-AA1PxsVS?ocid=msedgntp&pc=U531&cvid=6967c40c9aef475ab08d47268049e1ca&ei=10")

{'error': 'Page content too thin to analyze'}

In [50]:
run("https://upgovernor.gov.in/")

{'page_type': 'general',
 'chunk_processed': 1,
 'results': [{'error': 'Invalid JSON returned by LLM',
   'raw_output': '```json\n{\n  "page_type": "general",\n  "summary": "",\n  "key_points": [\n    "The Governor of UP, Smt. Anandiben Patel, met the President of India, Smt. Draupadi Murmu.",\n    "Congratulations to Hon\'ble Governor Smt. Anandiben Patel ji on completing six years of service to Uttar Pradesh.",\n    "Raj Bhawan premises are open for visitors from 4:00 PM to 6:00 PM.",\n    "Presentation given regarding the proposed musical fountain in Gangotri to Gangasagar replica and renovation of fountains at Raj Bhavan.",\n    "Gilli-Danda competition organized on the 18th day of traditional sports competition.",\n    "Governor visited the Centre for Advanced Studies at AKTU and inaugurated B.Tech program.",\n    "Tribute paid by Governor Smt. Anandiben Patel to Swami Vivekananda\'s birth anniversary.",\n    "Presentation on Digital Examination Ecosystem given before the Governor

In [51]:
run("https://priceindia.in/step-by-step-ac-installation/")

{'page_type': 'policy',
 'chunk_processed': 1,
 'results': [{'error': 'Invalid JSON returned by LLM',
   'raw_output': '```json\n{\n  "page_type": "policy",\n  "summary": "",\n  "key_points": [\n    {\n      "title": "Checklist: What to Check Before the Technician Leaves",\n      "details": [\n        {\n          "sub_title": "Indoor Unit Placement (For Split ACs)",\n          "points": [\n            "Height Matters: The indoor unit should be at least 7 feet high for better cooling distribution.",\n            "No Direct Sunlight: Avoid placing the unit near windows where direct sunlight can overheat it.",\n            "Correct Tilt: The AC should be slightly tilted backward so that condensation drains properly.",\n            "No Obstructions: Keep the AC away from doors, curtains, and furniture that can block airflow."\n          ]\n        },\n        {\n          "sub_title": "Outdoor Unit Placement (For Split ACs)",\n          "points": [\n            "Good Ventilation: The outd

In [52]:
run("https://ollama.com/")

{'page_type': 'general',
 'chunk_processed': 1,
 'results': [{'error': 'Invalid JSON returned by LLM',
   'raw_output': '```json\n{\n  "page_type": "general",\n  "summary": "",\n  "key_points": [\n    {\n      "name": "Nemotron-3-Nano",\n      "details": {\n        "description": "A new Standard for Efficient, Open, and Intelligent Agentic Models cloud",\n        "size": "30b",\n        "pulls": "101.7K",\n        "tags": "6",\n        "last_updated": "4 weeks ago"\n      }\n    },\n    {\n      "name": "FunctionGemma",\n      "details": {\n        "description": "Fine-tuned explicitly for function calling",\n        "size": "270M",\n        "pulls": "33.4K",\n        "tags": "4",\n        "last_updated": "3 weeks ago"\n      }\n    },\n    {\n      "name": "Olmo-3",\n      "details": {\n        "description": "Open language models designed to enable the science of language models",\n        "size": "7b 32b",\n        "pulls": "67.5K",\n        "tags": "15",\n        "last_updated": "4

In [45]:
if __name__ == "__main__":
    url = input("enter webpage url:")
    result = run(url)

    print ("\n-- Structured Output ---\n")
    print(result)


-- Structured Output ---

{'page_type': 'general', 'chunk_processed': 1, 'results': [{'error': 'Invalid JSON returned by LLM', 'raw_output': '```json\n{\n  "page_type": "general",\n  "summary": "",\n  "key_points": [\n    {\n      "title": "Number of Jobs",\n      "description": "75,000+ jobs in Bengaluru East"\n    },\n    {\n      "title": "Job Types",\n      "description": "Full-time (72,856), Part-time (275), Contract (1,337), Temporary (76), Volunteer (33)"\n    },\n    {\n      "title": "Experience Levels",\n      "description": "Internship (924), Entry level (12,555), Associate (4,268), Mid-Senior level (47,357), Director (1,825)"\n    },\n    {\n      "title": "Locations",\n      "description": "Bengaluru (62,818), Bengaluru East (7,175), Bengaluru South (373), Bengaluru North (142), Hosur (89)"\n    }\n  ],\n  "constraints": [],\n  "not_covered": []\n}\n```'}]}
