<a href="https://colab.research.google.com/github/Subham15-11/justfor/blob/main/DSPy_Practical_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [96]:
!pip install dspy-ai



In [97]:
import dspy
# print(dspy.__version__)

In [98]:
import importlib.metadata
# Print the installed version of the "dspy-ai" distribution so the run records
# which release produced the outputs below.
print(importlib.metadata.version("dspy-ai"))

3.0.4


In [99]:
import json
import copy
from typing import List, Optional
from typing import Literal, Dict, Union

In [100]:
import os
from getpass import getpass

# Never commit credentials to a notebook. The key is read from the
# LONGCAT_API_KEY environment variable, falling back to an interactive prompt
# (the typed value is not echoed and is not stored in the notebook).
# NOTE: the key that used to be hardcoded here has been exposed and should be revoked.
API_KEY = os.environ.get("LONGCAT_API_KEY") or getpass("LongCat API key: ")

# LM client pointed at the LongCat OpenAI-compatible endpoint.
main_lm = dspy.LM(
    "openai/LongCat-Flash-Chat",
    api_key=API_KEY,
    api_base="https://api.longcat.chat/openai/v1",
)

# Make this LM (with the XML adapter) the default for every DSPy module below.
dspy.settings.configure(lm=main_lm, adapter=dspy.XMLAdapter())

In [101]:
# Smoke test: send one trivial prompt through the configured LM and show the
# answer, confirming the endpoint and credentials work before the pipeline runs.
test = dspy.Predict("question -> answer")
_smoke_reply = test(question="Reply with OK only")
print(_smoke_reply.answer)

OK


In [102]:
from typing import List, Dict, Tuple
from pydantic import BaseModel, Field
import requests

In [103]:
# ENTITY + ATTRIBUTE EXTRACTION
# This step uses Pydantic models as field types inside a DSPy Signature.
# If the LLM returns malformed output or the wrong format, DSPy's output parsing and Pydantic validation catch it.
# Schema for one extracted entity: the entity text plus a free-form semantic
# type label. Used as the element type of the entity lists in ExtractEntities
# and DeduplicateEntities.
# NOTE(review): documented with comments rather than a class docstring because a
# Pydantic docstring presumably ends up in the schema shown to the LM - confirm.
class EntityWithAttr(BaseModel):
    entity: str = Field(description="the named entity")
    attr_type: str = Field(description="semantic type of the entity (e.g. Drug, Disease, Symptom, etc.)")

# DSPy signature for the extraction step: one paragraph in, a list of
# EntityWithAttr out.
# NOTE(review): the class docstring and the `desc` strings are runtime text
# (presumably sent to the LM as instructions) - treat edits to them as behavior changes.
class ExtractEntities(dspy.Signature):
    """From the paragraph extract all relevant entities and their semantic attribute types."""
    paragraph: str = dspy.InputField(desc="input paragraph")
    entities: List[EntityWithAttr] = dspy.OutputField(desc="list of entities and their attribute types")

In [104]:
# DEDUPLICATOR
# LLMs are probabilistic. They can make mistakes.
# DSPy signature for the deduplication step: a batch of entities in, a
# deduplicated list plus a self-reported confidence out.
# NOTE(review): the docstring and `desc` strings are runtime text (presumably
# sent to the LM as instructions) - treat edits to them as behavior changes.
class DeduplicateEntities(dspy.Signature):
    """Given a list of (entity, attr_type) decide which ones are duplicates.
    Return a deduplicated list and a confidence that the remaining items are ALL distinct."""
    items: List[EntityWithAttr] = dspy.InputField(desc="batch of entities to deduplicate")
    deduplicated: List[EntityWithAttr] = dspy.OutputField(desc="deduplicated list")
    # Reported by the model itself; deduplicate_with_lm compares it against its
    # target_confidence threshold to decide whether to accept an attempt.
    confidence: float = dspy.OutputField(
        desc="confidence (0-1) that every item in deduplicated is semantically distinct"
    )

# Module-level ChainOfThought predictor for the signature above; called by
# deduplicate_with_lm once per attempt on each batch.
dedup_predictor = dspy.ChainOfThought(DeduplicateEntities)

def deduplicate_with_lm(
    items: List[EntityWithAttr],
    *,
    batch_size: int = 10,
    target_confidence: float = 0.9,
    max_attempts: int = 3,
) -> List[EntityWithAttr]:
    """
    Deduplicate entities with the LM, batch by batch, retrying on low confidence.

    The input is split into consecutive batches of ``batch_size``. Each batch is
    sent to ``dedup_predictor`` up to ``max_attempts`` times; the first answer
    whose self-reported confidence reaches ``target_confidence`` is accepted.
    If no attempt is confident enough, the most confident attempt is used, and
    if every attempt fails the batch is kept unchanged. Because batches are
    judged independently, a final pass drops items whose entity text (ignoring
    case and surrounding whitespace) already appeared in an earlier batch.

    Args:
        items: Entities to deduplicate.
        batch_size: Number of entities sent to the LM per call (must be >= 1).
        target_confidence: Minimum self-reported confidence (0-1) to accept an answer.
        max_attempts: Maximum LM calls per batch; at least one call is always made.

    Returns:
        The deduplicated entities in their original relative order. Empty input
        yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if not items:
        return []
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    def _process_batch(batch: List[EntityWithAttr]) -> List[EntityWithAttr]:
        """Deduplicate one batch, returning the best answer seen (or the batch itself)."""
        best = None
        best_conf = float("-inf")
        # NOTE(review): DSPy caches LM calls by default, so an identical retry may
        # just replay the cached answer - confirm, and if so vary the request
        # (e.g. a distinct rollout_id with non-zero temperature) per attempt.
        for _ in range(max(1, max_attempts)):
            try:
                pred = dedup_predictor(items=batch)
            except Exception as exc:
                # A failed or unparseable LM response must not abort the whole pipeline.
                print(f"Deduplication attempt failed: {exc}")
                continue

            candidate = pred.deduplicated
            if candidate is None:
                continue
            try:
                conf = float(pred.confidence)
            except (TypeError, ValueError):
                # Missing or non-numeric confidence counts as "not confident".
                conf = 0.0

            if conf >= target_confidence:
                return list(candidate)
            if conf > best_conf:
                best, best_conf = list(candidate), conf

        return best if best is not None else list(batch)

    # Split into batches and process each independently.
    merged: List[EntityWithAttr] = []
    for start in range(0, len(items), batch_size):
        merged.extend(_process_batch(items[start:start + batch_size]))

    # Cross-batch pass: the LM never sees two batches together, so remove
    # repeated entity names that straddle a batch boundary.
    seen = set()
    unique: List[EntityWithAttr] = []
    for item in merged:
        key = item.entity.strip().lower()
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique

In [105]:
# RELATION EXTRACTION
# Schema for one (subject, predicate, object) triple produced by relation
# extraction. triples_to_mermaid reads the .subj, .pred and .obj attributes.
# The field descriptions ask the model to reuse entity strings exactly as they
# appear in the deduplicated list.
class Relation(BaseModel):
    subj: str = Field(description="subject entity (exact string as in deduplicated list)")
    pred: str = Field(description="short predicate / relation phrase")
    obj:  str = Field(description="object entity (exact string as in deduplicated list)")

# DSPy signature for the relation step: the original paragraph plus the list of
# deduplicated entity strings in, a list of Relation triples out.
# NOTE(review): the docstring and `desc` strings are runtime text (presumably
# sent to the LM as instructions) - treat edits to them as behavior changes.
class ExtractRelations(dspy.Signature):
    """Given the original paragraph and a list of unique entities, extract all factual (subject, predicate, object) triples that are explicitly stated or clearly implied."""
    paragraph: str = dspy.InputField(desc="original paragraph")
    entities:  List[str] = dspy.InputField(desc="list of deduplicated entity strings")
    relations: List[Relation] = dspy.OutputField(desc="list of subject-predicate-object triples")

# Module-level ChainOfThought predictor for the signature above; the main loop
# calls it once per URL.
rel_predictor = dspy.ChainOfThought(ExtractRelations)

In [106]:
import re

In [107]:
# MERMAID SERIALISER
def triples_to_mermaid(triples: "list[Relation]", entity_list: list[str], max_label_len: int = 40) -> str:
    """Convert triples to a VALID Mermaid ``flowchart LR`` diagram.

    A triple is drawn when its subject or its object (compared case-insensitively,
    ignoring surrounding whitespace) appears in ``entity_list``; triples touching
    no known entity, or with a blank subject/object, are skipped.

    Args:
        triples: Objects exposing ``subj``, ``pred`` and ``obj`` string attributes.
        entity_list: Known entity strings used to filter the triples.
        max_label_len: Edge labels longer than this are truncated and end in "...".

    Returns:
        The diagram source: a ``flowchart LR`` header followed by one edge per line.
        With no drawable triples only the header is returned.
    """
    known = {e.strip().lower() for e in entity_list}
    lines = ["flowchart LR"]
    nodes: dict[str, str] = {}  # normalised entity text -> 'id["display text"]'
    taken: set[str] = set()     # node ids already handed out

    def _escape(text: str) -> str:
        # Collapse newlines/runs of whitespace and replace the characters that
        # would terminate a quoted Mermaid string or an |edge label| with
        # Mermaid entity codes. '#' goes first so the codes are not re-escaped.
        text = " ".join(text.split())
        return text.replace("#", "#35;").replace('"', "#quot;").replace("|", "#124;")

    def _node(text: str) -> str:
        # One node per entity regardless of casing, so "Ivermectin" and
        # "ivermectin" do not become two boxes.
        key = text.strip().lower()
        if key not in nodes:
            # Fall back to a generic id when nothing alphanumeric survives
            # (e.g. non-Latin entity names) instead of dropping the edge.
            base = re.sub(r'[^a-zA-Z0-9]', '', text) or "node"
            if base.lower() == "end":
                # "end" is a reserved word in Mermaid flowcharts.
                base = "n" + base
            node_id, suffix = base, 2
            while node_id in taken:
                # Distinct entities such as "A-B" and "AB" must not share an id.
                node_id = f"{base}{suffix}"
                suffix += 1
            taken.add(node_id)
            nodes[key] = f'{node_id}["{_escape(text)}"]'
        return nodes[key]

    for t in triples:
        subj_key, obj_key = t.subj.strip().lower(), t.obj.strip().lower()
        if not subj_key or not obj_key:
            continue
        # Keep the triple when at least one endpoint is a known entity.
        if subj_key not in known and obj_key not in known:
            continue

        label = t.pred.strip()
        if len(label) > max_label_len:
            label = label[:max(max_label_len - 3, 0)] + "..."

        # An empty |label| is a syntax error in Mermaid, so use a bare arrow then.
        arrow = f'-->|"{_escape(label)}"|' if label else "-->"
        lines.append(f"    {_node(t.subj)} {arrow} {_node(t.obj)}")

    return "\n".join(lines)

In [108]:
! pip install beautifulsoup4 trafilatura



In [109]:
from bs4 import BeautifulSoup
import os
import time

In [110]:
# MANUAL OVERRIDES
# ScienceDirect (URLs 3 and 7) aggressively blocks the Google Colab IP address (HTTP 403).
# Both scraping methods fail for these pages: (1) requests + BeautifulSoup and (2) trafilatura.
# Maps a URL to hand-copied text. scrape_url checks this dict first and returns
# the stored text as-is (including the surrounding whitespace of the literal)
# instead of fetching the page.
MANUAL_OVERRIDES = {
    # URL 3 in URLS
    "https://www.sciencedirect.com/science/article/pii/S1043661820315152": """
    Ivermectin is a macrolide antiparasitic drug with a 16-membered ring that is widely used for the treatment of many parasitic diseases such as river blindness, elephantiasis and scabies. Satoshi ōmura and William C. Campbell won the 2015 Nobel Prize in Physiology or Medicine for the discovery of the excellent efficacy of ivermectin against parasitic diseases. Recently, ivermectin has been reported to inhibit the proliferation of several tumor cells by regulating multiple signaling pathways. This suggests that ivermectin may be an anticancer drug with great potential. Here, we reviewed the related mechanisms by which ivermectin inhibited the development of different cancers and promoted programmed cell death and discussed the prospects for the clinical application of ivermectin as an anticancer drug for neoplasm therapy.
    """,

    # URL 7 in URLS
    "https://www.sciencedirect.com/science/article/pii/S0378378220307088": """
    There is a significant relationship between ambient temperature and mortality. In healthy individuals with no underlying co-morbid conditions, there is an efficient heat regulation system which enables the body to effectively handle thermal stress. However, in vulnerable groups, especially in elderly over the age of 65 years, infants and individuals with co-morbid cardiovascular and/or respiratory conditions, there is a deficiency in thermoregulation. When temperatures exceed a certain limit, being cold winter spells or heat waves, there is an increase in the number of deaths. In particular, it has been shown that at temperatures above 27 °C, the daily mortality rate increases more rapidly per degree rise compared to when it drops below 27 °C.
    """
}

In [111]:
import requests
from bs4 import BeautifulSoup
import trafilatura

def scrape_url(url, max_paragraphs=7, *, timeout=10, min_paragraph_chars=50, max_chars=4000):
    """Return article text for ``url``, or "" when nothing could be extracted.

    Sources are tried in order and the first one that yields text wins:
      1. ``MANUAL_OVERRIDES`` - hand-copied text for pages that block scraping.
      2. ``requests`` + BeautifulSoup - the first ``max_paragraphs`` ``<p>``
         elements whose text is longer than ``min_paragraph_chars``, joined by spaces.
      3. ``trafilatura`` - the extracted text truncated to ``max_chars`` characters.

    The function is best-effort: network and parsing errors are reported on
    stdout and the next source is tried; they are never raised to the caller.

    Args:
        url: Page to fetch.
        max_paragraphs: Maximum number of paragraphs kept from the BeautifulSoup path.
        timeout: Timeout in seconds passed to ``requests.get``.
        min_paragraph_chars: Paragraphs whose raw text is not longer than this are ignored.
        max_chars: Maximum number of characters kept from the trafilatura path.

    Returns:
        The extracted text, or an empty string if every source failed.
    """
    # 1. Check Manual Override first
    if url in MANUAL_OVERRIDES:
        print("Using Manual Override text.")
        return MANUAL_OVERRIDES[url]

    print("Scraping...")

    # 2. Try Requests+BS4
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.content, "html.parser")
            paras = [p.text.strip() for p in soup.find_all("p") if len(p.text) > min_paragraph_chars]
            if paras:
                return " ".join(paras[:max_paragraphs])
        else:
            print(f"  requests returned HTTP {resp.status_code}; trying trafilatura.")
    except Exception as exc:
        # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
        print(f"  requests/BeautifulSoup failed ({exc}); trying trafilatura.")

    # 3. Try Trafilatura
    try:
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            text = trafilatura.extract(downloaded)
            if text:
                return text[:max_chars]
    except Exception as exc:
        print(f"  trafilatura failed ({exc}).")

    return ""

In [112]:
# URLs to scrape, in processing order. The main loop numbers its output files
# by position in this list (mermaid_1.md for the first URL, and so on).
# Entries 3 and 7 (ScienceDirect) have hand-copied text in MANUAL_OVERRIDES.
URLS = [
    "https://en.wikipedia.org/wiki/Sustainable_agriculture",
    "https://www.nature.com/articles/d41586-025-03353-5",
    "https://www.sciencedirect.com/science/article/pii/S1043661820315152",
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/",
    "https://www.fao.org/3/y4671e/y4671e06.htm",
    "https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria",
    "https://www.sciencedirect.com/science/article/pii/S0378378220307088",
    "https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets",
    "https://www.medscape.com/viewarticle/second-dose-boosts-shingles-protection-adults-aged-65-years-2025a1000ro7",
    "https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india"
]

In [113]:
import pandas as pd

In [114]:
# MAIN EXECUTION LOOP
# For every URL: scrape -> extract entities -> deduplicate -> extract relations
# -> write a Mermaid diagram. Entity rows are collected in `csv_rows` for the
# CSV export in the next cell. A failure in one URL never aborts the others.
extractor = dspy.Predict(ExtractEntities)
csv_rows = []  # reset on every run so re-executing the cell does not duplicate rows

print("Starting Processing Pipeline...\n")

for i, url in enumerate(URLS):
    # Progress total follows the list length instead of a hard-coded 10.
    print(f"[{i+1}/{len(URLS)}] Processing: {url}")
    text = scrape_url(url)

    if not text:
        print("Empty text or scrape error. Skipping.")
        continue

    # A. Extract
    try:
        pred_extract = extractor(paragraph=text)
        raw_entities = pred_extract.entities
    except Exception as e:
        print(f"Extraction error: {e}")
        continue

    if not raw_entities:
        print("No entities found.")
        continue

    # B. Deduplicate
    # Guarded like the other LM calls: on failure keep the raw entities rather
    # than crashing the whole run.
    try:
        unique_entities_objs = deduplicate_with_lm(raw_entities)
    except Exception as e:
        print(f"Deduplication error: {e}. Using raw entities.")
        unique_entities_objs = list(raw_entities)
    unique_entity_strings = [e.entity for e in unique_entities_objs]

    # Add to CSV Data (one row per distinct entity string for this URL)
    seen_tags = set()
    for e in unique_entities_objs:
        if e.entity not in seen_tags:
            csv_rows.append({"link": url, "tag": e.entity, "tag_type": e.attr_type})
            seen_tags.add(e.entity)

    # C. Extract Relations
    try:
        pred_rel = rel_predictor(paragraph=text, entities=unique_entity_strings)
        # Treat a missing relations field as "no triples".
        triples = pred_rel.relations or []
    except Exception as e:
        print(f"Relation error: {e}")
        triples = []

    # D. Generate Mermaid
    mermaid_content = triples_to_mermaid(triples, unique_entity_strings)
    filename = f"mermaid_{i+1}.md"
    # Explicit UTF-8: entity names may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to handle them.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(mermaid_content)
    print(f"Saved {filename}")

Starting Processing Pipeline...

[1/10] Processing: https://en.wikipedia.org/wiki/Sustainable_agriculture
Scraping...
Saved mermaid_1.md
[2/10] Processing: https://www.nature.com/articles/d41586-025-03353-5
Scraping...
Saved mermaid_2.md
[3/10] Processing: https://www.sciencedirect.com/science/article/pii/S1043661820315152
Using Manual Override text.
Saved mermaid_3.md
[4/10] Processing: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/
Scraping...
Saved mermaid_4.md
[5/10] Processing: https://www.fao.org/3/y4671e/y4671e06.htm
Scraping...
Saved mermaid_5.md
[6/10] Processing: https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria
Scraping...
Saved mermaid_6.md
[7/10] Processing: https://www.sciencedirect.com/science/article/pii/S0378378220307088
Using Manual Override text.
Saved mermaid_7.md
[8/10] Processing: https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets
Scraping...
Saved mermaid_8.md
[9/10] Processing: 

In [115]:
# Build the tag table from the rows collected by the main loop and export it.
# Columns are passed explicitly so the frame (and tags.csv) keeps its
# link/tag/tag_type header even when no rows were collected.
df = pd.DataFrame(csv_rows, columns=["link", "tag", "tag_type"]).drop_duplicates(subset=['link', 'tag'])
df.to_csv("tags.csv", index=False)
print("\nPipeline Complete!")
print("Generated files:")
print(" tags.csv")
# NOTE(review): URLs skipped by the main loop produce no mermaid file, so this
# range is an upper bound rather than a guarantee.
print(f"mermaid_1.md ... mermaid_{len(URLS)}.md")
print("\nSample CSV Data:")
print(df.head())


Pipeline Complete!
Generated files:
 tags.csv
mermaid_1.md ... mermaid_10.md

Sample CSV Data:
                                                link  \
0  https://en.wikipedia.org/wiki/Sustainable_agri...   
1  https://en.wikipedia.org/wiki/Sustainable_agri...   
2  https://en.wikipedia.org/wiki/Sustainable_agri...   
3  https://en.wikipedia.org/wiki/Sustainable_agri...   
4  https://en.wikipedia.org/wiki/Sustainable_agri...   

                        tag             tag_type  
0   sustainable agriculture              Concept  
1        ecosystem services              Concept  
2            climate change  Environmental Issue  
3  greenhouse gas emissions  Environmental Issue  
4            water scarcity  Environmental Issue  
