In [None]:
import json
import regex as re
from crawl4ai import AsyncWebCrawler
import transformers
import torch

# MODEL ZOO
MODEL_ZOO = {
    "mistral": "mistralai/Mistral-7B-Instruct-v0.2",
    "llama": "meta-llama/Llama-2-7b-chat-hf",
    "qwen": "Qwen/Qwen2.5-7B-Instruct",
    "phi2": "HuggingFaceTB/SmolLM2-1.7B-Instruct"
}

# Load model/tokenizer once
def load_model(model_name: str):
    device = 0 if torch.cuda.is_available() else -1
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.eos_token_id

    pipe = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer
    )
    return pipe

async def fetch_markdown(url: str) -> str:
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url)
        return result.markdown
    
# Extraction function
def extract_structured_info(pipe, text: str, temp, top_p) -> dict:
    prompt = text


    output = pipe(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=temp,
        top_p=top_p,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3
    )[0]['generated_text']
    
    # Extract first valid JSON block
    match = re.search(r'\{(?:[^{}]|(?R))*\}', output, re.DOTALL)
    if match:
        try:
            return json.loads(match.group()), output
        except json.JSONDecodeError:
            print("[ERROR] JSON decoding failed. Raw match:\n", match.group())
    else:
        print("[ERROR] No JSON object found in output.\n", output)
    return {}, output



In [3]:
pipe = load_model(MODEL_ZOO["phi2"])

Device set to use cuda:0


In [151]:
import nest_asyncio

nest_asyncio.apply()

# Now safe to call await
markdown = await fetch_markdown("https://de.wikipedia.org/wiki/K%C3%BCnstliche_Intelligenz")


In [150]:
temp = 0.9
top_p = 0.8
prompt = (
    "You are given the raw markdown content of a website.\n"
    "Analyze the content and extract the most meaningful and relevant structured information.\n"
    "Respond only with a single JSON object containing fields that make sense based on the content.\n"
    "Do not explain anything. Only return the JSON object."
)
result = extract_structured_info(pipe, prompt + "\n\n" + markdown[2000:4000], temp=1.0, top_p=0.95)
print(json.dumps(result, indent=2, ensure_ascii=False))



result, output = extract_structured_info(pipe, prompt, temp, top_p)

import json
print(json.dumps(result, indent=2, ensure_ascii=False))

[ERROR] JSON decoding failed. Raw match:
 {| class="wikitable"
     |+ {{infobox}}
     +|- name=[[SpecialPage(MeineWerke_E)|Konstlіchtete_Zeichentafel]]
     +|}
[
  {},
  "You are given the raw markdown content of a website.\nAnalyze the content and extract the most meaningful and relevant structured information.\nRespond only with a single JSON object containing fields that make sense based on the content.\nDo not explain anything. Only return the JSON object.\n\nate&wmf_medium=sidebar&wmf_campaign=de.wikipedia.org&uselang=de)\n  * [Benutzerkonto erstellen](https://de.wikipedia.org/w/index.php?title=Spezial:Benutzerkonto_anlegen&returnto=K%C3%BCnstliche+Intelligenz \"Wir ermutigen dich dazu, ein Benutzerkonto zu erstellen und dich anzumelden. Es ist jedoch nicht zwingend erforderlich.\")\n  * [Anmelden](https://de.wikipedia.org/w/index.php?title=Spezial:Anmelden&returnto=K%C3%BCnstliche+Intelligenz \"Anmelden ist zwar keine Pflicht, wird aber gerne gesehen. \\[alt-shift-o\\]\")\n\n\