In [1]:
%pip install -Uq "smolagents[mcp,litellm,openai]" huggingface_hub python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

True

## Get Research Plan

In [3]:
PLANNER_SYSTEM_INSTRUCTIONS= """
You will be given a research task by a user. Your job is to produce a set of
instructions for a researcher that will complete the task. Do NOT complete the
task yourself, just provide instructions on how to complete it.

GUIDELINES:
1. Maximize specificity and detail. Include all known user preferences and
   explicitly list key attributes or dimensions to consider.
2. If essential attributes are missing, explicitly state that they are open-ended.
3. Avoid unwarranted assumptions. Treat unspecified dimensions as flexible.
4. Use the first person (from the user's perspective).
5. When helpful, explicitly ask the researcher to include tables.
6. Include the expected output format (e.g. structured report with headers).
7. Preserve the input language unless the user explicitly asks otherwise.
8. Sources: prefer primary / official / original sources.
"""


from huggingface_hub import InferenceClient

def generate_research_plan(user_query: str) -> str:
    MODEL_ID = "Qwen/Qwen3-Next-80B-A3B-Thinking"
    PROVIDER = "auto"

    print("Generating the research plan for the query: ", user_query)
    print("MODEL: ", MODEL_ID)
    print("PROVIDER: ", PROVIDER)

    planner_client = InferenceClient(
        api_key=os.environ["HF_TOKEN"],
        provider=PROVIDER,
    )
    completion = planner_client.chat.completions.create(
        model=MODEL_ID,
        messages=[
            {"role": "system", "content": PLANNER_SYSTEM_INSTRUCTIONS},
            {"role": "user", "content": user_query},
        ],
    )

    research_plan = completion.choices[0].message.content

    print("\033[93mGenerated Research Plan\033[0m")
    print(f"\033[93m{research_plan}\033[0m")

    return research_plan

research_plan = generate_research_plan("research about the climate in northern france")

Generating the research plan for the query:  research about the climate in northern france
MODEL:  Qwen/Qwen3-Next-80B-A3B-Thinking
PROVIDER:  auto
[93mGenerated Research Plan[0m
[93m### Research Instructions: Climate Analysis of Northern France  

**Objective**:  
Produce a detailed, evidence-based report on the current climate characteristics of northern France, using primary sources and structured analysis. The report must clearly define the geographical scope, analyze key climate metrics, and cite all data sources.  

---

### Step 1: Define Geographical Scope (Critical First Step)  
- **Clarify "northern France"**:  
  - Use **administrative regions** as the standard definition. Include:  
    - **Hauts-de-France** (departments: Nord, Pas-de-Calais, Somme, Aisne, Oise)  
    - **Normandy** (departments: Seine-Maritime, Eure, Calvados, Manche, Orne)  
    - **Clarify exclusions**: Exclude Grand Est (e.g., Alsace, Lorraine) unless explicitly referenced in sources. If sources conf

## Split the task into sub-tasks

In [None]:
import json
from pydantic import BaseModel, Field
from typing import List
from pprint import pprint

TASK_SPLITTER_SYSTEM_INSTRUCTIONS = """
You will be given a set of research instructions (a research plan).
Your job is to break this plan into a set of coherent, non-overlapping
subtasks that can be researched independently by separate agents.

Requirements:
- For the number of subtasks, use your judgment (3 - 12 is ideal). 
- The number of subtasks should cover the full scope of the research plan.
- Each subtask should have:
  - an 'id' (short string),
  - a 'title' (short descriptive title),
  - a 'description' (clear, detailed instructions for the sub-agent).
- Subtasks should collectively cover the full scope of the original plan
  without unnecessary duplication.
- Prefer grouping by dimensions: time periods, regions, actors, themes,
  causal mechanisms, etc., depending on the topic.
- Each description should be very clear and detailed about everything that
  the agent needs to research to cover that topic.
- Do not include a final task that will put everything together.
  This will be done later in another step.

Output format:
Return ONLY valid JSON with this schema:

{
  "subtasks": [
    {
      "id": "string",
      "title": "string",
      "description": "string"
    }
  ]
}
"""

class Subtask(BaseModel):
    id: str = Field(
        ...,
        description="Short identifier for the subtask (e.g. 'A', 'history', 'drivers').",
    )
    title: str = Field(
        ...,
        description="Short descriptive title of the subtask.",
    )
    description: str = Field(
        ...,
        description="Clear, detailed instructions for the sub-agent that will research this subtask.",
    )

class SubtaskList(BaseModel):
    subtasks: List[Subtask] = Field(
        ...,
        description="List of subtasks that together cover the whole research plan.",
    )

TASK_SPLITTER_JSON_SCHEMA = {
    "name": "subtaskList",
    "schema": SubtaskList.model_json_schema(),
    "strict": True,
}


def _extract_json(text: str) -> str:
    text = text.strip()
    if text.startswith("{") and text.endswith("}"):
        return text
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end != -1 and end > start:
        return text[start:end + 1]
    return ""


def split_into_subtasks(research_plan: str):

    MODEL_ID = "zai-org/GLM-4.7"
    PROVIDER = "novita"

    print("Splitting the research plan into subtasks...")
    print("MODEL: ", MODEL_ID)
    print("PROVIDER: ", PROVIDER)

    client = InferenceClient(
      api_key=os.environ["HF_TOKEN"],
      provider=PROVIDER,
    )

    completion = client.chat.completions.create(
        model=MODEL_ID,
        messages=[
            {"role": "system", "content": TASK_SPLITTER_SYSTEM_INSTRUCTIONS},
            {"role": "user", "content": research_plan},
        ],
        response_format={
            "type": "json_schema",
            "json_schema": TASK_SPLITTER_JSON_SCHEMA,
        },
        temperature=0.2,
    )

    message = completion.choices[0].message
    print(message)

    content = (message.content or "").strip()
    if not content:
        raise ValueError("Empty model response. Try a different provider/model or rerun the cell.")

    try:
        payload = json.loads(content)
    except json.JSONDecodeError:
        extracted = _extract_json(content)
        if not extracted:
            raise
        payload = json.loads(extracted)

    subtasks = payload["subtasks"]

    print("\033[93mGenerated The Following Subtasks\033[0m")
    for task in subtasks:
      print(f"\033[93m{task['title']}\033[0m")
      pprint(f"\033[93m{task['description']}\033[0m")
      print()

    return subtasks

subtasks = split_into_subtasks(research_plan)

Splitting the research plan into subtasks...
MODEL:  zai-org/GLM-4.7
PROVIDER:  novita
ChatCompletionOutputMessage(role='assistant', content='{\n  "subtasks": [\n    {\n      "id": "scope_definition",\n      "title": "Define Geographical Scope and Administrative Boundaries",\n      "description": "Define the geographical scope of \'northern France\' using administrative regions as the standard. Specifically, include the regions of Hauts-de-France (departments: Nord, Pas-de-Calais, Somme, Aisne, Oise) and Normandy (departments: Seine-Maritime, Eure, Calvados, Manche, Orne). Explicitly exclude Grand Est (e.g., Alsace, Lorraine) unless primary sources necessitate its inclusion. Justify this scope by citing official French government or Météo-France definitions. If sources use inconsistent boundaries (e.g., including parts of Île-de-France), document this discrepancy and prioritize the most authoritative sources."\n    },\n    {\n      "id": "primary_data_collection",\n      "title": "Coll

In [20]:
subtasks

[{'id': 'scope_definition',
  'title': 'Define Geographical Scope and Administrative Boundaries',
  'description': "Define the geographical scope of 'northern France' using administrative regions as the standard. Specifically, include the regions of Hauts-de-France (departments: Nord, Pas-de-Calais, Somme, Aisne, Oise) and Normandy (departments: Seine-Maritime, Eure, Calvados, Manche, Orne). Explicitly exclude Grand Est (e.g., Alsace, Lorraine) unless primary sources necessitate its inclusion. Justify this scope by citing official French government or Météo-France definitions. If sources use inconsistent boundaries (e.g., including parts of Île-de-France), document this discrepancy and prioritize the most authoritative sources."},
 {'id': 'primary_data_collection',
  'title': 'Collect Primary Climate Data for Key Cities',
  'description': "Extract current climate data (1991–2020 normals) from primary sources, specifically Météo-France's 'Climatologie' portal, for the key cities of Lill

## Create subagents & coordinator

In this step, create a tool that launches a dedicated sub-agent for each subtask. The Coordinator agent will receive this tool and call it whenever a new subtask needs processing. Each sub-agent will conduct detailed research on its assigned task and return its results upon completion. The Coordinator will then combine all of the sub-agent outputs into a single, comprehensive research report.

In [21]:
SUBAGENT_PROMPT_TEMPLATE = """
You are a specialized research sub-agent.

Global user query:
{user_query}

Overall research plan:
{research_plan}

Your specific subtask (ID: {subtask_id}, Title: {subtask_title}) is:

\"\"\"{subtask_description}\"\"\"

Instructions:
- Focus ONLY on this subtask, but keep the global query in mind for context.
- Use the available tools to search for up-to-date, high-quality sources.
- Prioritize primary and official sources when possible.
- Be explicit about uncertainties, disagreements in the literature, and gaps.
- Return your results as a MARKDOWN report with this structure:

# [Subtask ID] [Subtask Title]

## Summary
Short overview of the main findings.

## Detailed Analysis
Well-structured explanation with subsections as needed.

## Key Points
- Bullet point
- Bullet point

## Sources
- [Title](url) - short comment on why this source is relevant

Now perform the research and return ONLY the markdown report.
"""

In [22]:
COORDINATOR_PROMPT_TEMPLATE = """
You are the LEAD RESEARCH COORDINATOR AGENT.

The user has asked:
\"\"\"{user_query}\"\"\"

A detailed research plan has already been created:

\"\"\"{research_plan}\"\"\"

This plan has been split into the following subtasks (JSON):

```json
{subtasks_json}
```
Each element has the shape:
{{
“id”: “timeframe_confirmation”,
“title”: “Confirm Research Scope Parameters”,
“description”: “Analyze the scope parameters…”
}}

You have access to a tool called:
• initialize_subagent(subtask_id: str, subtask_title: str, subtask_description: str)

Your job:
1. For EACH subtask in the JSON array, call initialize_subagent exactly once
with:
• subtask_id       = subtask[“id”]
• subtask_title    = subtask[“title”]
• subtask_description = subtask[“description”]
2. Wait for all sub-agent reports to come back. Each tool call returns a
markdown report for that subtask.
3. After you have results for ALL subtasks, synthesize them into a SINGLE,
coherent, deeply researched report addressing the original user query
("{user_query}").

Final report requirements:
• Integrate all sub-agent findings; avoid redundancy.
• Make the structure clear with headings and subheadings.
• Highlight:
• key drivers and mechanisms of insecurity,
• historical and temporal evolution,
• geographic and thematic patterns,
• state capacity, public perception, and socioeconomic correlates,
• open questions and uncertainties.
• Include final sections:
• Open Questions and Further Research
• Bibliography / Sources: merge and deduplicate the key sources from all sub-agents.

Important:
• DO NOT expose internal tool-call mechanics to the user.
• Your final answer to the user should be a polished markdown report.
"""


In [None]:
from smolagents import InferenceClientModel, MCPClient, tool, ToolCallingAgent
import os

FIRECRAWL_API_KEY = os.environ["FIRECRAWL_API_KEY"]
MCP_URL = f"https://mcp.firecrawl.dev/{FIRECRAWL_API_KEY}/v2/mcp"

COORDINATOR_MODEL_ID = "MiniMaxAI/MiniMax-M1-80k"
SUBAGENT_MODEL_ID = "MiniMaxAI/MiniMax-M1-80k"
COORDINATOR_MODEL_PROVIDER = "novita"
SUBAGENT_MODEL_PROVIDER = "novita"

def run_deep_research(user_query: str) -> str:
    print("Running the deep research...")

    # 1) Generate research plan
    research_plan = generate_research_plan(user_query)

    # 2) Split into explicit subtasks
    subtasks = split_into_subtasks(research_plan)

    print("Initializing Coordinator")
    print("Coordinator Model: ", COORDINATOR_MODEL_ID)
    print("Subagent Model: ", SUBAGENT_MODEL_ID)

    coordinator_model = InferenceClientModel(
        model_id=COORDINATOR_MODEL_ID,
        api_key=os.environ["HF_TOKEN"],
        provider=COORDINATOR_MODEL_PROVIDER,
        )
    subagent_model = InferenceClientModel(
        model_id=SUBAGENT_MODEL_ID,
        api_key=os.environ["HF_TOKEN"],
        provider=SUBAGENT_MODEL_PROVIDER,
        )

    with MCPClient({"url": MCP_URL, "transport": "streamable-http"}) as mcp_tools:
        @tool
        def initialize_subagent(subtask_id: str, subtask_title: str, subtask_description: str) -> str:
            """
           Spawn a dedicated research sub-agent for a single subtask.

            Args:
                subtask_id (str): The unique identifier for the subtask.
                subtask_title (str): The descriptive title of the subtask.
                subtask_description (str): Detailed instructions for the sub-agent to perform the subtask.

            The sub-agent:
            - Has access to the Firecrawl MCP tools.
            - Must perform deep research ONLY on this subtask.
            - Returns a structured markdown report with:
              - a clear heading identifying the subtask,
              - a narrative explanation,
              - bullet-point key findings,
              - explicit citations / links to sources.
            """
            print(f"Initializing Subagent for task {subtask_id}...")

            subagent = ToolCallingAgent(
                tools=mcp_tools,                # Firecrawl MCP toolkit
                model=subagent_model,
                add_base_tools=False,
                name=f"subagent_{subtask_id}",
            )
            subagent_prompt = SUBAGENT_PROMPT_TEMPLATE.format(
                user_query=user_query,
                research_plan=research_plan,
                subtask_id=subtask_id,
                subtask_title=subtask_title,
                subtask_description=subtask_description,
            )

            return subagent.run(subagent_prompt)

        coordinator = ToolCallingAgent(
            tools=[initialize_subagent],
            model=coordinator_model,
            add_base_tools=False,
            name="coordinator_agent"
        )

        subtasks_json = json.dumps(subtasks, indent=2, ensure_ascii=False)

        coordinator_prompt = COORDINATOR_PROMPT_TEMPLATE.format(
            user_query=user_query,
            research_plan=research_plan,
            subtasks_json=subtasks_json,
        )

        final_report = coordinator.run(coordinator_prompt)

        return final_report
