In [1]:
# Install crewai-tools with its MCP extra (provides MCPServerAdapter used further down).
!uv pip install 'crewai-tools[mcp]'

[2mUsing Python 3.12.11 environment at: /usr[0m
[2K[2mResolved [1m206 packages[0m [2min 2.08s[0m[0m
[2K[2mPrepared [1m80 packages[0m [2min 8.11s[0m[0m
[2mUninstalled [1m4 packages[0m [2min 19ms[0m[0m
[2K[2mInstalled [1m80 packages[0m [2min 508ms[0m[0m
 [32m+[39m [1malembic[0m[2m==1.16.5[0m
 [32m+[39m [1manthropic[0m[2m==0.64.0[0m
 [32m+[39m [1mappdirs[0m[2m==1.4.4[0m
 [32m+[39m [1masgiref[0m[2m==3.9.1[0m
 [32m+[39m [1mbackoff[0m[2m==2.2.1[0m
 [32m+[39m [1mbcrypt[0m[2m==4.3.0[0m
 [32m+[39m [1mbrowserbase[0m[2m==1.4.0[0m
 [32m+[39m [1mchroma-hnswlib[0m[2m==0.7.6[0m
 [32m+[39m [1mchromadb[0m[2m==0.5.23[0m
 [32m+[39m [1mcohere[0m[2m==5.17.0[0m
 [32m+[39m [1mcoloredlogs[0m[2m==15.0.1[0m
 [32m+[39m [1mcrewai[0m[2m==0.175.0[0m
 [32m+[39m [1mcrewai-tools[0m[2m==0.65.0[0m
 [32m+[39m [1mdataclasses-json[0m[2m==0.6.7[0m
 [32m+[39m [1mdeprecation[0m[2m==2.1.0[0m
 [32m+[39m [1md

In [2]:
!pip install -U transformers kernels torch

Collecting kernels
  Downloading kernels-0.9.0-py3-none-any.whl.metadata (3.3 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading kernels-0.9.0-py3-none-any.whl (37 kB)
Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, kernels
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chromadb 0.5.23 requires tokenizers<=0.20.3,>=0.13.2, but you have tokenizers 0.21.4 

In [7]:
from mcp.server.fastmcp import FastMCP
from typing import Optional
import pandas as pd
import json
import os
import asyncio
import threading
from concurrent.futures import ThreadPoolExecutor

import nest_asyncio
# Patch asyncio via nest_asyncio (intended to allow nested event-loop use inside the notebook kernel).
nest_asyncio.apply()

# Server instance on which every @mcp.tool() function below is registered.
# The port matches the client URL used later in this notebook (http://localhost:5000/sse).
mcp = FastMCP('medical-data-server', port=5000)

# Excel workbook that load_data() reads and save_data() overwrites.
DATA_FILE = "/content/Benchmark_Report_Analyzer.xlsx"  # Your medical data file


def load_data():
    """Read the report workbook at DATA_FILE into a DataFrame.

    Returns:
        The contents of the Excel file. When the file does not exist, an
        empty DataFrame with the columns ReportId, CPT, ReportContent and
        TreatmentPlan is returned instead.
    """
    if os.path.exists(DATA_FILE):
        return pd.read_excel(DATA_FILE)
    expected_columns = ["ReportId", "CPT", "ReportContent", "TreatmentPlan"]
    return pd.DataFrame(columns=expected_columns)


def save_data(df: pd.DataFrame):
    """Overwrite the workbook at DATA_FILE with ``df``, leaving out the index column."""
    df.to_excel(excel_writer=DATA_FILE, index=False)


# ------------------ Tools ------------------

@mcp.tool()
def search_reports(query: str) -> str:
    """Search reports by ReportId, CPT, or report content text."""
    # The docstring above is kept verbatim: FastMCP is expected to publish it
    # as the tool description shown to the calling model.
    #
    # Returns a JSON array (orient="records") of every row in which at least
    # one column, rendered as text, contains `query` (case-insensitive).
    df = load_data()
    as_text = df.astype(str)
    # regex=False: treat the query as a literal substring. With the default
    # regex matching, input such as "(" raises a regex error and "." or "+"
    # silently match more (or less) than the text the caller typed.
    row_matches = as_text.apply(
        lambda column: column.str.contains(query, case=False, na=False, regex=False)
    ).any(axis=1)
    return df[row_matches].to_json(orient="records")


@mcp.tool()
def delete_report(report_id: str) -> str:
    """Delete a patient report by ReportId."""
    # Returns a JSON object with "status" ("success" or "error") and "message".
    # ReportId values are compared as text, so numeric ids read from Excel
    # still match the string argument.
    reports = load_data()
    is_target = reports["ReportId"].astype(str) == report_id
    if not is_target.any():
        return json.dumps({"status": "error", "message": "ReportId not found"})
    # Keep every row except the matching one(s) and write the workbook back.
    save_data(reports[~is_target])
    return json.dumps({"status": "success", "message": f"Report {report_id} deleted"})


@mcp.tool()
def add_treatment_plan(report_id: str, treatment_plan: str) -> str:
    """Add or update a treatment plan for a patient report."""
    # Returns a JSON object with "status" ("success" or "error") and "message".
    df = load_data()
    matches = df["ReportId"].astype(str) == report_id
    if not matches.any():
        return json.dumps({"status": "error", "message": "ReportId not found"})
    # A TreatmentPlan column that holds no text yet is read back from Excel as
    # float64 (all NaN). Assigning a str into such a column triggers pandas'
    # incompatible-dtype deprecation, so make the column object-typed first.
    if "TreatmentPlan" in df.columns:
        df["TreatmentPlan"] = df["TreatmentPlan"].astype(object)
    df.loc[matches, "TreatmentPlan"] = treatment_plan
    save_data(df)
    return json.dumps({"status": "success", "message": f"Treatment plan updated for {report_id}"})


@mcp.tool()
def add_new_report(report_id: str, cpt: str, content: str) -> str:
    """Add a completely new patient report."""
    # Returns a JSON object with "status" ("success" or "error") and "message".
    reports = load_data()
    known_ids = reports["ReportId"].astype(str).values
    if report_id in known_ids:
        return json.dumps({"status": "error", "message": "ReportId already exists"})
    # New reports start with an empty treatment plan.
    addition = pd.DataFrame(
        [{"ReportId": report_id, "CPT": cpt, "ReportContent": content, "TreatmentPlan": ""}]
    )
    save_data(pd.concat([reports, addition], ignore_index=True))
    return json.dumps({"status": "success", "message": f"Report {report_id} added"})


def run_server_in_thread():
    """Run the MCP server in a separate thread with its own event loop.

    Returns:
        The started daemon ``threading.Thread`` that hosts the server.
    """
    def server_thread():
        # Install a fresh loop as this thread's current loop, as before, for
        # any library code that looks up the current loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            # FastMCP.run() is a blocking, synchronous call that returns None.
            # It must not be wrapped in loop.run_until_complete(): that only
            # receives the None return value once the server stops and then
            # raises TypeError.
            mcp.run(transport="sse")
        except Exception as e:
            print(f"Server error: {e}")
        finally:
            loop.close()

    # Daemon thread: it does not keep the process alive on shutdown.
    thread = threading.Thread(target=server_thread, daemon=True)
    thread.start()
    print("Server is running in background thread...")
    return thread


# Alternative method using ThreadPoolExecutor
def run_server_with_executor():
    """Run the MCP server using ThreadPoolExecutor.

    Returns:
        The ``concurrent.futures.Future`` of the server call; it completes
        (or carries the exception) when the server stops.
    """
    def run_async_server():
        # Install a fresh loop as the worker thread's current loop, as before.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            # FastMCP.run() is synchronous and returns None; passing that to
            # loop.run_until_complete() raised TypeError once the server
            # stopped, so the call is made directly.
            return mcp.run(transport="sse")
        finally:
            loop.close()

    executor = ThreadPoolExecutor(max_workers=1)
    future = executor.submit(run_async_server)
    # Already-submitted work still runs; this only lets the worker thread
    # exit once the server returns instead of idling forever.
    executor.shutdown(wait=False)
    print("Server is running in background thread...")
    return future


# Method for direct Jupyter cell execution
async def start_server_jupyter():
    """Start server directly in Jupyter (use in a cell with await).

    Blocks the awaiting cell until the server stops.
    """
    # FastMCP.run() is the synchronous entry point and returns None, so it
    # cannot be awaited; run_sse_async() is the coroutine that serves SSE.
    await mcp.run_sse_async()


# Choose your method:
# Starts the server when this cell is executed as the main module (the recorded
# output below shows the background-thread start message). Exactly one of the
# three launch methods should be active; the other two stay commented out.
if __name__ == "__main__":
    # Method 1: Thread-based (recommended for Jupyter/Colab)
    server_thread = run_server_in_thread()

    # Method 2: Alternative with ThreadPoolExecutor
    # server_future = run_server_with_executor()

    # Method 3: For direct execution in Jupyter cell, run this instead:
    # await start_server_jupyter()

Server is running in background thread...


INFO:     Started server process [515]
INFO:     Waiting for application startup.
INFO:     Application startup complete.


In [9]:
from crewai import Crew, Process, Task, Agent
from crewai_tools import MCPServerAdapter
from transformers import pipeline
import torch
import os
from typing import Type, List, Optional
from pydantic import BaseModel, Field
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

# ✅ Load HuggingFace model
# NOTE(review): the recorded output of this cell ends in torch.OutOfMemoryError
# while loading this checkpoint on a GPU reported as 14.74 GiB, with ~14.5 GiB
# already in use by the process. Confirm the runtime has enough free GPU memory
# (restart the kernel before re-running) or switch to a smaller model.
model_id = "openai/gpt-oss-20b"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype="auto",   # let transformers pick the dtype
    device_map="auto",    # let transformers place the weights on available devices
)

# ✅ Simple wrapper so CrewAI can call HuggingFace pipeline like an LLM
class HFLLM:
    """Callable adapter around a HuggingFace text-generation pipeline.

    Calling the instance with a prompt runs the pipeline and returns the
    ``"generated_text"`` of the first result.

    NOTE(review): CrewAI presumably expects its own LLM interface rather than
    a plain callable — confirm that passing this object as ``llm=`` works.
    """

    # Generation length used when the caller does not specify one.
    DEFAULT_MAX_NEW_TOKENS = 256

    def __init__(self, pipe):
        # pipe: any callable taking (prompt, **kwargs) and returning a list of
        # dicts that contain a "generated_text" key.
        self.pipe = pipe

    def __call__(self, prompt: str, **kwargs):
        """Generate text for ``prompt``; extra kwargs are forwarded to the pipeline."""
        # Apply 256 as a default only. Passing max_new_tokens both explicitly
        # and via **kwargs raised "got multiple values for keyword argument".
        kwargs.setdefault("max_new_tokens", self.DEFAULT_MAX_NEW_TOKENS)
        outputs = self.pipe(prompt, **kwargs)
        return outputs[0]["generated_text"]

# Initialize the HuggingFace LLM
# Single shared wrapper instance; every Agent below receives it as llm=.
hf_llm = HFLLM(pipe)

# ✅ Connect to your MCP server (ensure your medical-data-server is running)
# Connection settings handed to MCPServerAdapter: the SSE endpoint of the
# medical-data-server (your server with patient tools) and the transport to use.
server_params = dict(
    url="http://localhost:5000/sse",
    transport="sse",
)

with MCPServerAdapter(server_params) as mcp_tools:
    print(f"Available tools: {[tool.name for tool in mcp_tools]}")

    # ----------------- Agent Definitions -----------------

    # Keyword arguments shared by every tool-using specialist agent.
    specialist_settings = dict(tools=mcp_tools, llm=hf_llm, verbose=True)

    # Finds records through the search tool.
    search_agent = Agent(
        role="Patient Data Search Agent",
        goal=(
            "Find patient records by ReportId, CPT, or content using MCP tools. "
            "Never guess column names or make assumptions about data structure."
        ),
        backstory=(
            "You are a medical data retrieval specialist who excels at finding "
            "specific patient information using precise search tools. "
            "You strictly use MCP tools like search_reports to locate data."
        ),
        **specialist_settings,
    )

    # Removes reports after verifying they exist.
    delete_agent = Agent(
        role="Patient Data Delete Agent",
        goal=(
            "Safely delete patient reports after proper verification. "
            "Always confirm ReportId exists before deletion."
        ),
        backstory=(
            "You are a cautious data management specialist focused on secure deletion. "
            "You never delete records without explicit confirmation and proper verification. "
            "You use MCP tools to ensure data integrity during deletion operations."
        ),
        **specialist_settings,
    )

    # Writes treatment plans onto existing reports.
    treatment_agent = Agent(
        role="Treatment Planning Agent",
        goal=(
            "Add, update, and manage treatment plans in patient records. "
            "Use MCP tools to store treatment data without guessing patient IDs."
        ),
        backstory=(
            "You are a clinical workflow specialist who maintains accurate treatment "
            "documentation. You ensure all treatment plans are properly linked to "
            "patient records using verified identifiers through MCP tools."
        ),
        **specialist_settings,
    )

    # Inserts new reports.
    create_agent = Agent(
        role="Report Creation Agent",
        goal=(
            "Insert new patient reports into the system with proper validation. "
            "Ensure ReportId uniqueness and data integrity."
        ),
        backstory=(
            "You are a medical records specialist who creates comprehensive patient "
            "reports. You validate all data fields and ensure unique identifiers "
            "before adding new records using MCP tools."
        ),
        **specialist_settings,
    )

    # Produces statistical summaries.
    analytics_agent = Agent(
        role="Medical Data Analytics Agent",
        goal=(
            "Analyze patient data trends, generate insights, and create statistical summaries. "
            "Use MCP tools for data analysis and never invent numbers."
        ),
        backstory=(
            "You are a healthcare data analyst specializing in medical outcomes and "
            "patient care metrics. You use statistical methods and MCP tools to "
            "provide actionable insights for clinical decision-making."
        ),
        **specialist_settings,
    )

    # Produces charts.
    visualization_agent = Agent(
        role="Medical Data Visualization Agent",
        goal=(
            "Create clear, accurate visualizations of patient data and medical trends. "
            "Generate charts that support clinical decision-making."
        ),
        backstory=(
            "You specialize in medical data visualization with expertise in creating "
            "charts that healthcare professionals can easily interpret. "
            "You use MCP tools to generate accurate visual representations."
        ),
        **specialist_settings,
    )

    # Coordinator for the hierarchical process; it is given no tools of its own.
    manager_agent = Agent(
        role="Medical Data Manager",
        goal="Coordinate all agents to manage patient data operations efficiently",
        backstory=(
            "You oversee the entire medical data workflow, ensuring proper task "
            "delegation, data integrity, and compliance with healthcare standards. "
            "You coordinate between specialized agents for optimal results."
        ),
        verbose=True,
        llm=hf_llm,
    )

    # ----------------- Task Definitions -----------------

    # One Task per specialist agent. Each pairs a natural-language instruction
    # (description) with the shape of the answer expected back (expected_output).
    #
    # NOTE(review): none of these descriptions contains a {placeholder}, so the
    # values passed to crew.kickoff(inputs=...) further down are presumably
    # never substituted into the prompts — confirm against CrewAI's
    # input-interpolation behaviour before relying on "the user's query".

    # Look up reports with the search_reports tool.
    search_task = Task(
        description=(
            "Given the user's search query, use the search_reports MCP tool to find "
            "relevant patient records. Search by ReportId, CPT code, or content keywords. "
            "Return structured results with all relevant patient information."
        ),
        expected_output="A structured list of patient reports in JSON format with ReportId, CPT, and content.",
        agent=search_agent
    )

    # Verify, then delete, a report with the delete_report tool.
    delete_task = Task(
        description=(
            "Given a ReportId, verify the record exists using search tools, then "
            "safely delete the patient report using the delete_report MCP tool. "
            "Provide confirmation of deletion or error if record not found."
        ),
        expected_output="Confirmation message stating successful deletion or error details.",
        agent=delete_agent
    )

    # Store or update a treatment plan with the add_treatment_plan tool.
    treatment_task = Task(
        description=(
            "Given patient information and treatment details, use the add_treatment_plan "
            "MCP tool to store or update treatment plans. Verify patient exists before "
            "adding treatment information."
        ),
        expected_output="Confirmation message of successful treatment plan addition/update.",
        agent=treatment_agent
    )

    # Create a new report with the add_new_report tool.
    create_task = Task(
        description=(
            "Given new patient report data, use the add_new_report MCP tool to create "
            "a new patient record. Ensure ReportId is unique and all required fields "
            "are properly formatted before insertion."
        ),
        expected_output="Confirmation message with new ReportId and creation status.",
        agent=create_agent
    )

    # Summarise trends and metrics.
    analytics_task = Task(
        description=(
            "Analyze patient data patterns, calculate metrics, and generate insights "
            "using available MCP analytical tools. Focus on trends, outcomes, and "
            "statistical summaries relevant to the user's query."
        ),
        expected_output="A concise analytical summary with key metrics and insights.",
        agent=analytics_agent
    )

    # Produce a chart (or a description of one).
    visualization_task = Task(
        description=(
            "Create visualizations of patient data using MCP tools. Generate charts "
            "that clearly display trends, distributions, or comparisons based on "
            "the user's visualization request."
        ),
        expected_output="Path to generated visualization file or chart description.",
        agent=visualization_agent
    )

    # meta_agent was referenced by meta_task and by the Crew definition below
    # but never defined, so this cell raised NameError before any task could
    # run. Define it here, configured like the other tool-using agents.
    meta_agent = Agent(
        role="Medical Data Metadata Agent",
        goal=(
            "Report the schema, metadata, and structure of the patient dataset. "
            "Use MCP tools to inspect the data and never guess column names."
        ),
        backstory=(
            "You are a data steward who documents the structure of medical datasets. "
            "You rely on MCP tools to describe columns, data types, and sample values accurately."
        ),
        tools=mcp_tools,
        llm=hf_llm,
        verbose=True
    )

    # Describe the dataset's columns, types and sample values.
    meta_task = Task(
        description=(
            "Retrieve and return dataframe schema, metadata, and structure information "
            "using MCP tools. Provide column names, data types, and sample values "
            "for data validation and reference."
        ),
        expected_output="Well-structured schema and metadata information.",
        agent=meta_agent
    )

    # ----------------- Crew Configuration -----------------

    # Specialist agents; the manager is supplied separately via manager_agent=.
    crew_agents = [
        search_agent,
        delete_agent,
        treatment_agent,
        create_agent,
        analytics_agent,
        visualization_agent,
        meta_agent,
    ]

    # Tasks in the order they are listed for the crew.
    crew_tasks = [
        search_task,
        delete_task,
        treatment_task,
        create_task,
        analytics_task,
        visualization_task,
        meta_task,
    ]

    # Hierarchical crew with memory enabled, coordinated by manager_agent.
    crew = Crew(
        manager_agent=manager_agent,
        process=Process.hierarchical,
        memory=True,
        tasks=crew_tasks,
        agents=crew_agents,
    )

    # ----------------- Example Usage -----------------

    # Every kickoff passes the same five keys; the ones an example does not
    # need stay empty strings.
    blank_inputs = {
        "query": "",
        "report_id": "",
        "cpt": "",
        "content": "",
        "treatment_plan": "",
    }

    # (heading printed before the result, inputs that differ from blank_inputs)
    # NOTE(review): each kickoff presumably runs the crew with all of its
    # configured tasks, delete_task included — confirm that is intended.
    examples = [
        # Example 1: Search for patient records
        ("Search Results:", {"query": "Find reports mentioning diabetes"}),
        # Example 2: Create new patient report
        ("Creation Results:", {
            "query": "Create new patient report",
            "report_id": "2025001",
            "cpt": "99214",
            "content": "Patient presents with symptoms of hypertension and elevated blood pressure.",
            "treatment_plan": "Prescribe ACE inhibitor, recommend low-sodium diet, schedule follow-up in 2 weeks.",
        }),
        # Example 3: Add treatment plan
        ("Treatment Plan Results:", {
            "query": "Add treatment plan for existing patient",
            "report_id": "2001",
            "treatment_plan": "Continue current medication, add daily exercise routine, monitor glucose levels.",
        }),
        # Example 4: Analytics request
        ("Analytics Results:", {"query": "Analyze patient demographics and common conditions"}),
    ]

    for heading, overrides in examples:
        result = crew.kickoff(inputs={**blank_inputs, **overrides})

        print(heading)
        print(result)

Fetching 40 files:   0%|          | 0/40 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Fetching 40 files:   0%|          | 0/40 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Fetching 40 files:   0%|          | 0/40 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Fetching 40 files:   0%|          | 0/40 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

ValueError: Could not load model openai/gpt-oss-20b with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'>, <class 'transformers.models.auto.modeling_tf_auto.TFAutoModelForCausalLM'>, <class 'transformers.models.gpt_oss.modeling_gpt_oss.GptOssForCausalLM'>). See the original errors:

while loading with AutoModelForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/transformers/pipelines/base.py", line 292, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/auto/auto_factory.py", line 600, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 317, in _wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 5074, in from_pretrained
    ) = cls._load_pretrained_model(
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 5537, in _load_pretrained_model
    _error_msgs, disk_offload_index, cpu_offload_index = load_shard_file(args)
                                                         ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 975, in load_shard_file
    disk_offload_index, cpu_offload_index = _load_state_dict_into_meta_model(
                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 883, in _load_state_dict_into_meta_model
    hf_quantizer.create_quantized_param(
  File "/usr/local/lib/python3.12/dist-packages/transformers/quantizers/quantizer_mxfp4.py", line 249, in create_quantized_param
    load_and_swizzle_mxfp4(
  File "/usr/local/lib/python3.12/dist-packages/transformers/integrations/mxfp4.py", line 391, in load_and_swizzle_mxfp4
    blocks = blocks.to(target_device)
             ^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 254.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 200.12 MiB is free. Process 4930 has 14.54 GiB memory in use. Of the allocated memory 14.28 GiB is allocated by PyTorch, and 171.40 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/transformers/pipelines/base.py", line 310, in infer_framework_load_model
    model = model_class.from_pretrained(model, **fp32_kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/auto/auto_factory.py", line 600, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 317, in _wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 5074, in from_pretrained
    ) = cls._load_pretrained_model(
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 5537, in _load_pretrained_model
    _error_msgs, disk_offload_index, cpu_offload_index = load_shard_file(args)
                                                         ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 975, in load_shard_file
    disk_offload_index, cpu_offload_index = _load_state_dict_into_meta_model(
                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 883, in _load_state_dict_into_meta_model
    hf_quantizer.create_quantized_param(
  File "/usr/local/lib/python3.12/dist-packages/transformers/quantizers/quantizer_mxfp4.py", line 249, in create_quantized_param
    load_and_swizzle_mxfp4(
  File "/usr/local/lib/python3.12/dist-packages/transformers/integrations/mxfp4.py", line 391, in load_and_swizzle_mxfp4
    blocks = blocks.to(target_device)
             ^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 254.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 72.12 MiB is free. Process 4930 has 14.67 GiB memory in use. Of the allocated memory 14.41 GiB is allocated by PyTorch, and 164.93 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

while loading with TFAutoModelForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/transformers/pipelines/base.py", line 292, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/auto/auto_factory.py", line 603, in from_pretrained
    raise ValueError(
ValueError: Unrecognized configuration class <class 'transformers.models.gpt_oss.configuration_gpt_oss.GptOssConfig'> for this kind of AutoModel: TFAutoModelForCausalLM.
Model type should be one of BertConfig, CamembertConfig, CTRLConfig, GPT2Config, GPT2Config, GPTJConfig, MistralConfig, OpenAIGPTConfig, OPTConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoFormerConfig, TransfoXLConfig, XGLMConfig, XLMConfig, XLMRobertaConfig, XLNetConfig.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/transformers/pipelines/base.py", line 310, in infer_framework_load_model
    model = model_class.from_pretrained(model, **fp32_kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/auto/auto_factory.py", line 603, in from_pretrained
    raise ValueError(
ValueError: Unrecognized configuration class <class 'transformers.models.gpt_oss.configuration_gpt_oss.GptOssConfig'> for this kind of AutoModel: TFAutoModelForCausalLM.
Model type should be one of BertConfig, CamembertConfig, CTRLConfig, GPT2Config, GPT2Config, GPTJConfig, MistralConfig, OpenAIGPTConfig, OPTConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoFormerConfig, TransfoXLConfig, XGLMConfig, XLMConfig, XLMRobertaConfig, XLNetConfig.

while loading with GptOssForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/transformers/pipelines/base.py", line 292, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 317, in _wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 5074, in from_pretrained
    ) = cls._load_pretrained_model(
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 5537, in _load_pretrained_model
    _error_msgs, disk_offload_index, cpu_offload_index = load_shard_file(args)
                                                         ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 975, in load_shard_file
    disk_offload_index, cpu_offload_index = _load_state_dict_into_meta_model(
                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 883, in _load_state_dict_into_meta_model
    hf_quantizer.create_quantized_param(
  File "/usr/local/lib/python3.12/dist-packages/transformers/quantizers/quantizer_mxfp4.py", line 249, in create_quantized_param
    load_and_swizzle_mxfp4(
  File "/usr/local/lib/python3.12/dist-packages/transformers/integrations/mxfp4.py", line 391, in load_and_swizzle_mxfp4
    blocks = blocks.to(target_device)
             ^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 72.12 MiB is free. Process 4930 has 14.67 GiB memory in use. Of the allocated memory 14.41 GiB is allocated by PyTorch, and 164.93 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/transformers/pipelines/base.py", line 310, in infer_framework_load_model
    model = model_class.from_pretrained(model, **fp32_kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 317, in _wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 5074, in from_pretrained
    ) = cls._load_pretrained_model(
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 5537, in _load_pretrained_model
    _error_msgs, disk_offload_index, cpu_offload_index = load_shard_file(args)
                                                         ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 975, in load_shard_file
    disk_offload_index, cpu_offload_index = _load_state_dict_into_meta_model(
                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 883, in _load_state_dict_into_meta_model
    hf_quantizer.create_quantized_param(
  File "/usr/local/lib/python3.12/dist-packages/transformers/quantizers/quantizer_mxfp4.py", line 249, in create_quantized_param
    load_and_swizzle_mxfp4(
  File "/usr/local/lib/python3.12/dist-packages/transformers/integrations/mxfp4.py", line 391, in load_and_swizzle_mxfp4
    blocks = blocks.to(target_device)
             ^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 72.12 MiB is free. Process 4930 has 14.67 GiB memory in use. Of the allocated memory 14.41 GiB is allocated by PyTorch, and 164.93 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


