In [2]:
import requests
import json
from bs4 import BeautifulSoup
from typing import Any

from agentscope.service.service_response import (
    ServiceResponse,
    ServiceExecStatus,
)
from agentscope.utils.common import requests_get

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def get_category_members(
    entity: str, 
    max_members: int=1000, 
    limit_per_request: int=500
    ) -> ServiceResponse:
    """Function to retrieve category members from Wikipedia:Category pages
    
    Args:
        entity (str): searching keywords 
        max_members (int): maximum number of members to output
        limit_per_request (int): number of members retrieved per quest
    
    Returns:
        `ServiceResponse`: A dictionary containing `status` and `content`.
        The `status` attribute is from the ServiceExecStatus enum,
        indicating success or error.
        The `content` field is a list of dicts if successful,
        or an error message if failed.
        Keys of each dict:
            
            "pageid": unique page ID for the member
            
            "ns": namespace for the member, indicating if the corresponding page is Article/User/... See https://en.wikipedia.org/wiki/Wikipedia:Namespace for details.
            
            "title": title of the member
            
        Example:
        
        .. code-block:: python
            members = get_category_members(entity="Machine_learning", max_members=5)
            print(members)
        
        It returns contents:

        .. code-block:: python
            {
                'status': <ServiceExecStatus.SUCCESS: 1>,
                'content': [{'pageid': 67911196, 'ns': 0, 'title': 'Bayesian learning mechanisms'},
                            {'pageid': 233488, 'ns': 0, 'title': 'Machine learning'},
                            {'pageid': 53587467, 'ns': 0, 'title': 'Outline of machine learning'},
                            {'pageid': 64439717, 'ns': 0, 'title': '80 Million Tiny Images'},
                            {'pageid': 75530149, 'ns': 0, 'title': 'Accelerated Linear Algebra'}]
            
            }
    
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': f'Category:{entity}',
        'cmlimit': limit_per_request,  # Maximum number of results per request
        'format': 'json'
    }

    members = []
    total_fetched = 0

    while total_fetched < max_members:
        # response = requests.get(url, params=params)
        # data = response.json()
        # data = requests_get(url, params=params)
        data = {'batchcomplete': '', 'continue': {'cmcontinue': 'page|040306485a503846443a302a32011001c4dc0d|23329', 'continue': '-||'}, 'query': {'categorymembers': [{'pageid': 13213953, 'ns': 0, 'title': 'List of pythonid species and subspecies'}]}}
        print(data)
        input(111)
        batch_members = data['query']['categorymembers']
        
        members.extend(batch_members)
        total_fetched += len(batch_members)

        # Check if there is a continuation token
        if 'continue' in data and total_fetched < max_members:
            params['cmcontinue'] = data['continue']['cmcontinue']
        else:
            break
    
    # If more members were fetched than max_members, trim the list
    if len(members) > max_members:
        members = members[:max_members]
    
    if len(members) > 0:
        return ServiceResponse(ServiceExecStatus.SUCCESS, members)
    else:
        return ServiceResponse(ServiceExecStatus.ERROR, members)
    
members = get_category_members(entity="Pythonidae", max_members=1)
print(members)

{'batchcomplete': '', 'continue': {'cmcontinue': 'page|040306485a503846443a302a32011001c4dc0d|23329', 'continue': '-||'}, 'query': {'categorymembers': [{'pageid': 13213953, 'ns': 0, 'title': 'List of pythonid species and subspecies'}]}}
{'status': <ServiceExecStatus.SUCCESS: 1>, 'content': [{'pageid': 13213953, 'ns': 0, 'title': 'List of pythonid species and subspecies'}]}


In [29]:
def get_infobox(
    entity: str
    ) -> ServiceResponse:
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': entity,
        'prop': 'text',
        'format': 'json'
    }

    # response = requests.get(url, params=params)
    # data = response.json()
    data = requests_get(url, params=params)
    # data = {
    #     'parse': {
    #         'title': 'Python (programming language)', 
    #         'pageid': 23862, 
    #         'text': {
    #             '*': '<table class="infobox vevent"><caption class="infobox-title summary">Python</caption><tbody><tr><td class="infobox-image" colspan="2"><span typeof="mw:File"><a class="mw-file-description" href="/wiki/File:Python-logo-notext.svg"><img class="mw-file-element" data-file-height="126" data-file-width="115" decoding="async" height="133" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Python-logo-notext.svg/121px-Python-logo-notext.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Python-logo-notext.svg/182px-Python-logo-notext.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Python-logo-notext.svg/242px-Python-logo-notext.svg.png 2x" width="121"/></a></span></td></tr><tr><th class="infobox-label" scope="row"><a href="/wiki/Programming_paradigm" title="Programming paradigm">Paradigm</a></th><td class="infobox-data"><a class="mw-redirect" href="/wiki/Multi-paradigm_programming_language" title="Multi-paradigm programming language">Multi-paradigm</a>: <a href="/wiki/Object-oriented_programming" title="Object-oriented programming">object-oriented</a>,<sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup> <a href="/wiki/Procedural_programming" title="Procedural programming">procedural</a> (<a href="/wiki/Imperative_programming" title="Imperative programming">imperative</a>), <a href="/wiki/Functional_programming" title="Functional programming">functional</a>, <a href="/wiki/Structured_programming" title="Structured programming">structured</a>, <a href="/wiki/Reflective_programming" title="Reflective programming">reflective</a></td></tr><tr><th class="infobox-label" scope="row"><a href="/wiki/Software_design" title="Software design">Designed by</a></th><td class="infobox-data"><a href="/wiki/Guido_van_Rossum" title="Guido van Rossum">Guido van Rossum</a></td></tr><tr><th class="infobox-label" scope="row"><a class="mw-redirect" href="/wiki/Software_developer" title="Software developer">Developer</a></th><td class="infobox-data organiser"><a href="/wiki/Python_Software_Foundation" title="Python Software Foundation">Python Software Foundation</a></td></tr><tr><th class="infobox-label" scope="row">First appeared</th><td class="infobox-data">20 February 1991<span class="noprint">; 33 years ago</span><span style="display:none"> (<span class="bday dtstart published updated">1991-02-20</span>)</span><sup class="reference" id="cite_ref-alt-sources-history_2-0"><a href="#cite_note-alt-sources-history-2">[2]</a></sup></td></tr><tr><td class="infobox-full-data" colspan="2"><link href="mw-data:TemplateStyles:r1229112069" rel="mw-deduplicated-inline-style"/></td></tr><tr><th class="infobox-label" scope="row" style="white-space: nowrap;"><a href="/wiki/Software_release_life_cycle" title="Software release life cycle">Stable release</a></th><td class="infobox-data"><div style="margin:0px;">3.12.4 <span class="mw-valign-text-top" typeof="mw:File/Frameless"><a href="https://www.wikidata.org/wiki/Q28865?uselang=en#P348" title="Edit this on Wikidata"><img alt="Edit this on Wikidata" class="mw-file-element" data-file-height="20" data-file-width="20" decoding="async" height="10" src="//upload.wikimedia.org/wikipedia/en/thumb/8/8a/OOjs_UI_icon_edit-ltr-progressive.svg/10px-OOjs_UI_icon_edit-ltr-progressive.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/8/8a/OOjs_UI_icon_edit-ltr-progressive.svg/15px-OOjs_UI_icon_edit-ltr-progressive.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/8/8a/OOjs_UI_icon_edit-ltr-progressive.svg/20px-OOjs_UI_icon_edit-ltr-progressive.svg.png 2x" width="10"/></a></span>  / 6 June 2024<span class="noprint">; 14 days ago</span><span style="display:none"> (<span class="bday dtstart published updated">6 June 2024</span>)</span></div></td></tr><tr style="display:none"><td colspan="2">\n</td></tr><tr><th class="infobox-label" scope="row"><a href="/wiki/Type_system" title="Type system">Typing discipline</a></th><td class="infobox-data"><a href="/wiki/Duck_typing" title="Duck typing">duck</a>, <a class="mw-redirect" href="/wiki/Dynamic_typing" title="Dynamic typing">dynamic</a>, <a href="/wiki/Strong_and_weak_typing" title="Strong and weak typing">strong</a>;<sup class="reference" id="cite_ref-3"><a href="#cite_note-3">[3]</a></sup> <a class="mw-redirect" href="/wiki/Optional_typing" title="Optional typing">optional type annotations</a> (since 3.5, but those hints are ignored, except with unofficial tools)<sup class="reference" id="cite_ref-type_hint-PEP_4-0"><a href="#cite_note-type_hint-PEP-4">[4]</a></sup></td></tr><tr><th class="infobox-label" scope="row"><a href="/wiki/Operating_system" title="Operating system">OS</a></th><td class="infobox-data"><b>Tier 1</b>: 64-bit <a href="/wiki/Linux" title="Linux">Linux</a>, <a href="/wiki/MacOS" title="MacOS">macOS</a>; 64- and 32-bit <a class="mw-redirect" href="/wiki/Windows" title="Windows">Windows</a> 10+<sup class="reference" id="cite_ref-5"><a href="#cite_note-5">[5]</a></sup><br/><b>Tier 2</b>: E.g. 32-bit <a href="/wiki/WebAssembly" title="WebAssembly">WebAssembly</a> (WASI) <b>Tier 3</b>: 64-bit <a href="/wiki/FreeBSD" title="FreeBSD">FreeBSD</a>, <a href="/wiki/IOS" title="IOS">iOS</a>; e.g. <a href="/wiki/Raspberry_Pi_OS" title="Raspberry Pi OS">Raspberry Pi OS</a><br/>Unofficial (or has been known to work): Other <a href="/wiki/Unix-like" title="Unix-like">Unix-like</a>/<a href="/wiki/Berkeley_Software_Distribution" title="Berkeley Software Distribution">BSD</a> variants and e.g. <a href="/wiki/Android_(operating_system)" title="Android (operating system)">Android</a> 5.0+ (official from Python 3.13 planned<sup class="reference" id="cite_ref-6"><a href="#cite_note-6">[6]</a></sup>) and a few other platforms<sup class="reference" id="cite_ref-7"><a href="#cite_note-7">[7]</a></sup><sup class="reference" id="cite_ref-8"><a href="#cite_note-8">[8]</a></sup><sup class="reference" id="cite_ref-9"><a href="#cite_note-9">[9]</a></sup></td></tr><tr><th class="infobox-label" scope="row"><a href="/wiki/Software_license" title="Software license">License</a></th><td class="infobox-data"><a href="/wiki/Python_Software_Foundation_License" title="Python Software Foundation License">Python Software Foundation License</a></td></tr><tr><th class="infobox-label" scope="row"><a href="/wiki/Filename_extension" title="Filename extension">Filename extensions</a></th><td class="infobox-data">.py, .pyw, .pyz,<sup class="reference" id="cite_ref-10"><a href="#cite_note-10">[10]</a></sup><br/>\n.pyi, .pyc, .pyd</td></tr><tr><th class="infobox-label" scope="row">Website</th><td class="infobox-data"><span class="url"><a class="external text" href="https://www.python.org/" rel="nofollow">python.org</a></span></td></tr><tr><th class="infobox-header" colspan="2" style="background-color: #eee;">Major <a href="/wiki/Programming_language_implementation" title="Programming language implementation">implementations</a></th></tr><tr><td class="infobox-full-data" colspan="2"><a href="/wiki/CPython" title="CPython">CPython</a>, <a href="/wiki/PyPy" title="PyPy">PyPy</a>, <a href="/wiki/Stackless_Python" title="Stackless Python">Stackless Python</a>, <a href="/wiki/MicroPython" title="MicroPython">MicroPython</a>, <a href="/wiki/CircuitPython" title="CircuitPython">CircuitPython</a>, <a href="/wiki/IronPython" title="IronPython">IronPython</a>, <a href="/wiki/Jython" title="Jython">Jython</a></td></tr><tr><th class="infobox-header" colspan="2" style="background-color: #eee;"><a href="/wiki/Programming_language#Dialects,_flavors_and_implementations" title="Programming language">Dialects</a></th></tr><tr><td class="infobox-full-data" colspan="2"><a href="/wiki/Cython" title="Cython">Cython</a>, <a href="/wiki/PyPy#RPython" title="PyPy">RPython</a>, <a href="/wiki/Bazel_(software)" title="Bazel (software)">Starlark</a><sup class="reference" id="cite_ref-11"><a href="#cite_note-11">[11]</a></sup></td></tr><tr><th class="infobox-header" colspan="2" style="background-color: #eee;">Influenced by</th></tr><tr><td class="infobox-full-data" colspan="2"><a href="/wiki/ABC_(programming_language)" title="ABC (programming language)">ABC</a>,<sup class="reference" id="cite_ref-faq-created_12-0"><a href="#cite_note-faq-created-12">[12]</a></sup> <a href="/wiki/Ada_(programming_language)" title="Ada (programming language)">Ada</a>,<sup class="reference" id="cite_ref-13"><a href="#cite_note-13">[13]</a></sup> <a href="/wiki/ALGOL_68" title="ALGOL 68">ALGOL 68</a>,<sup class="reference" id="cite_ref-98-interview_14-0"><a href="#cite_note-98-interview-14">[14]</a></sup> <br/><a href="/wiki/APL_(programming_language)" title="APL (programming language)">APL</a>,<sup class="reference" id="cite_ref-python.org_15-0"><a href="#cite_note-python.org-15">[15]</a></sup> <a href="/wiki/C_(programming_language)" title="C (programming language)">C</a>,<sup class="reference" id="cite_ref-AutoNT-1_16-0"><a href="#cite_note-AutoNT-1-16">[16]</a></sup> <a href="/wiki/C%2B%2B" title="C++">C++</a>,<sup class="reference" id="cite_ref-classmix_17-0"><a href="#cite_note-classmix-17">[17]</a></sup> <a href="/wiki/CLU_(programming_language)" title="CLU (programming language)">CLU</a>,<sup class="reference" id="cite_ref-effbot-call-by-object_18-0"><a href="#cite_note-effbot-call-by-object-18">[18]</a></sup> <a href="/wiki/Dylan_(programming_language)" title="Dylan (programming language)">Dylan</a>,<sup class="reference" id="cite_ref-AutoNT-2_19-0"><a href="#cite_note-AutoNT-2-19">[19]</a></sup> <br/><a href="/wiki/Haskell" title="Haskell">Haskell</a>,<sup class="reference" id="cite_ref-AutoNT-3_20-0"><a href="#cite_note-AutoNT-3-20">[20]</a></sup><sup class="reference" id="cite_ref-python.org_15-1"><a href="#cite_note-python.org-15">[15]</a></sup> <a href="/wiki/Icon_(programming_language)" title="Icon (programming language)">Icon</a>,<sup class="reference" id="cite_ref-AutoNT-4_21-0"><a href="#cite_note-AutoNT-4-21">[21]</a></sup> <a href="/wiki/Lisp_(programming_language)" title="Lisp (programming language)">Lisp</a>,<sup class="reference" id="cite_ref-AutoNT-6_22-0"><a href="#cite_note-AutoNT-6-22">[22]</a></sup> <span class="nowrap"><br/><a href="/wiki/Modula-3" title="Modula-3">Modula-3</a></span>,<sup class="reference" id="cite_ref-98-interview_14-1"><a href="#cite_note-98-interview-14">[14]</a></sup><sup class="reference" id="cite_ref-classmix_17-1"><a href="#cite_note-classmix-17">[17]</a></sup> <a href="/wiki/Perl" title="Perl">Perl</a>,<sup class="reference" id="cite_ref-23"><a href="#cite_note-23">[23]</a></sup> <a href="/wiki/Standard_ML" title="Standard ML">Standard ML</a><sup class="reference" id="cite_ref-python.org_15-2"><a href="#cite_note-python.org-15">[15]</a></sup></td></tr><tr><th class="infobox-header" colspan="2" style="background-color: #eee;">Influenced</th></tr><tr><td class="infobox-full-data" colspan="2"><a href="/wiki/Apache_Groovy" title="Apache Groovy">Apache Groovy</a>, <a href="/wiki/Boo_(programming_language)" title="Boo (programming language)">Boo</a>, <a href="/wiki/Cobra_(programming_language)" title="Cobra (programming language)">Cobra</a>, <a href="/wiki/CoffeeScript" title="CoffeeScript">CoffeeScript</a>,<sup class="reference" id="cite_ref-24"><a href="#cite_note-24">[24]</a></sup> <a href="/wiki/D_(programming_language)" title="D (programming language)">D</a>, <a href="/wiki/F_Sharp_(programming_language)" title="F Sharp (programming language)">F#</a>, <a href="/wiki/Godot_(game_engine)#GDScript" title="Godot (game engine)">GDScript</a>, <a href="/wiki/Genie_(programming_language)" title="Genie (programming language)">Genie</a>,<sup class="reference" id="cite_ref-25"><a href="#cite_note-25">[25]</a></sup> <a href="/wiki/Go_(programming_language)" title="Go (programming language)">Go</a>, <a href="/wiki/JavaScript" title="JavaScript">JavaScript</a>,<sup class="reference" id="cite_ref-26"><a href="#cite_note-26">[26]</a></sup><sup class="reference" id="cite_ref-27"><a href="#cite_note-27">[27]</a></sup> <a href="/wiki/Julia_(programming_language)" title="Julia (programming language)">Julia</a>,<sup class="reference" id="cite_ref-Julia_28-0"><a href="#cite_note-Julia-28">[28]</a></sup> <a href="/wiki/Mojo_(programming_language)" title="Mojo (programming language)">Mojo</a>,<sup class="reference" id="cite_ref-Mojo_29-0"><a href="#cite_note-Mojo-29">[29]</a></sup> <a href="/wiki/Nim_(programming_language)" title="Nim (programming language)">Nim</a>, <a href="/wiki/Ring_(programming_language)" title="Ring (programming language)">Ring</a>,<sup class="reference" id="cite_ref-The_Ring_programming_language_and_other_languages_30-0"><a href="#cite_note-The_Ring_programming_language_and_other_languages-30">[30]</a></sup> <a href="/wiki/Ruby_(programming_language)" title="Ruby (programming language)">Ruby</a>,<sup class="reference" id="cite_ref-bini_31-0"><a href="#cite_note-bini-31">[31]</a></sup> <a href="/wiki/Swift_(programming_language)" title="Swift (programming language)">Swift</a><sup class="reference" id="cite_ref-lattner2014_32-0"><a href="#cite_note-lattner2014-32">[32]</a></sup></td></tr><tr><td class="infobox-below hlist" colspan="2" style="border-top: 1px solid #aaa; padding-top: 3px;">\n<ul><li><span class="noviewer" typeof="mw:File"><a class="mw-file-description" href="/wiki/File:Wikibooks-logo-en-noslogan.svg"><img alt="" class="mw-file-element" data-file-height="400" data-file-width="400" decoding="async" height="16" src="//upload.wikimedia.org/wikipedia/commons/thumb/d/df/Wikibooks-logo-en-noslogan.svg/16px-Wikibooks-logo-en-noslogan.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/d/df/Wikibooks-logo-en-noslogan.svg/24px-Wikibooks-logo-en-noslogan.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/d/df/Wikibooks-logo-en-noslogan.svg/32px-Wikibooks-logo-en-noslogan.svg.png 2x" width="16"/></a></span> <a class="extiw" href="https://en.wikibooks.org/wiki/Python_Programming" title="wikibooks:Python Programming">Python Programming</a> at Wikibooks</li></ul>\n</td></tr></tbody></table>\n'
    #             }
    #         }
    #     }
    
    data = {
        'parse': {
            'title': 'Test', 
            'pageid': 20, 
            'text': { '*':"""
                     <table class="infobox vevent">
                     <table>
                     <tr>
                     <th>Column1</th>
                     <td>Data1</td>
                     </tr>
                     <tr>
                     <th>Column2</th>
                     <td>Data2</td>
                     </tr>
                     </table>
                     </div>
                     """
            }
        }
    }


    print(f"data:{data}")
    

    raw_html = data['parse']['text']['*']

    soup = BeautifulSoup(raw_html, 'html.parser')
    # print(f"soup:{soup}")
    infobox = soup.find('table', {'class': 'infobox'})
    # print(f"infobox:{infobox}")

    if not infobox:
        return ServiceResponse(ServiceExecStatus.ERROR, None)

    infobox_data = {}
    for row in infobox.find_all('tr'):
        header = row.find('th')
        value = row.find('td')
        if header and value:
            key = header.get_text(" ", strip=True)
            val = value.get_text(" ", strip=True)
            infobox_data[key] = val

    return ServiceResponse(ServiceExecStatus.SUCCESS, infobox_data)

infobox_data = get_infobox(entity="Python (programming language)")
print(infobox_data)

data:{'parse': {'title': 'Test', 'pageid': 20, 'text': {'*': '\n                     <table class="infobox vevent">\n                     <table>\n                     <tr>\n                     <th>Column1</th>\n                     <td>Data1</td>\n                     </tr>\n                     <tr>\n                     <th>Column2</th>\n                     <td>Data2</td>\n                     </tr>\n                     </table>\n                     </div>\n                     '}}}
{'status': <ServiceExecStatus.SUCCESS: 1>, 'content': {'Column1': 'Data1', 'Column2': 'Data2'}}


In [16]:
import requests
import re

def get_page_content_by_paragraph(entity, max_paragraphs=None):
    """
    Retrieve content from a Wikipedia page and split it into paragraphs,
    excluding section headers.

    Parameters:
    - title (str): The title of the Wikipedia page.
    - max_paragraphs (int, optional): The maximum number of paragraphs to retrieve. Default is None (retrieve all paragraphs).

    Returns:
    - list: A list of paragraphs from the Wikipedia page.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'prop': 'extracts',
        'explaintext': True,
        'titles': entity,
        'format': 'json'
    }

    # response = requests.get(url, params=params)
    # data = response.json()
    # data = requests_get(url, params=params)
    data = {
        'query': {
            'pages': {
                '20': {
                    'pageid': 20, 
                    'ns': 0, 
                    'title': 'Test', 
                    'extract': 'This is a test \n\n This is the second test'
                }
            }
        }
    }
    print(data['query']['pages'].keys())
    print(f"data:{data}")
    
    page = next(iter(data['query']['pages'].values()))
    content = page.get('extract', 'No content found.')
    if content=='No content found.':
        return ServiceResponse(ServiceExecStatus.ERROR, content)
    
    # Split content into paragraphs and filter out headers
    paragraphs = [para for para in content.split('\n\n') if not re.match(r'^\s*==.*==\s*$', para) and para.strip() != '']
    
    # Return the specified number of paragraphs
    if max_paragraphs:
        paragraphs = paragraphs[:max_paragraphs]
    
    return ServiceResponse(ServiceExecStatus.SUCCESS, paragraphs)

# Example usage
title = "Python (programming language)"
paragraphs = get_page_content_by_paragraph(entity=title, max_paragraphs=2)
print(paragraphs)

dict_keys(['20'])
data:{'query': {'pages': {'20': {'pageid': 20, 'ns': 0, 'title': 'Test', 'extract': 'This is a test \n\n This is the second test'}}}}
{'status': <ServiceExecStatus.SUCCESS: 1>, 'content': ['This is a test ', ' This is the second test']}


In [30]:
def get_all_wikipedia_tables(entity):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': entity,
        'prop': 'text',
        'format': 'json'
    }

    # response = requests.get(url, params=params)
    # data = response.json()
    # data = requests_get(url, params=params)
    data = {
        'parse': {
            'title': 'Test', 
            'pageid': 20, 
            'text': {
                '*':"""<table class="wikitable">
        <tr><th>Header 1</th><th>Header 2</th></tr>
        <tr><td>Data 1</td><td>Data 2</td></tr>
    </table>"""
            }
        }
    }
    
    print(f"data:{data}")
    raw_html = data['parse']['text']['*']

    soup = BeautifulSoup(raw_html, 'html.parser')
    tables = soup.find_all('table', {'class': 'wikitable'})
    
    if not tables:
        return ServiceResponse(ServiceExecStatus.ERROR, None)

    all_tables_data = []
    for table_index, table in enumerate(tables):
        headers = [header.get_text(strip=True) for header in table.find_all('th')]
        table_dict = {header: [] for header in headers}

        for row in table.find_all('tr')[1:]:  # Skip the header row
            cells = row.find_all(['td', 'th'])
            if len(cells) == len(headers):  # Ensure the row has the correct number of cells
                for i, cell in enumerate(cells):
                    table_dict[headers[i]].append(cell.get_text(strip=True))
        
        all_tables_data.append(table_dict)

    return ServiceResponse(ServiceExecStatus.SUCCESS, all_tables_data)

# Example usage
article = "Python (programming language)"
table_data = get_all_wikipedia_tables(article)
print(table_data)

data:{'parse': {'title': 'Test', 'pageid': 20, 'text': {'*': '<table class="wikitable">\n        <tr><th>Header 1</th><th>Header 2</th></tr>\n        <tr><td>Data 1</td><td>Data 2</td></tr>\n    </table>'}}}
{'status': <ServiceExecStatus.SUCCESS: 1>, 'content': [{'Header 1': ['Data 1'], 'Header 2': ['Data 2']}]}


In [34]:
def get_page_images_with_captions(entity):
    url = "https://en.wikipedia.org/w/api.php"
    
    # Step 1: Get the list of images
    params = {
        'action': 'query',
        'prop': 'images',
        'titles': entity,
        'format': 'json'
    }
    # response = requests.get(url, params=params)
    # data = response.json()
    # data = requests_get(url, params=params)
    data = {
        'query': {
            'pages': {
                '20': {
                    'pageid': 20, 
                    'ns': 0, 
                    'title': 'Python (programming language)', 
                    'images': [
                        {
                            'ns': 6, 
                            'title': 'File:Commons-logo.svg'}
                    ]
                }
            }
        }
    }
    print(f"data:{data}")
    page = next(iter(data['query']['pages'].values()))
    images = page.get('images', [])
    if len(images)==0:
        return ServiceResponse(ServiceExecStatus.ERROR, None)

    # Step 2: Get details for each image
    image_details = []
    for image in images:
        image_title = image['title']
        params = {
            'action': 'query',
            'titles': image_title,
            'prop': 'imageinfo',
            'iiprop': 'url|extmetadata',
            'format': 'json'
        }
        response = requests.get(url, params=params)
        data = response.json()
        image_page = next(iter(data['query']['pages'].values()))
        if 'imageinfo' in image_page:
            image_info = image_page['imageinfo'][0]
            image_url = image_info.get('url', '')
            extmetadata = image_info.get('extmetadata', {})
            caption = extmetadata.get('ImageDescription', {}).get('value', 'No caption available')
            image_details.append({
                'title': image_title,
                'url': image_url,
                'caption': caption
            })
    
    return ServiceResponse(ServiceExecStatus.SUCCESS, image_details)

# Example usage
title = "Python (programming language)"
images_with_captions = get_page_images_with_captions(title)
images_with_captions

data:{'query': {'pages': {'20': {'pageid': 20, 'ns': 0, 'title': 'Python (programming language)', 'images': [{'ns': 6, 'title': 'File:Commons-logo.svg'}]}}}}


{'status': <ServiceExecStatus.SUCCESS: 1>,
 'content': [{'title': 'File:Commons-logo.svg',
   'url': 'https://upload.wikimedia.org/wikipedia/en/4/4a/Commons-logo.svg',
   'caption': 'The Wikimedia Commons logo, SVG version.'}]}

In [None]:
def bing_search(
    question: str,
    api_key: str,
    num_results: int = 10,
    **kwargs: Any,
) -> ServiceResponse:
    """
    Search question in Bing Search API and return the searching results

    Args:
        question (`str`):
            The search query string.
        api_key (`str`):
            The API key provided for authenticating with the Bing Search API.
        num_results (`int`, defaults to `10`):
            The number of search results to return.
        **kwargs (`Any`):
            Additional keyword arguments to be included in the search query.
            For more details, please refer to
            https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/query-parameters

    Returns:
        `ServiceResponse`: A dictionary with two variables: `status` and
        `content`. The `status` variable is from the ServiceExecStatus enum,
        and `content` is a list of search results or error information,
        which depends on the `status` variable.
        For each searching result, it is a dictionary with keys 'title',
        'link', and 'snippet'.

    Example:
        .. code-block:: python

            results = bing_search(question="What is an agent?",
                                 bing_api_key="your bing api key",
                                 num_results=2,
                                 mkt="en-US")
            print(results)

        It returns the following dict.

        .. code-block:: python

            {
                'status': <ServiceExecStatus.SUCCESS: 1>,
                'content': [
                    {
                        'title': 'What Is an Agent? Definition, Types of
                            Agents, and Examples - Investopedia',
                        'link':
                        'https://www.investopedia.com/terms/a/agent.asp',
                        'snippet': "An agent is someone that is given
                            permission (either explicitly or assumed) to act
                            on an individual's behalf and may do so in a
                            variety of capacities. This could include
                            selling a home, executing..."},
                    {
                        'title': 'AGENT Definition & Usage Examples |
                                    Dictionary.com',
                        'link': 'https://www.dictionary.com/browse/agent',
                        'snippet': 'noun. a person who acts on behalf of
                            another person, group, business, government,
                            etc; representative. a person or thing that acts
                            or has the power to act. a phenomenon,
                            substance, or organism that exerts some force or
                            effect: a chemical agent.'
                    }
                ]
            }
    """

    # Bing Search API endpoint
    bing_search_url = "https://api.bing.microsoft.com/v7.0/search"

    params = {"q": question, "count": num_results}
    if kwargs:
        params.update(**kwargs)

    headers = {"Ocp-Apim-Subscription-Key": api_key}

    search_results = requests_get(
        bing_search_url,
        params,
        headers,
    )

    if isinstance(search_results, str):
        return ServiceResponse(ServiceExecStatus.ERROR, search_results)

    # Retrieve the top search result links
    results = search_results.get("webPages", {}).get("value", [])
    print(results)
    exit(0)

    # Return all snippet
    return ServiceResponse(
        ServiceExecStatus.SUCCESS,
        [
            # We changed the keywords to be consistent with the
            # Google search results format.
            {
                "title": result["name"],
                "link": result["url"],
                "snippet": result["snippet"],
            }
            for result in results
        ],
    )
    
    bing_search()

In [1]:
import json
from agentscope.service.web.wiki import (
        get_category_members,
        get_infobox,
        get_page_content_by_paragraph,
        get_all_wikipedia_tables,
        get_page_images_with_captions,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
get_page_images_with_captions("Python (programming language)")

{'status': <ServiceExecStatus.SUCCESS: 1>,
 'content': [{'title': 'File:Commons-logo.svg',
   'url': 'https://upload.wikimedia.org/wikipedia/en/4/4a/Commons-logo.svg',
   'caption': 'No caption available'},
  {'title': 'File:Free and open-source software logo (2009).svg',
   'url': 'https://upload.wikimedia.org/wikipedia/commons/3/31/Free_and_open-source_software_logo_%282009%29.svg',
   'caption': '<a href="https://en.wikipedia.org/wiki/FOSS" class="extiw" title="w:FOSS">FOSS</a> logo created in inkscape consisting of a teal colored green square. Text set in Gentium Italic.'},
  {'title': 'File:Guido van Rossum OSCON 2006 cropped.png',
   'url': 'https://upload.wikimedia.org/wikipedia/commons/9/94/Guido_van_Rossum_OSCON_2006_cropped.png',
   'caption': '<a href="https://nl.wikipedia.org/wiki/Guido_van_Rossum" class="extiw" title="nl:Guido van Rossum">Guido van Rossum</a> op OSCON 2006'},
  {'title': 'File:OOjs UI icon edit-ltr-progressive.svg',
   'url': 'https://upload.wikimedia.org/

In [26]:
def wiki_api(params: dict) -> dict:
    """Scratch information via Wiki API"""
    url = "https://en.wikipedia.org/w/api.php"
    return requests_get(url, params=params)


entity= ''
search_params = {
    "action": "query",
    "list": "search",
    "srsearch": entity,
    "format": "json",
}

search_data = wiki_api(search_params)


In [27]:
search_data

{'error': {'code': 'missingparam',
  'info': 'The "srsearch" parameter must be set.',
  '*': 'See https://en.wikipedia.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/postorius/lists/mediawiki-api-announce.lists.wikimedia.org/&gt; for notice of API deprecations and breaking changes.'},
 'servedby': 'mw-api-ext.eqiad.main-7894c678c7-7lrdp'}