In [4]:
import requests

# Search for members of a category
def get_category_members(category):
    """Return one batch of members of a Wikipedia category.

    Args:
        category: Category name without the ``Category:`` prefix, e.g.
            ``"Role-playing_games"``.

    Returns:
        The ``query.categorymembers`` list from the API response. Only one
        request is made; continuation is not followed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response has no ``query`` / ``categorymembers`` entry.
    """
    url = "https://en.wikipedia.org/w/api.php"
    # Send the query as ``params`` so requests URL-encodes the category name.
    # Splicing it into the URL broke on names containing '&', '#' or '+'.
    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': f'Category:{category}',
        'format': 'json'
    }
    response = requests.get(url, params=params)
    response.raise_for_status()
    data = response.json()
    return data['query']['categorymembers']

# Get categories of an article
def get_article_categories(article):
    """Return the categories listed for a Wikipedia article.

    Args:
        article: Article title, e.g. ``"Python (programming language)"``.

    Returns:
        The ``categories`` list of the first page in the response, or ``[]``
        when that page entry has no ``categories`` key. Only one request is
        made; continuation is not followed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response has no ``query`` / ``pages`` entry.
    """
    url = "https://en.wikipedia.org/w/api.php"
    # Send the title as ``params`` so requests URL-encodes it; a title such as
    # "AT&T" would otherwise be split into two query parameters.
    params = {
        'action': 'query',
        'prop': 'categories',
        'titles': article,
        'format': 'json'
    }
    response = requests.get(url, params=params)
    response.raise_for_status()
    pages = response.json()['query']['pages']
    # ``pages`` is keyed by page id; a single title yields a single entry.
    page = next(iter(pages.values()))
    return page.get('categories', [])

# Get table of contents (sections) of an article
def get_table_of_contents(article):
    """Return the section list (table of contents) of a Wikipedia article.

    Args:
        article: Article title, e.g. ``"Python (programming language)"``.

    Returns:
        The ``parse.sections`` list from the API response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response has no ``parse`` / ``sections`` entry.
    """
    url = "https://en.wikipedia.org/w/api.php"
    # Send the title as ``params`` so requests URL-encodes it instead of
    # relying on the raw title being safe inside a query string.
    params = {
        'action': 'parse',
        'page': article,
        'prop': 'sections',
        'format': 'json'
    }
    response = requests.get(url, params=params)
    response.raise_for_status()
    data = response.json()
    return data['parse']['sections']

# Search for lists and timelines
def search_lists_and_timelines(search_term):
    """Run a Wikipedia full-text search and return the result entries.

    The search term is sent unchanged; nothing here restricts the results to
    lists or timelines, so callers put those words in ``search_term``.

    Args:
        search_term: Text to search for.

    Returns:
        The ``query.search`` list from the API response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response has no ``query`` / ``search`` entry.
    """
    url = "https://en.wikipedia.org/w/api.php"
    # Send the term as ``params`` so requests URL-encodes it; a term containing
    # '&' or '#' would otherwise corrupt the query string.
    params = {
        'action': 'query',
        'list': 'search',
        'srsearch': search_term,
        'format': 'json'
    }
    response = requests.get(url, params=params)
    response.raise_for_status()
    data = response.json()
    return data['query']['search']


In [30]:
# Call each helper once; every result is bound to a notebook-level name.
category_members = get_category_members("Role-playing_games")
article_categories = get_article_categories("Python (programming language)")
table_of_contents = get_table_of_contents("Python (programming language)")
lists_and_timelines = search_lists_and_timelines("list of role-playing games")

In [37]:
# Display the final element of the section list.
table_of_contents[-1]

{'toclevel': 1,
 'level': '2',
 'line': 'External links',
 'number': '17',
 'index': '29',
 'fromtitle': 'Python_(programming_language)',
 'byteoffset': 164045,
 'anchor': 'External_links',
 'linkAnchor': 'External_links'}

In [5]:
# Fetch category members, following continuation tokens across requests
# max_members caps how many members are returned
def get_category_members(category, max_members=1000, limit_per_request=500):
    """Collect members of a Wikipedia category, following continuation.

    Args:
        category: Category name without the ``Category:`` prefix.
        max_members: Upper bound on the number of members returned.
        limit_per_request: Largest batch size to ask the API for in a single
            request (sent as ``cmlimit``).

    Returns:
        A list of at most ``max_members`` entries taken from
        ``query.categorymembers``, in the order the API returned them.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a response has no ``query`` / ``categorymembers`` entry.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': f'Category:{category}',
        'format': 'json'
    }

    members = []
    while len(members) < max_members:
        remaining = max_members - len(members)
        # Ask only for what is still needed, so a small ``max_members`` does
        # not download a full batch just to throw most of it away. Non-integer
        # limits (e.g. the API keyword 'max') are passed through untouched.
        if isinstance(limit_per_request, int) and isinstance(remaining, int):
            params['cmlimit'] = min(limit_per_request, remaining)
        else:
            params['cmlimit'] = limit_per_request

        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        members.extend(data['query']['categorymembers'])

        # No continuation token means the category is exhausted.
        if 'continue' not in data:
            break
        params['cmcontinue'] = data['continue']['cmcontinue']

    # The last batch may overshoot when the limit could not be capped.
    return members[:max_members]

In [9]:
# Fetch at most five members of the "Machine_learning" category.
category = "Machine_learning"
members = get_category_members(category, max_members=5)

In [10]:
# Display the members fetched in the previous cell.
members

[{'pageid': 67911196, 'ns': 0, 'title': 'Bayesian learning mechanisms'},
 {'pageid': 233488, 'ns': 0, 'title': 'Machine learning'},
 {'pageid': 53587467, 'ns': 0, 'title': 'Outline of machine learning'},
 {'pageid': 64439717, 'ns': 0, 'title': '80 Million Tiny Images'},
 {'pageid': 75530149, 'ns': 0, 'title': 'Accelerated Linear Algebra'}]

In [14]:
# extract infobox
import requests
from bs4 import BeautifulSoup

def get_infobox(article):
    """Extract the label/value pairs of an article's infobox.

    The article is rendered through the ``parse`` API action and the first
    ``<table>`` carrying the ``infobox`` class is read row by row.

    Args:
        article: Article title.

    Returns:
        A dict mapping each row's ``<th>`` text to its ``<td>`` text, for rows
        that contain both cells; ``None`` when no infobox table is found.
    """
    api_response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            'action': 'parse',
            'page': article,
            'prop': 'text',
            'format': 'json'
        },
    )
    page_html = api_response.json()['parse']['text']['*']

    infobox_table = BeautifulSoup(page_html, 'html.parser').find('table', {'class': 'infobox'})
    if not infobox_table:
        return None

    fields = {}
    for table_row in infobox_table.find_all('tr'):
        label_cell = table_row.find('th')
        value_cell = table_row.find('td')
        # Rows lacking either cell carry no label/value pair and are skipped.
        if not (label_cell and value_cell):
            continue
        label = label_cell.get_text(" ", strip=True)
        fields[label] = value_cell.get_text(" ", strip=True)
    return fields

# Example usage: fetch the infobox of one article and print it as "key: value" lines.
article = "Python (programming language)"
infobox_data = get_infobox(article)
# Emit a blank line before the listing.
print()
# get_infobox returns None when the page has no infobox table.
if infobox_data:
    for key, value in infobox_data.items():
        print(f"{key}: {value}")
else:
    print("No infobox found.")


{'Paradigm': 'Multi-paradigm : object-oriented , [1] procedural ( imperative ), functional , structured , reflective', 'Designed\xa0by': 'Guido van Rossum', 'Developer': 'Python Software Foundation', 'First\xa0appeared': '20\xa0February 1991 ; 33 years ago ( 1991-02-20 ) [2]', 'Stable release': '3.12.4 / 6 June 2024 ; 14 days ago ( 6 June 2024 )', 'Typing discipline': 'duck , dynamic , strong ; [3] optional type annotations (since 3.5, but those hints are ignored, except with unofficial tools) [4]', 'OS': 'Tier 1 : 64-bit Linux , macOS ; 64- and 32-bit Windows 10+ [5] Tier 2 : E.g. 32-bit WebAssembly (WASI) Tier 3 : 64-bit FreeBSD , iOS ; e.g. Raspberry Pi OS Unofficial (or has been known to work): Other Unix-like / BSD variants and e.g. Android 5.0+ (official from Python 3.13 planned [6] ) and a few other platforms [7] [8] [9]', 'License': 'Python Software Foundation License', 'Filename extensions': '.py, .pyw, .pyz, [10] .pyi, .pyc, .pyd', 'Website': 'python.org'}
Paradigm: Multi-par

In [44]:
def search_lists_and_timelines(search_term, limit=20):
    """Run a Wikipedia full-text search and return the result entries.

    The search term is sent unchanged; nothing here restricts the results to
    lists or timelines, so callers put those words in ``search_term``.

    Args:
        search_term: Text to search for.
        limit: Maximum number of results to request (sent as ``srlimit``).
            Defaults to 20, the value that was previously hard-coded.

    Returns:
        The ``query.search`` list from the API response, or ``[]`` when the
        response has no ``query`` entry.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'list': 'search',
        'srsearch': search_term,
        'format': 'json',
        'srlimit': limit
    }
    response = requests.get(url, params=params)
    data = response.json()
    # Deliberately lenient: a response without 'query' yields no results
    # instead of raising.
    return data['query']['search'] if 'query' in data else []

# Example usage: run one search and print each hit's title and snippet.
search_term = "Python (programming language)"
results = search_lists_and_timelines(search_term)
for result in results:
    # The snippet is printed exactly as returned by the API.
    print(f"Title: {result['title']}\nSnippet: {result['snippet']}\n")

Title: Python (programming language)
Snippet: <span class="searchmatch">Python</span> is a high-level, general-purpose <span class="searchmatch">programming</span> <span class="searchmatch">language</span>. Its design philosophy emphasizes code readability with the use of significant indentation

Title: History of Python
Snippet: The <span class="searchmatch">programming</span> <span class="searchmatch">language</span> <span class="searchmatch">Python</span> was conceived in the late 1980s, and its implementation was started in December 1989 by Guido van Rossum at CWI in the

Title: Mojo (programming language)
Snippet: Mojo is a <span class="searchmatch">programming</span> <span class="searchmatch">language</span> in the <span class="searchmatch">Python</span> family that is currently under development. It is available both in browsers via Jupyter notebooks, and locally

Title: Python syntax and semantics
Snippet: The syntax of the <span class="searchmatch">Python</span> <span class="s

In [20]:
import requests
from bs4 import BeautifulSoup
import json

def get_all_wikipedia_tables(article_title):
    """Extract every ``wikitable`` of an article as column-oriented data.

    The article is rendered through the ``parse`` API action. For each
    ``<table>`` carrying the ``wikitable`` class, column names are read from
    the first row that contains ``<th>`` cells, and every later row whose cell
    count equals the number of column names contributes one value per column.

    Args:
        article_title: Article title.

    Returns:
        A list with one dict per table:
        ``{'table_index': int, 'table_data': {column_name: [cell_text, ...]}}``.
        The list is empty when the page has no matching table.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response has no ``parse`` / ``text`` entry.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': article_title,
        'prop': 'text',
        'format': 'json'
    }

    response = requests.get(url, params=params)
    response.raise_for_status()
    data = response.json()
    raw_html = data['parse']['text']['*']

    soup = BeautifulSoup(raw_html, 'html.parser')
    tables = soup.find_all('table', {'class': 'wikitable'})

    all_tables_data = []
    for table_index, table in enumerate(tables):
        rows = table.find_all('tr')

        # Take column names from the header row only. Collecting every <th> in
        # the table also picked up row-header cells, which made the header
        # count exceed the per-row cell count so that no data row ever matched.
        header_position = next(
            (position for position, row in enumerate(rows) if row.find('th') is not None),
            None,
        )
        if header_position is None:
            headers = []
            data_rows = rows[1:]
        else:
            header_cells = rows[header_position].find_all('th')
            headers = [cell.get_text(strip=True) for cell in header_cells]
            data_rows = rows[header_position + 1:]

        table_dict = {header: [] for header in headers}

        for row in data_rows:
            cells = row.find_all(['td', 'th'])
            # Rows with a different cell count (e.g. spanning cells) are skipped.
            if len(cells) == len(headers):
                for header, cell in zip(headers, cells):
                    table_dict[header].append(cell.get_text(strip=True))

        all_tables_data.append({
            'table_index': table_index,
            'table_data': table_dict
        })

    return all_tables_data

# Example usage: extract every wikitable of one article and print each result.
article = "Python (programming language)"
table_data = get_all_wikipedia_tables(article)
if table_data:
    # Each element is one table's dict ('table_index' plus 'table_data'), printed whole.
    for row in table_data:
        print(row)
else:
    print("No table found.")

{'table_index': 0, 'table_data': {'Type': ['bool', 'bytearray', 'bytes', 'complex', 'dict', 'types.EllipsisType', 'float', 'frozenset', 'int', 'list', 'types.NoneType', 'types.NotImplementedType', 'range', 'set', 'str', 'tuple'], 'Mutability': ['immutable', 'mutable', 'immutable', 'immutable', 'mutable', 'immutable', 'immutable', 'immutable', 'immutable', 'mutable', 'immutable', 'immutable', 'immutable', 'mutable', 'immutable', 'immutable'], 'Description': ['Boolean value', 'Sequence ofbytes', 'Sequence of bytes', 'Complex numberwith real and imaginary parts', 'Associative array(or dictionary) of key and value pairs; can contain mixed types (keys and values), keys must be a hashable type', 'Anellipsisplaceholder to be used as an index inNumPyarrays', 'Double-precisionfloating-point number. The precision is machine-dependent but in practice is generally implemented as a 64-bitIEEE\xa0754number with 53\xa0bits of precision.[118]', 'Unorderedset, contains no duplicates; can contain mixed 

In [48]:
# Get the pages that the article links to (its outgoing links)
def get_page_links(title):
    """Return every link listed for a Wikipedia page (``prop=links``).

    Args:
        title: Page title.

    Returns:
        A list of link entries (dicts with ``ns`` and ``title``) gathered from
        all result batches; ``[]`` when the page entry has no ``links`` key.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a response has no ``query`` / ``pages`` entry.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'prop': 'links',
        'titles': title,
        'format': 'json',
        'pllimit': 'max'
    }

    links = []
    while True:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        # ``pages`` is keyed by page id; a single title yields a single entry.
        page = next(iter(data['query']['pages'].values()))
        links.extend(page.get('links', []))

        # A single request returns one batch only. Keep going while the API
        # hands back continuation values, otherwise long link lists are
        # silently truncated.
        if 'continue' not in data:
            break
        params.update(data['continue'])

    return links

# Fetch the links of one article and display the resulting list.
article = "Python (programming language)"
links = get_page_links(article)
links

[{'ns': 0, 'title': '"Hello, World!" program'},
 {'ns': 0, 'title': '3ds Max'},
 {'ns': 0, 'title': '?:'},
 {'ns': 0, 'title': 'ABC (programming language)'},
 {'ns': 0, 'title': 'ADMB'},
 {'ns': 0, 'title': 'ALGOL'},
 {'ns': 0, 'title': 'ALGOL 68'},
 {'ns': 0, 'title': 'APL (programming language)'},
 {'ns': 0, 'title': 'ATmega'},
 {'ns': 0, 'title': 'AVR microcontrollers'},
 {'ns': 0, 'title': 'Abaqus'},
 {'ns': 0, 'title': 'Academic Free License'},
 {'ns': 0, 'title': 'Academic conference'},
 {'ns': 0, 'title': 'Action selection'},
 {'ns': 0, 'title': 'Activation function'},
 {'ns': 0, 'title': 'Ada (programming language)'},
 {'ns': 0, 'title': 'Advanced Simulation Library'},
 {'ns': 0, 'title': 'Adversarial machine learning'},
 {'ns': 0, 'title': 'AlexNet'},
 {'ns': 0, 'title': 'Alex Graves (computer scientist)'},
 {'ns': 0, 'title': 'Alex Martelli'},
 {'ns': 0, 'title': 'Algebra'},
 {'ns': 0, 'title': 'AlphaFold'},
 {'ns': 0, 'title': 'AlphaGo'},
 {'ns': 0, 'title': 'AlphaZero'},
 {

In [53]:
# Extract each image used on a page together with its URL and caption.

def get_page_images_with_captions(title):
    """List the images used on a page with their URL and description.

    Args:
        title: Page title.

    Returns:
        A list of dicts with keys ``'title'`` (the file title), ``'url'``
        (``''`` when the API gives none) and ``'caption'`` (the
        ``ImageDescription`` value from the file's extended metadata, or
        ``'No caption available'``). Images whose lookup returns no
        ``imageinfo`` are omitted.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a response has no ``query`` / ``pages`` entry.
    """
    url = "https://en.wikipedia.org/w/api.php"

    # Step 1: Get the list of images. One request returns a single batch, so
    # ask for the largest batch and follow continuation values; otherwise
    # pages with many images are silently truncated.
    params = {
        'action': 'query',
        'prop': 'images',
        'titles': title,
        'imlimit': 'max',
        'format': 'json'
    }
    images = []
    while True:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        page = next(iter(data['query']['pages'].values()))
        images.extend(page.get('images', []))
        if 'continue' not in data:
            break
        params.update(data['continue'])

    # Step 2: Get details for each image (one request per file).
    image_details = []
    for image in images:
        image_title = image['title']
        params = {
            'action': 'query',
            'titles': image_title,
            'prop': 'imageinfo',
            'iiprop': 'url|extmetadata',
            'format': 'json'
        }
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        image_page = next(iter(data['query']['pages'].values()))
        if 'imageinfo' in image_page:
            image_info = image_page['imageinfo'][0]
            image_url = image_info.get('url', '')
            extmetadata = image_info.get('extmetadata', {})
            # The description value is returned as-is and may contain HTML markup.
            caption = extmetadata.get('ImageDescription', {}).get('value', 'No caption available')
            image_details.append({
                'title': image_title,
                'url': image_url,
                'caption': caption
            })

    return image_details

# Example usage: list the images of one article with their URL and caption.
title = "Python (programming language)"
images_with_captions = get_page_images_with_captions(title)
for img in images_with_captions:
    print(f"Title: {img['title']}")
    print(f"URL: {img['url']}")
    print(f"Caption: {img['caption']}")
    # Blank line between entries.
    print()

Title: File:Commons-logo.svg
URL: https://upload.wikimedia.org/wikipedia/en/4/4a/Commons-logo.svg
Caption: The Wikimedia Commons logo, SVG version.

Title: File:Free and open-source software logo (2009).svg
URL: https://upload.wikimedia.org/wikipedia/commons/3/31/Free_and_open-source_software_logo_%282009%29.svg
Caption: <a href="https://en.wikipedia.org/wiki/FOSS" class="extiw" title="w:FOSS">FOSS</a> logo created in inkscape consisting of a teal colored green square. Text set in Gentium Italic.

Title: File:Guido van Rossum OSCON 2006 cropped.png
URL: https://upload.wikimedia.org/wikipedia/commons/9/94/Guido_van_Rossum_OSCON_2006_cropped.png
Caption: <a href="https://nl.wikipedia.org/wiki/Guido_van_Rossum" class="extiw" title="nl:Guido van Rossum">Guido van Rossum</a> op OSCON 2006

Title: File:OOjs UI icon edit-ltr-progressive.svg
URL: https://upload.wikimedia.org/wikipedia/en/8/8a/OOjs_UI_icon_edit-ltr-progressive.svg
Caption: <p>English: <span lang="en">An icon from the OOjs UI Me