In [1]:
import requests
import os
import time

# Step 1: Get up to thousands of contributions by a user (with pagination)
def get_user_contributions(username, max_articles=2000):
    """Collect a user's contributions from the English Wikipedia API.

    Pages through ``list=usercontribs`` until ``max_articles`` entries have
    been gathered or the API stops returning a ``uccontinue`` token.

    Args:
        username: Value sent to the API as ``ucuser``.
        max_articles: Upper bound on the number of entries returned.

    Returns:
        A list of at most ``max_articles`` contribution dicts exactly as the
        API returned them. If a request or response parse fails, the error is
        printed and whatever was collected so far is returned (best effort).
    """
    print(f"[+] Getting contributions for: {username}")
    url = "https://en.wikipedia.org/w/api.php"
    contributions = []
    continuation = {}

    while len(contributions) < max_articles:
        remaining = max_articles - len(contributions)
        params = {
            "action": "query",
            "list": "usercontribs",
            "ucuser": username,
            # Ask only for what is still needed, capped at the 500-row page
            # size this function has always used.
            "uclimit": min(500, remaining),
            "format": "json",
        }
        # Send back every continuation field from the previous response,
        # not just "uccontinue", so the API can resume exactly where it
        # left off.
        params.update(continuation)

        try:
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
            contributions.extend(data.get("query", {}).get("usercontribs", []))
            continuation = data.get("continue", {})
        except Exception as e:
            # Deliberate best effort: report and keep what we already have.
            print(f"[!] Error fetching contributions for {username}: {e}")
            break

        if not continuation.get("uccontinue"):
            break

    return contributions[:max_articles]

# Step 2: Filter political articles with improved keywords
def filter_political_articles(contributions):
    """Return the distinct titles that contain a politics-related keyword.

    Matching is a case-insensitive substring test against a fixed keyword
    list. Titles are returned with their original casing, de-duplicated, in
    the order they first appear in ``contributions`` so repeated runs over
    the same input yield the same list.

    Args:
        contributions: Iterable of dicts; each is looked up by the key
            ``"title"``. Entries with a missing or ``None`` title are skipped.

    Returns:
        A list of unique matching titles in first-seen order.
    """
    keywords = (
        "politic", "election", "government", "president", "minister",
        "diplomacy", "parliament", "policy", "senate", "democracy", "congress",
        "revolution", "campaign", "voting", "legislation", "foreign relations",
    )
    # A dict keeps insertion order while still discarding duplicates.
    matched = {}
    for contrib in contributions:
        title = contrib.get("title") or ""
        lowered = title.lower()
        if any(keyword in lowered for keyword in keywords):
            matched[title] = None
    return list(matched)

# Step 3: Get article content
def get_article_content(title):
    """Fetch the text extract of one article from the English Wikipedia API.

    Issues a ``prop=extracts`` query with ``explaintext`` set for ``title``
    and reads the ``extract`` field of the first page in the response.

    Args:
        title: Article title sent to the API as ``titles``.

    Returns:
        The extract string, or ``""`` when the page has no ``extract`` field
        or when anything in the request/parse path raises (the failure is
        printed rather than propagated).
    """
    endpoint = "https://en.wikipedia.org/w/api.php"
    query = dict(
        action="query",
        prop="extracts",
        explaintext=True,
        titles=title,
        format="json",
    )
    try:
        reply = requests.get(endpoint, params=query, timeout=10)
        reply.raise_for_status()
        page_map = reply.json().get("query", {}).get("pages", {})
        # Only one title was requested, so take the first page entry.
        first_page = next(iter(page_map.values()))
        return first_page.get("extract", "")
    except Exception as exc:
        print(f"[!] Failed to fetch article: {title} – {exc}")
    return ""

# Step 4: Save text files
def save_article(title, content, folder):
    """Write an article's text to ``<folder>/<title>.txt``.

    Nothing is written (and ``folder`` is not created) when ``content`` is
    blank or shorter than 300 characters. Forward and back slashes in
    ``title`` are replaced with underscores to form the file name.

    Args:
        title: Article title; used for the file name and the progress line.
        content: Article text to store, written as UTF-8.
        folder: Destination directory, created on demand.
    """
    has_text = bool(content.strip())
    if not (has_text and len(content) >= 300):
        return

    os.makedirs(folder, exist_ok=True)
    # Path separators in a title would otherwise be read as sub-directories.
    slash_to_underscore = str.maketrans({"/": "_", "\\": "_"})
    filename = title.translate(slash_to_underscore) + ".txt"
    with open(os.path.join(folder, filename), "w", encoding="utf-8") as out:
        out.write(content)
    print(f"  → Saved: {title}")

# Step 5: Process one user
def process_user(username, save_folder, max_articles=500):
    """Download and store the politics-related articles one user has edited.

    Fetches the user's contributions, filters them by title keyword, and
    saves each matching article's text under
    ``<save_folder>/<username with spaces as underscores>/``. Articles whose
    file already exists in that folder are skipped, so an interrupted run
    can be resumed.

    Args:
        username: Wikipedia user name to process.
        save_folder: Root directory for the dataset.
        max_articles: Maximum number of contributions to inspect.
    """
    contributions = get_user_contributions(username, max_articles)
    political_titles = filter_political_articles(contributions)

    print(f"  → {len(political_titles)} political articles found.")
    user_folder = os.path.join(save_folder, username.replace(" ", "_"))

    # Recover the sanitized titles of files already on disk. Only the
    # trailing ".txt" is stripped, so a title that itself contains ".txt"
    # is not mangled.
    extension = ".txt"
    saved_titles = set()
    if os.path.exists(user_folder):
        for name in os.listdir(user_folder):
            if name.endswith(extension):
                name = name[:-len(extension)]
            saved_titles.add(name)

    for title in political_titles:
        # Must mirror the file-name sanitization in save_article (both "/"
        # and "\\" become "_"); otherwise titles with a backslash are never
        # recognised as saved and get re-downloaded on every run.
        safe_title = title.replace("/", "_").replace("\\", "_")
        if safe_title in saved_titles:
            continue
        content = get_article_content(title)
        save_article(title, content, user_folder)
        # Pause between requests to avoid hammering the API.
        time.sleep(1)

# Step 6: Run for many users
# Accounts whose contribution history is harvested, one per line.
usernames = [
    "Brianmc",
    "Koavf",
    "Ser Amantio di Nicolao",
    "Fuzheado",
    "SimonP",
    "TParis",
    "MBisanz",
    "Piotrus",
    "Future Perfect at Sunrise",
    "Drmies",
    "Ritchie333",
    "Sphilbrick",
    "Magicpiano",
    # NOTE(review): unlike the other entries this one carries a "User:"
    # prefix — confirm the API accepts it as a user name.
    "User:Russavia",
    "Cirt",
]

# Process the accounts one after another; each gets its own sub-folder of
# the dataset directory and up to 1500 contributions are inspected.
for user in usernames:
    print(f"\n--- Processing {user} ---")
    process_user(user, "wiki_politics_dataset", 1500)



--- Processing Brianmc ---
[+] Getting contributions for: Brianmc
  → 0 political articles found.

--- Processing Koavf ---
[+] Getting contributions for: Koavf
  → 6 political articles found.
  → Saved: John McGuire (Virginia politician)
  → Saved: Talk:Flag of the president of the United States

--- Processing Ser Amantio di Nicolao ---
[+] Getting contributions for: Ser Amantio di Nicolao
  → 8 political articles found.
  → Saved: Max Miller (politician)
  → Saved: Aisha Mohammed (Ethiopian politician)
  → Saved: Ernst Fraenkel (political scientist)

--- Processing Fuzheado ---
[+] Getting contributions for: Fuzheado
  → 9 political articles found.
  → Saved: 1936 United States presidential election in Connecticut
  → Saved: List of forms of government
  → Saved: 1986 California gubernatorial election
  → Saved: Green Revolution
  → Saved: 2017 German federal election
  → Saved: World Governments Summit
  → Saved: 2021 German federal election
  → Saved: Political party
  → Saved: P