In [3]:
import time
import re
import mwclient
import pandas as pd
import json

# Step 1: Identify all categories that I will be using.

There are 6 importance subcategories: Top, High, Mid, Low, NA, and Unknown.

# Step 2: Get all talk pages for one category.

In [4]:
# Identify ourselves to the API and open a client for English Wikipedia.
user_agent = 'User-Agent (contact: zz102@wellesley.edu)'
site = mwclient.Site('en.wikipedia.org', clients_useragent=user_agent)

# Access the category; iterating over it yields its member pages.
category = site.categories['High-importance_Health_and_fitness_articles']

# Collect the full title of every member page.
talkPages = [member.name for member in category]
len(talkPages)

111

In [5]:
# Preview the first three collected titles.
talkPages[:3]

['Talk:996 working hour system',
 'Talk:Active mobility',
 'Talk:Aerobic conditioning']

There are 111 high-importance articles.

In [6]:
# Same client settings, pointed at a second category for comparison.
user_agent = 'User-Agent (contact: zz102@wellesley.edu)'
site = mwclient.Site('en.wikipedia.org', clients_useragent=user_agent)

# Access the category; iterating over it yields its member pages.
category = site.categories['Low-importance_Health_and_fitness_articles']

# These are titles of category members, not cleaned article titles, so they
# get their own name instead of sharing `articleNames` with the cell that
# builds the real article list.
lowImportanceTalkPages = [member.name for member in category]
len(lowImportanceTalkPages)

1976

There are 1976 low-importance articles.

# Step 3: For each talk page, navigate to the actual page and save the page titles.

In [7]:
articleNames = []
for talk_title in talkPages[:3]:
    talk_page = site.Pages[talk_title]

    # Only pages in the Talk namespace (1) have a matching article; skip the rest.
    if talk_page.namespace != 1:
        continue

    # Drop everything up to the first colon: 'Talk:Exercise' -> 'Exercise'.
    article_title = talk_page.name.split(':', 1)[1]
    articleNames.append(article_title)

In [8]:
# Preview the article titles derived from the talk-page titles.
articleNames[:3]

['996 working hour system', 'Active mobility', 'Aerobic conditioning']

# Step 4: After navigating to the actual page, extract relevant properties.

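We want the usual page title and QID. The short description would also be a very helpful feature for zero-shot classification, but note that the code below currently requests only the `wikibase_item` page property, so the short description is not yet retrieved.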

In [16]:
def getPageProps(articleName):
    """Ask the API for the `wikibase_item` page property of one title.

    Relies on the notebook-level `site` client and hands back the nested
    response exactly as the API call returned it.
    """
    return site.api(
        'query',
        prop='pageprops',
        titles=articleName,
        ppprop='wikibase_item',
    )

def flattenResults(result):
    """Collapse a nested dict (e.g. an API response) into one flat dict.

    Nested keys are joined by pandas' `json_normalize`, so a value found at
    result['a']['b'] ends up under the key 'a.b'.
    """
    flat_frame = pd.json_normalize(result)
    records = flat_frame.to_dict('records')
    # A single input dict produces a single row; return that row.
    return records[0]

def extractProperties(flattenedDict):
    """
    Pick the page id, title and QID out of a flattened API response.

    Parameters
    ----------
    flattenedDict : dict
        Flat dictionary with string keys, such as the output of
        `flattenResults`. The wanted values sit under the keys that end in
        'pageid', 'title' and 'wikibase_item'.

    Returns
    -------
    dict
        ``{'pageid': ..., 'title': ..., 'qid': ...}``.

    Raises
    ------
    IndexError
        If no key ends with one of the three suffixes, e.g. a response that
        carries no 'wikibase_item'. The message names the missing suffix.
    """
    def _first_value(suffix):
        # The leading part of each key contains the page id and so differs
        # from page to page; match on the suffix and take the first hit.
        for key, value in flattenedDict.items():
            if key.endswith(suffix):
                return value
        raise IndexError(
            f"no key ending in {suffix!r} among {list(flattenedDict)}"
        )

    return {
        'pageid': _first_value('pageid'),
        'title': _first_value('title'),
        'qid': _first_value('wikibase_item'),
    }

Now, we can put this code together.

In [10]:
# One client serves every request, so it is created once up front instead of
# being rebuilt for each article inside the loop.
user_agent = 'User-Agent (contact: zz102@wellesley.edu)'
site = mwclient.Site('en.wikipedia.org', clients_useragent=user_agent)

# Fetch the raw API response for each article title.
raw_props = [getPageProps(title) for title in articleNames]

clean_props = []
for item in raw_props:
    # 1. Flatten the response and keep only the fields of interest
    cleaned_item = extractProperties(flattenResults(item))

    # 2. Add the category name to the dictionary
    cleaned_item['category'] = 'high_importance'

    # 3. Append to final list
    clean_props.append(cleaned_item)

In [11]:
# Preview the cleaned records.
clean_props[:3]

[{'pageid': 60380397,
  'title': '996 working hour system',
  'qid': 'Q62568684',
  'category': 'high_importance'},
 {'pageid': 22570610,
  'title': 'Active mobility',
  'qid': 'Q4010939',
  'category': 'high_importance'},
 {'pageid': 2012158,
  'title': 'Aerobic conditioning',
  'qid': 'Q4688194',
  'category': 'high_importance'}]

In [18]:
def getAllArticlesByCat(category_name):
    """
    Collect page id, title and QID for the articles behind one category.

    Parameters
    ----------
    category_name : str
        Category title such as 'High-importance_Health_and_fitness_articles'.
        The text before the first underscore becomes the 'category' value of
        every returned record.

    Returns
    -------
    list of dict
        One dict per article with the keys 'pageid', 'title', 'qid' and
        'category'. Responses that cannot be processed are reported with a
        printed message and left out of the list.
    """
    user_agent = 'User-Agent (contact: zz102@wellesley.edu)'
    site = mwclient.Site('en.wikipedia.org', clients_useragent=user_agent)
    category = site.categories[category_name]

    # Steps 1-2: keep members in the Talk namespace (1) and strip the prefix,
    # 'Talk:Exercise' -> 'Exercise'. The objects yielded by the category
    # already carry `namespace` and `name`, so they are used directly instead
    # of being looked up again one at a time through site.Pages.
    articleNames = [
        member.name.split(':', 1)[1]
        for member in category
        if member.namespace == 1
    ]

    # Step 3: Get properties for each article.
    # NOTE: getPageProps reads the notebook-level `site`, not the local one above.
    raw_props = [getPageProps(a) for a in articleNames]

    # Step 4: Clean up results
    label = category_name.split('_')[0]
    clean_props = []
    for item in raw_props:
        try:
            cleaned_item = extractProperties(flattenResults(item))
            cleaned_item['category'] = label
            clean_props.append(cleaned_item)
        except Exception as e:
            # Best effort: report the response that could not be processed and move on.
            print(f"Error processing {item}: {e}")
    return clean_props

In [19]:
# Quick check of the label stored under 'category': the text before the first underscore.
'High-importance_Health_and_fitness_articles'.split('_')[0]

'High-importance'

# Step 5: Loop through all categories.

In [20]:
# Category names handed to getAllArticlesByCat, one per importance level.
categories = [
    'High-importance_Health_and_fitness_articles',
    'Low-importance_Health_and_fitness_articles',
    'Mid-importance_Health_and_fitness_articles',
    'NA-importance_Health_and_fitness_articles',
    'Top-importance_Health_and_fitness_articles',
    'Unknown-importance_Health_and_fitness_articles',
]

There should be 6907 articles.

In [21]:
# Gather the records of every category into one flat list.
full_data = []
for category_name in categories:
    full_data += getAllArticlesByCat(category_name)

Error processing OrderedDict({'batchcomplete': '', 'query': OrderedDict({'pages': OrderedDict({'40950837': OrderedDict({'pageid': 40950837, 'ns': 0, 'title': 'Diarrhea in developing regions'})}), 'userinfo': OrderedDict({'id': 0, 'name': '192.42.89.2', 'anon': ''})})}): list index out of range
Error processing OrderedDict({'batchcomplete': '', 'query': OrderedDict({'pages': OrderedDict({'1588686': OrderedDict({'pageid': 1588686, 'ns': 0, 'title': 'American Legacy Foundation'})}), 'userinfo': OrderedDict({'id': 0, 'name': '192.42.89.2', 'anon': ''})})}): list index out of range
Error processing OrderedDict({'batchcomplete': '', 'query': OrderedDict({'pages': OrderedDict({'81442807': OrderedDict({'pageid': 81442807, 'ns': 0, 'title': 'BinayTara'})}), 'userinfo': OrderedDict({'id': 0, 'name': '192.42.89.2', 'anon': ''})})}): list index out of range
Error processing OrderedDict({'batchcomplete': '', 'query': OrderedDict({'pages': OrderedDict({'80766801': OrderedDict({'pageid': 80766801, 'n

In [23]:
# Number of records collected across all categories.
len(full_data)

3177

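3177 articles were collected, fewer than the 6907 expected. At least part of the gap comes from the code above: category members outside the Talk namespace are skipped, and pages whose API response has no `wikibase_item` are dropped (see the "Error processing" messages).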

# Step 6: Save all data into a json file.

In [24]:
# Write every collected record to a single JSON file.
with open('all_articles_by_importance.json', mode='w') as out_file:
    json.dump(full_data, fp=out_file)

In [5]:
# Read the saved records back from disk and preview the first three.
with open('all_articles_by_importance.json', mode='r') as in_file:
    data = json.load(in_file)
data[:3]

[{'pageid': 60380397,
  'title': '996 working hour system',
  'qid': 'Q62568684',
  'category': 'High-importance'},
 {'pageid': 22570610,
  'title': 'Active mobility',
  'qid': 'Q4010939',
  'category': 'High-importance'},
 {'pageid': 2012158,
  'title': 'Aerobic conditioning',
  'qid': 'Q4688194',
  'category': 'High-importance'}]