## Ontology Creation Using Wikipedia Categories
This notebook assumes that it can connect to a mysql database that has the category, page, and categorylinks tables from wikipedia.  See https://dumps.wikimedia.org/enwiki/latest/.  This is about 50GB of data in mysql (roughly 150 million rows).  To load the data, I had to put the mysql data files in a ramdisk and then move them to a drive after the load was complete.

This notebook extracts all of the subcategories and pages that are linked to the initially specified set of categories to the specified depth.  A depth of 4 results in roughly 600k total nodes in the graph.

In [17]:
import re
import MySQLdb
from tqdm import tnrange, tqdm_notebook
import pickle
import time
import random

db = MySQLdb.connect(user='root', passwd='abc123', host='127.0.0.1', db='wiki')
c = db.cursor()

DEPTH = 4
category_graph = {}
next_nodes = []

new_nodes = [b'Artificial_intelligence', b'Nanotechnology', b'Robotics', b'Biotechnology',
             b'Networks', b'Bioinformatics', b'Biological_engineering', b'Computational_biology',
             b'Telecommunications', b'Energy', b'Ecosystems', b'Environmental_economics',
             b'Habitat', b'Earth_system_sciences', b'Environment', b'Computers', b'Learning',
             b'Education', b'Water', b'Space', b'Health', b'Poverty', b'Aid', b'Hunger',
             b'Development_economics', b'Farms', b'Land_management', b'Prevention', b'Security',
             b'Emergency_services', b'Political_philosphy', b'Governance', b'Accountability',
             b'Justice', b'Ethical_principles', b'Rights', b'Identity_politics', b'Individualism'
            ]

In [18]:
def wiki_canonicalize(category):
    # replace spaces with underscore, lowercase, and uppercase first character
    if category[:2] == b'\xce\xa3':
        category = category[2:]
    return category.replace(b' ', b'_').lower().capitalize() 

def human_canonicalize(category):
    if type(category) == bytes:
        return category.replace(b'_', b' ').lower().decode('utf-8')
    else:
        return category.replace('_', ' ').lower()

In [19]:
readable_roots = [human_canonicalize(root) for root in new_nodes]

with open('readable_roots.p', 'wb') as fp:
    pickle.dump(readable_roots, fp)

In [20]:
i=0
for current_depth in tqdm_notebook(range(DEPTH), desc='Categories'):
    for category in tqdm_notebook(new_nodes, desc='Subcategories', leave=False):
        if category.endswith(b'_stub') or category.endswith(b'_stubs'):
            continue
        if category not in category_graph:
            category_graph[category] = []
        c.execute('''
           select cl_type, count(*) from categorylinks
           where cl_to = %s group by cl_type
        ''', (category,))
        # apply some filtering if there are too many subcategories or pages.  The theory
        # is that anything with that many subcategories or pages won't be useful in the context
        # of the ontology we're building
        count_dict = {}
        for row in c.fetchall():
            count_dict[row[0]] = row[1]
        too_many_pages = count_dict.get(b'page', 0) > 250
        too_many_subcats = count_dict.get(b'subcat', 0) > 100
        if too_many_pages and too_many_subcats:
            print('skip (all): {}'.format(category), flush=True)
            continue
        where_clause = ''
        if too_many_pages:
            where_clause += ' and cl_type != "page"'
            print('skip (pages): {}'.format(category), flush=True)
        if too_many_subcats:
            where_clause += ' and cl_type != "subcat"'
            print('skip (subcat): {}'.format(category), flush=True)
        c.execute('''
           select cl_sortkey, cl_type from categorylinks
           where cl_to = %s and cl_type != "file" {}
        '''.format(where_clause), (category,))
        for row in c.fetchall():
            # rows can have multiple values separated by \n
            for subcat_or_page in row[0].strip().split(b'\n'):
                subcat_or_page = wiki_canonicalize(subcat_or_page)
                if subcat_or_page.endswith(b'_stub') or subcat_or_page.endswith(b'_stubs'):
                    continue
                if subcat_or_page not in category_graph[category] and \
                        subcat_or_page != category:
                    category_graph[category].append(subcat_or_page)
                if row[1] == b'page' and subcat_or_page not in category_graph:
                    category_graph[subcat_or_page] = []
                if row[1] == b'subcat' and subcat_or_page not in new_nodes and \
                        subcat_or_page not in category_graph and \
                        subcat_or_page not in next_nodes:
                    next_nodes.append(subcat_or_page)
    # replace the content of new_nodes with the content of next_nodes so that the loop will continue to iterate
    final_subcats = next_nodes
    for item in new_nodes:
        new_nodes.remove(item)
    for item in next_nodes:
        new_nodes.append(item)
    next_nodes = []
    
for subcat in final_subcats:
    category_graph[subcat] = []
    
readable_graph = {}
for category in category_graph:
    readable_graph[human_canonicalize(category)] = \
            [human_canonicalize(subcat) for subcat in category_graph[category]]
with open('readable_graph.p', 'wb') as fp:
    pickle.dump(readable_graph, fp)

skip (pages): b'Artificial_intelligence'
skip (pages): b'Biotechnology'
skip (pages): b'Bioinformatics'
skip (pages): b'Biotechnology'
skip (pages): b'Bioinformatics'
skip (pages): b'Human\xe2\x80\x93computer_interaction'
skip (pages): b'Artificial_intelligence_researchers'
skip (pages): b'Life_sciences_industry'
skip (pages): b'Biological_databases'
skip (subcat): b'Telecommunications_by_country'
skip (pages): b'Computer_networking'
skip (subcat): b'Energy_by_country'
skip (subcat): b'Environment_by_year'
skip (pages): b'Ecology'
skip (pages): b'Management'
skip (pages): b'Urban_studies_and_planning'
skip (pages): b'Psychology'
skip (pages): b'Educational_psychology'
skip (subcat): b'Water_by_country'
skip (pages): b'Hydrology'
skip (subcat): b'Health_by_country'
skip (pages): b'Biotechnology'
skip (pages): b'Human\xe2\x80\x93computer_interaction'
skip (subcat): b'Telecommunications_by_country'
skip (pages): b'Educational_psychology'
skip (subcat): b'Health_by_country'
skip (subcat): 

In [21]:
print(len(category_graph))

617752


In [131]:
node_name = b'Artificial_intelligence'
print(node_name)
for i in range(100):
    if category_graph[node_name] == []:
        break
    node_name = category_graph[node_name][random.randint(0, len(category_graph[node_name])-1)]
    print('{}: {}'.format(i, node_name)) 


b'Artificial_intelligence'
0: b'Logic_programming'
1: b'Scientific_community_metaphor'
