# Load in Libraries

In [1]:
# data loading and manipulation
import os
import requests
import bs4 as bs

# shared pretty-printer (4-space indent) for displaying nested structures
import pprint
pp = pprint.PrettyPrinter(indent=4)

import urllib.request
import ssl
# html5lib is the parser selected by name ('html5lib') when BeautifulSoup is
# constructed; importing it here makes a missing install fail early
import html5lib
# WARNING: `_create_unverified_context` is a private ssl helper that builds a
# context with certificate verification turned off, so HTTPS requests made
# with `context` are not protected against man-in-the-middle tampering.
# NOTE(review): prefer ssl.create_default_context() if the certificate chain
# can be fixed on this machine.
context = ssl._create_unverified_context()

# Scrape Data for CS Fields of Study
Use the ACM Computing Classification System (CCS) as the field classification: https://dl.acm.org/ccs/ccs_flat.cfm#10003120

In [2]:
# Load the ACM CCS page source, preferring a local copy over the network.
# NOTE(review): `css_source_file` looks like a typo for "ccs"; the name is
# kept unchanged in case other cells reference it.
css_source_file = './acm_ccs_source.txt'
ccs_source = ''

if os.path.isfile(css_source_file):
    # A local copy exists: read it as text (platform default encoding).
    with open(css_source_file, 'r') as f:
        ccs_source = f.read()
else:
    # No local copy: fetch the page over HTTPS.
    # NOTE(review): nothing in this cell writes the response to
    # `css_source_file`, so the page is re-downloaded on every run unless the
    # file is created some other way -- confirm whether that is intended.
    acm_ccs_url = 'https://dl.acm.org/ccs/ccs_flat.cfm#10003120'
    # Use the response as a context manager so the connection is always
    # closed, even if read() raises. The result is bytes here (str in the
    # cached branch); BeautifulSoup accepts both.
    with urllib.request.urlopen(acm_ccs_url, context=context) as response:
        ccs_source = response.read()

# Parse with the html5lib parser.
ccs_parsed = bs.BeautifulSoup(ccs_source, 'html5lib')

In [3]:
# from https://stackoverflow.com/questions/17850121/parsing-nested-html-list-with-beautifulsoup
def dictify(ul, parent):
    """Convert a nested HTML list into a nested list of node dicts.

    Args:
        ul: A list element exposing ``find_all('li', recursive=False)``; each
            returned item must expose ``stripped_strings`` and ``find('ul')``
            (e.g. a BeautifulSoup ``Tag``).
        parent: Name recorded as the ``'parent'`` of every direct item of
            ``ul`` (``None`` for the top level).

    Returns:
        A list with one dict per direct ``<li>`` child of ``ul``, in document
        order. Each dict has the keys ``'name'`` (the item's first non-empty
        text fragment), ``'parent'`` and ``'children'`` (the same structure
        built from the item's nested ``<ul>``, or ``[]`` if it has none).

    Raises:
        StopIteration: If an ``<li>`` contains no text at all.
    """
    result = []
    # recursive=False: only direct children, so nested items are not
    # flattened into this level.
    for li in ul.find_all('li', recursive=False):
        key = next(li.stripped_strings)
        # Use a separate name instead of rebinding the `ul` parameter.
        nested = li.find('ul')
        # Items without a nested <ul> are leaves; keep them with an empty
        # children list instead of silently dropping them.
        children = dictify(nested, key) if nested is not None else []
        result.append({
            'name': key,
            'parent': parent,
            'children': children
        })
    return result

In [4]:
# Locate the flat CCS listing (<div id="holdflat">) and convert its top-level
# <ul> into the nested list-of-dicts tree.
holdflat_div = ccs_parsed.find('div', id='holdflat')
if holdflat_div is None or holdflat_div.ul is None:
    # Fail with an explicit message instead of an opaque AttributeError on
    # None when the container or its list is missing from the parsed page.
    raise ValueError("could not find a <ul> inside <div id='holdflat'> in the parsed CCS page")
flat_content = holdflat_div.ul
# Top-level categories have no parent.
output = dictify(flat_content, None)
output

[{'name': 'General and reference',
  'parent': None,
  'children': [{'name': 'Document types',
    'parent': 'General and reference',
    'children': [{'name': 'Surveys and overviews',
      'parent': 'Document types',
      'children': []},
     {'name': 'Reference works', 'parent': 'Document types', 'children': []},
     {'name': 'General conference proceedings',
      'parent': 'Document types',
      'children': []},
     {'name': 'Biographies', 'parent': 'Document types', 'children': []},
     {'name': 'General literature',
      'parent': 'Document types',
      'children': []},
     {'name': 'Computing standards, RFCs and guidelines',
      'parent': 'Document types',
      'children': []}]},
   {'name': 'Cross-computing tools and techniques',
    'parent': 'General and reference',
    'children': [{'name': 'Reliability',
      'parent': 'Cross-computing tools and techniques',
      'children': []},
     {'name': 'Empirical studies',
      'parent': 'Cross-computing tools and te

In [5]:
def max_depth(tree, curr_depth):
    """Return the deepest level reached below ``tree``.

    Args:
        tree: A list of node dicts, each with a ``'children'`` list of the
            same shape.
        curr_depth: Depth already accumulated before descending into
            ``tree``.

    Returns:
        ``curr_depth`` when ``tree`` is empty; otherwise the largest depth
        found by descending one level into each node's children.
    """
    # Depth reached through each node at this level (one level deeper).
    depths_below = [
        max_depth(node['children'], curr_depth + 1)
        for node in tree
    ]
    # Seeding with curr_depth covers the empty-tree base case.
    return max([curr_depth] + depths_below)

In [6]:
# Report how deep each top-level CCS category goes.
report_template = 'General Category: {} | Max Depth: {}'
for general_category in output:
    # Wrap the category in a list so it is itself counted as one level.
    category_depth = max_depth([general_category], 0)
    print(report_template.format(general_category['name'], category_depth))

General Category: General and reference | Max Depth: 3
General Category: Hardware | Max Depth: 5
General Category: Computer systems organization | Max Depth: 5
General Category: Networks | Max Depth: 6
General Category: Software and its engineering | Max Depth: 6
General Category: Theory of computation | Max Depth: 6
General Category: Mathematics of computing | Max Depth: 6
General Category: Information systems | Max Depth: 5
General Category: Security and privacy | Max Depth: 4
General Category: Human-centered computing | Max Depth: 4
General Category: Computing methodologies | Max Depth: 5
General Category: Applied computing | Max Depth: 5
General Category: Social and professional topics | Max Depth: 6
