# Load in Libraries

In [1]:
# data loading and manipulation
import os
# NOTE(review): `requests` is not referenced in the visible cells — confirm it is still needed
import requests

import bs4 as bs

import string
import re
from nltk.corpus import stopwords
# English stop words; clean_topic_name() filters these out of topic names.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available locally — confirm
stops = set(stopwords.words("english"))

import urllib.request
import ssl
# html5lib is selected by name ('html5lib') as the BeautifulSoup parser in the scraping cell
import html5lib
# SSL context handed to urllib.request.urlopen() when the ACM CCS page is fetched.
# NOTE(review): `_create_unverified_context` is a private helper that appears to skip
# certificate verification — confirm this is intentional for the ACM download
context = ssl._create_unverified_context()

# Scrape Data for CS Fields of Study
Use the ACM Computing Classification System (CCS) as the source of field classifications: https://dl.acm.org/ccs/ccs_flat.cfm#10003120

In [2]:
# Load the ACM CCS page source, preferring a previously downloaded local copy.
css_source_file = './acm_ccs_source.txt'
ccs_source = ''

if os.path.isfile(css_source_file):
    # read raw bytes so BeautifulSoup applies the same encoding detection
    # it applies to the freshly downloaded bytes in the other branch
    with open(css_source_file, 'rb') as f:
        ccs_source = f.read()
else:
    acm_ccs_url = 'https://dl.acm.org/ccs/ccs_flat.cfm#10003120'
    # close the HTTP response deterministically instead of leaking it
    with urllib.request.urlopen(acm_ccs_url, context=context) as response:
        ccs_source = response.read()

    # persist the download so the cache check above succeeds on the next run;
    # caching is best-effort and must not prevent the parse below
    try:
        with open(css_source_file, 'wb') as f:
            f.write(ccs_source)
    except OSError as cache_error:
        print('Could not cache ACM CCS source to {}: {}'.format(css_source_file, cache_error))

ccs_parsed = bs.BeautifulSoup(ccs_source, 'html5lib')

In [3]:
# from https://stackoverflow.com/questions/17850121/parsing-nested-html-list-with-beautifulsoup
def dictify(ul, parent):
    """
    Converts a nested HTML list into a list of {'name', 'children'} dicts.

    ul: element whose direct <li> children are converted (must provide find_all).
    parent: name of the enclosing item; accepted for the recursive call signature
        but not used when building the result.

    Returns one dict per direct <li>, in document order. 'name' is the first
    stripped string of the <li>; 'children' is the converted nested <ul>, or an
    empty list when the <li> has no nested <ul> (so leaf items are kept rather
    than dropped).
    """
    result = []
    for li in ul.find_all('li', recursive=False):
        key = next(li.stripped_strings)
        # separate name so the `ul` parameter is not rebound while iterating
        nested_ul = li.find('ul')
        result.append({
            'name': key,
            'children': dictify(nested_ul, key) if nested_ul is not None else []
        })
    return result

def max_depth(tree, curr_depth):
    """
    Returns the deepest level reachable from `tree`.

    tree: list of nodes, each a dict with a 'children' list.
    curr_depth: depth assigned to the level above `tree`; an empty tree
        simply reports this value back.
    """
    # nothing below this level: the caller's depth is the answer
    if len(tree) == 0:
        return curr_depth

    # every node sits one level deeper; keep the deepest branch
    return max(max_depth(node['children'], curr_depth + 1) for node in tree)

def get_nodes_at_depth(tree, parent, curr_depth, desired_depth):
    """
    Prints the names of the nodes found at `desired_depth`, grouped by parent.

    tree: list of nodes (dicts with 'name' and 'children') at `curr_depth`.
    parent: label printed in the 'Nodes for ...' header of this group.
    Every non-empty level that is visited ends with one blank line.
    """
    # stop on an empty level or once we are past the requested depth
    if len(tree) == 0 or curr_depth > desired_depth:
        return

    if curr_depth == desired_depth:
        # requested level: header first, then each node name
        print('Nodes for {}'.format(parent))
        for node in tree:
            print(node['name'])
            get_nodes_at_depth(node['children'], node['name'], curr_depth + 1, desired_depth)
    else:
        # still above the requested level: just keep descending
        for node in tree:
            get_nodes_at_depth(node['children'], node['name'], curr_depth + 1, desired_depth)

    print()
    return

In [4]:
def index_of_key(list_of_dicts, field, target):
    """
    Returns the position of the first dict whose `field` equals `target`.

    Returns None when no dict matches.
    """
    matching_positions = (
        position
        for position, entry in enumerate(list_of_dicts)
        if entry[field] == target
    )
    return next(matching_positions, None)

In [5]:
def add_parent(tree, curr_parent=None):
    """
    Annotates every node in `tree` (in place) with the name of its parent.

    tree: list of nodes, each a dict with 'name' and 'children'.
    curr_parent: value stored under 'parent' for the nodes of this level;
        the default None marks top-level nodes.

    Returns None; the tree is modified in place.
    """
    for child in tree:
        # set current parent
        child['parent'] = curr_parent

        # recurse deeper: this node is the parent of its own children
        add_parent(child['children'], curr_parent=child['name'])

    return

In [6]:
# build the topic tree from the parsed ACM CCS page, leaving out 'General and reference'
flat_content = ccs_parsed.find('div', id='holdflat').ul
output = [node for node in dictify(flat_content, None)
          if node['name'] != 'General and reference']

# HCI: fold its siblings under 'Human-centered computing' into HCI itself ...
hcc_index = index_of_key(output, 'name', 'Human-centered computing')
hcc_children = output[hcc_index]['children']
hci_index = index_of_key(hcc_children, 'name', 'Human computer interaction (HCI)')
hci_node = hcc_children[hci_index]

new_hci_children = [sibling for position, sibling in enumerate(hcc_children)
                    if position != hci_index]
hci_node['children'] += new_hci_children

# ... and let HCI take over the top-level slot of 'Human-centered computing'
output[hcc_index] = hci_node

# AI: locate the 'Computing methodologies' subfields that get regrouped
comp_method_index = index_of_key(output, 'name', 'Computing methodologies')
comp_method_children = output[comp_method_index]['children']
ai_index = index_of_key(comp_method_children, 'name', 'Artificial intelligence')
symb_index = index_of_key(comp_method_children, 'name', 'Symbolic and algebraic manipulation')
ml_index = index_of_key(comp_method_children, 'name', 'Machine learning')
modeling_index = index_of_key(comp_method_children, 'name', 'Modeling and simulation')

# nest the three related subfields under AI and promote AI to the top level
ai_node = comp_method_children[ai_index]
ai_node['children'] += [comp_method_children[symb_index],
                        comp_method_children[ml_index],
                        comp_method_children[modeling_index]]
output.append(ai_node)

# the remaining 'Computing methodologies' subfields become top-level entries too
regrouped_indices = {ai_index, symb_index, ml_index, modeling_index}
output.extend([child for position, child in enumerate(comp_method_children)
               if position not in regrouped_indices])

# 'Computing methodologies' itself is no longer needed
del output[comp_method_index]

# record each node's parent name now that the shape is final
add_parent(output)

# show final output
output

[{'name': 'Hardware',
  'children': [{'name': 'Printed circuit boards',
    'children': [{'name': 'Electromagnetic interference and compatibility',
      'children': [],
      'parent': 'Printed circuit boards'},
     {'name': 'PCB design and layout',
      'children': [],
      'parent': 'Printed circuit boards'}],
    'parent': 'Hardware'},
   {'name': 'Communication hardware, interfaces and storage',
    'children': [{'name': 'Signal processing systems',
      'children': [{'name': 'Digital signal processing',
        'children': [],
        'parent': 'Signal processing systems'},
       {'name': 'Beamforming',
        'children': [],
        'parent': 'Signal processing systems'},
       {'name': 'Noise reduction',
        'children': [],
        'parent': 'Signal processing systems'}],
      'parent': 'Communication hardware, interfaces and storage'},
     {'name': 'Sensors and actuators',
      'children': [],
      'parent': 'Communication hardware, interfaces and storage'},
   

In [7]:
# report how deep each top-level category goes
for general_category in output:
    category_name = general_category['name']
    category_depth = max_depth([general_category], 0)
    print('{}: Max Depth =  {}'.format(category_name, category_depth))

Hardware: Max Depth =  5
Computer systems organization: Max Depth =  5
Networks: Max Depth =  6
Software and its engineering: Max Depth =  6
Theory of computation: Max Depth =  6
Mathematics of computing: Max Depth =  6
Information systems: Max Depth =  5
Security and privacy: Max Depth =  4
Human computer interaction (HCI): Max Depth =  4
Applied computing: Max Depth =  5
Social and professional topics: Max Depth =  6
Artificial intelligence: Max Depth =  5
Parallel computing methodologies: Max Depth =  3
Computer graphics: Max Depth =  3
Distributed computing methodologies: Max Depth =  3
Concurrent computing methodologies: Max Depth =  2


In [8]:
# list the nodes at depth 1 (the walk starts at depth 1 with no parent label)
get_nodes_at_depth(output, parent=None, curr_depth=1, desired_depth=1)

Nodes for None
Hardware
Computer systems organization
Networks
Software and its engineering
Theory of computation
Mathematics of computing
Information systems
Security and privacy
Human computer interaction (HCI)
Applied computing
Social and professional topics
Artificial intelligence
Parallel computing methodologies
Computer graphics
Distributed computing methodologies
Concurrent computing methodologies



In [9]:
# list the nodes at depth 2, grouped under their depth-1 parent
get_nodes_at_depth(output, parent=None, curr_depth=1, desired_depth=2)

Nodes for Hardware
Printed circuit boards
Communication hardware, interfaces and storage
Integrated circuits
Very large scale integration design
Power and energy
Electronic design automation
Hardware validation
Hardware test
Robustness
Emerging technologies

Nodes for Computer systems organization
Architectures
Embedded and cyber-physical systems
Real-time systems
Dependable and fault-tolerant systems and networks

Nodes for Networks
Network architectures
Network protocols
Network components
Network algorithms
Network performance evaluation
Network properties
Network services
Network types

Nodes for Software and its engineering
Software organization and properties
Software notations and tools
Software creation and management

Nodes for Theory of computation
Models of computation
Formal languages and automata theory
Computational complexity and cryptography
Logic
Design and analysis of algorithms
Randomness, geometry and discrete structures
Theory and algorithms for application domains

# Generate KRF From Scraped Data
Example predicates that should be generated:
```
(isa ML-Topic AcademicTopic)
(subTopic ML-Topic AI-Topic)
```

In [10]:
def clean_topic_name(topic_name, stop_words=None):
    """
    Converts a human-readable topic name into a CamelCase identifier.

    Text enclosed in parentheses or square brackets is discarded, punctuation
    is treated as a word separator, stop words are removed, and the remaining
    words are title-cased and joined without separators, e.g.
    'Human computer interaction (HCI)' -> 'HumanComputerInteraction'.

    topic_name: the raw topic name.
    stop_words: optional collection of lowercase words to drop; when omitted,
        the notebook-level `stops` set is used.

    Returns the cleaned identifier as a string (empty if no words remain).
    """
    if stop_words is None:
        stop_words = stops

    # remove any text between parens/brackets (the now-empty brackets are
    # stripped together with the rest of the punctuation below); raw strings
    # keep the regex escapes from being read as invalid string escapes
    cleaned_topic_name = re.sub(r"([\(\[]).*?([\)\]])", r"\g<1>\g<2>", topic_name)

    # replace any punctuation with spaces and convert to lowercase
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    cleaned_topic_name = cleaned_topic_name.translate(punctuation_to_space).lower()

    # remove stop words and make CamelCase; split() never yields
    # whitespace-only tokens, so no extra whitespace check is needed
    cleaned_topic_name = ''.join(word.title() for word in cleaned_topic_name.split()
                                 if word not in stop_words)

    return cleaned_topic_name


def generate_krf_as_list(tree, krf_list):
    """
    Appends KRF facts for every node of `tree` to `krf_list` and returns it.

    Each node contributes an '(isa <topic> AcademicTopic)' fact and, when its
    'parent' entry is not None, a '(subTopic <topic> <parent>)' fact, followed
    by the facts of its children. Names are passed through clean_topic_name().
    """
    # an empty level adds nothing
    if len(tree) == 0:
        return krf_list

    for node in tree:
        topic = clean_topic_name(node['name'])
        krf_list.append('(isa {} AcademicTopic)'.format(topic))

        parent_name = node['parent']
        if parent_name is not None:
            parent_topic = clean_topic_name(parent_name)
            krf_list.append('(subTopic {} {})'.format(topic, parent_topic))

        # children are appended to the same list, right after their parent
        generate_krf_as_list(node['children'], krf_list)

    return krf_list

In [11]:
# generate the facts and write them out below the microtheory header
krf_list = generate_krf_as_list(output, [])
krf_text = '(in-microtheory TeachingKioskMt)\n\n' + '\n'.join(krf_list)
with open('../academic-fields.krf', 'w') as f:
    f.write(krf_text)