# yorku_eecs_course_map
We will scrape course data from the YorkU course-selection website and build a graph so that students can see whether they have the prerequisites needed to take a given course.
<br>This Python notebook will be used to collect the data.

### Import dependencies

In [54]:
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import re
import json

In [81]:
# Course codes have the shape "<faculty>/<subject> <number>", e.g. "LE/EECS 1030".
# The subject class is spelled [A-Za-z] rather than [A-z]: the range A-z also
# covers the ASCII punctuation that sits between 'Z' and 'a' ([, \, ], ^, _, `),
# which would let malformed text be reported as a course code.
regex_normal = re.compile(r'[A-Z][A-Z][/][A-Za-z]{4} [0-9]{4}')
# Same shape, but tolerating a stray space after the slash ("LE/ EECS 1030").
regex_with_spaces = re.compile(r'[A-Z][A-Z][/] [A-Za-z]{4} [0-9]{4}')
# Three-letter, upper-case subject codes, e.g. "SC/CSE 2041".
regex_old = re.compile(r'[A-Z][A-Z][/][A-Z]{3} [0-9]{4}')

def parse_info_and_get_courses(prerequisite_description):
    """Extract course codes from a free-text requirement description.

    The text is split on ';' and every clause is scanned with each of the
    three module-level course-code patterns (regex_normal, regex_with_spaces,
    regex_old), in that order.

    Returns:
        A list of lists. Each inner list holds the matches one pattern found
        in one clause; pattern/clause pairs with no match contribute nothing.
    """
    patterns = (regex_normal, regex_with_spaces, regex_old)
    found_groups = []
    for clause in prerequisite_description.split(';'):
        hits_per_pattern = (pattern.findall(clause) for pattern in patterns)
        # Keep only the patterns that actually matched something in this clause.
        found_groups.extend(hits for hits in hits_per_pattern if hits)
    return found_groups


def _split_requisite_text(course_description):
    """Slice a course description into prerequisite and credit-exclusion text.

    Returns:
        A ``(prerequisite_description, credit_exclusion_description)`` tuple.
        When the description mentions no prerequisite at all, the result is
        ``('None', '')``.

    NOTE(review): credit exclusions are only picked up when the description
    also mentions a prerequisite; confirm that this is intended.
    """
    lowered = course_description.lower()
    credit_exclusion_description = ''

    if 'prerequisites' in lowered:
        # two ways information is presented
        # Prerequisites: General Prerequisite; LE/EECS 1030 3.00 or LE/EECS 2030 3.00.
        # Prerequisites: SC/MATH 1190 3.00, or two 4U Math courses
        start_index = lowered.index('prerequisites') + 14  # including the ':' after the word
        if 'general prerequisite' in lowered:
            start_index = lowered.index('general prerequisite') + 21
        end_index = len(course_description) - 1  # to account for '.' @ end of the sentence
        if 'course credit exclusion' in lowered:
            end_index = lowered.index('course credit exclusion')
            credit_exclusion_description = course_description[end_index:]
        # additional space left there by human error
        if 'course credit  exclusions:' in lowered:
            end_index = lowered.index('course credit  exclusion')
            credit_exclusion_description = course_description[end_index:]
        if 'corequisites' in lowered:
            end_index = lowered.index('corequisites') - 2  # account for '.' @ end of the sentence
        return course_description[start_index:end_index], credit_exclusion_description

    if 'prerequisite' in lowered:
        # some have: Prerequisite: SC/MATH 1013 3.00 or equivalent;
        start_index = lowered.index('prerequisite') + 13  # including the ':' after the word
        if 'general prerequisite' in lowered:
            start_index = lowered.index('general prerequisite') + 21
        end_index = len(course_description) - 1  # to account for '.' @ end of the sentence
        if 'course credit exclusion' in lowered:
            end_index = lowered.index('course credit exclusion') - 2
            credit_exclusion_description = course_description[end_index:]
        if 'corequisite' in lowered:
            end_index = lowered.index('corequisite') - 2  # account for '.' @ end of the sentence
        return course_description[start_index:end_index], credit_exclusion_description

    return 'None', credit_exclusion_description


def establish_connection_and_collect_data():
    """Scrape the course pages and write the collected data to courses.json.

    For every page index in ``range(COURSES_OFFERED)`` (minus a few skipped
    ones) the page is downloaded and parsed with BeautifulSoup, the course
    code / credit / name are cut out of the title, and the prerequisite and
    credit-exclusion course codes are extracted from the description. Each
    record is printed as it is produced, and all records are finally written
    to ``courses.json`` as ``{"courses": [...]}``.

    Raises:
        Whatever ``urlopen`` raises on a network failure, and
        AttributeError / IndexError if a page does not have the expected
        layout; nothing is written to disk in that case.
    """
    # NOTE(review): the original comment here said the department offers 118
    # courses over f/w, which contradicts the value 92; confirm which is right.
    COURSES_OFFERED = 92
    # internship semester for comp sci and security, cs project
    skipped_indices = {49, 50, 81}
    # NOTE(review): the path looks like it embeds a session token and may
    # stop working once that session expires; confirm.
    base_url = 'https://w2prod.sis.yorku.ca/Apps/WebObjects/cdm.woa/4/wo/Pghe5gIdme1Q6CqIxg4LGg/2.3.10.8.3.'

    collected_courses = []

    for i in range(COURSES_OFFERED):
        if i in skipped_indices:
            continue

        # prepare url before making call to website
        url = base_url + str(i) + '.0.5'

        # Download the page; the context manager closes the connection even
        # if reading the response fails.
        with urlopen(url) as u_client:
            page_html = u_client.read()

        # parse html via BS
        page_soup = soup(page_html, 'html.parser')
        course_title = str(page_soup.find('font', {'color': '#CC0000'}))
        # Drop the 22-character opening '<font color="#CC0000">' tag and the
        # 7-character closing '</font>' tag from the stringified element.
        course_title = course_title[22:-7]
        paragraphs = page_soup.find('table', {'cellpadding': '10'}).tr.td.find_all('p')
        course_description = paragraphs[3].text

        # course information, cut from fixed positions in the title
        # (presumably "<12-char code>   <4-char credit>   <name>")
        course_code = course_title[:12]
        course_credit = course_title[15:19]
        course_name = course_title[22:]

        prerequisite_description, credit_exclusion_description = _split_requisite_text(course_description)

        prerequisites = parse_info_and_get_courses(prerequisite_description)
        credit_exclusions = parse_info_and_get_courses(credit_exclusion_description)

        course_record = {
            'course_code': course_code,
            'course_name': course_name,
            'course_credit': course_credit,
            'prerequisites': prerequisites,
            'credit_exclusions': credit_exclusions,
        }
        collected_courses.append(course_record)
        print(course_record, '\n')

    # Serialise the whole document in one go: this stays valid JSON even when
    # no course was collected, unlike trimming a hand-built string.
    with open('courses.json', 'w') as f:
        json.dump({'courses': collected_courses}, f)
    # THE DATA COLLECTION STEP IS NOW COMPLETE! TIME TO MAKE COOL VISUALIZATIONS.


            
# Run the scraper: downloads the course pages, prints each parsed course
# record, and writes the result to courses.json.
establish_connection_and_collect_data()

{'course_code': 'LE/EECS 1001', 'course_name': 'Research Directions in Computing', 'course_credit': '1.00', 'prerequisites': [], 'credit_exclusions': []} 

{'course_code': 'LE/EECS 1011', 'course_name': 'Computational Thinking through Mechatronics', 'course_credit': '3.00', 'prerequisites': [], 'credit_exclusions': []} 

{'course_code': 'LE/EECS 1012', 'course_name': 'Net-centric Introduction to Computing', 'course_credit': '3.00', 'prerequisites': [], 'credit_exclusions': [['AP/ITEC 3020'], ['LE/EECS 2041'], ['SC/CSE 2041', 'SC/CSE 2041']]} 

{'course_code': 'LE/EECS 1019', 'course_name': 'Discrete Mathematics for Computer Science', 'course_credit': '3.00', 'prerequisites': [['SC/MATH 1190']], 'credit_exclusions': [['LE/EECS 1028', 'SC/MATH 1028', 'SC/MATH 2320']]} 

{'course_code': 'LE/EECS 1021', 'course_name': 'Object Oriented Programming from Sensors to Actuators', 'course_credit': '3.00', 'prerequisites': [], 'credit_exclusions': [['LE/EECS 1022'], ['LE/CSE 1020', 'SC/CSE 1020']]