In [1]:
import json
import numpy as np
import pandas as pd

In [2]:
# Load the two raw JSON inputs:
#   epfl             - programs (with their courses) grouped by study level
#   epfl_masterspecs - specialization data of the master's programs
# The encoding is pinned to UTF-8 because open() otherwise falls back to the
# locale's preferred encoding, which can mis-decode non-ASCII course text on
# some platforms.
with open("../data/raw/epfl.json", encoding="utf-8") as file:
    epfl = json.load(file)

with open("../data/raw/epfl_master_specs.json", encoding="utf-8") as file:
    epfl_masterspecs = json.load(file)

Snapshots from the json objects:

In [3]:
# Sample record: the details stored for course CS-449 within the "Data Science" master's program
epfl['master']['Data Science']['courses']['CS-449']

{'name': 'Systems for data science',
 'section': 'IN',
 'language': 'English',
 'note': '',
 'path': '/coursebook/en/systems-for-data-science-CS-449?cb_cycle=bama_cyclemaster&cb_section=sc_ds',
 'semester': 'Spring',
 'exam_form': 'During the semester',
 'credits': '6',
 'subject_examined': 'Systems for data science',
 'lecture': '2 Hour(s) per week x 14 weeks',
 'exercises': '2 Hour(s) per week x 14 weeks',
 'project': '2 Hour(s) per week x 14 weeks',
 'lecturers': [['Koch Christoph', 'http://people.epfl.ch/205917']],
 'required': ['CS-322: Introduction to database systems'],
 'recommended': ['CS-323: Introduction to operating systems',
  'CS-206 Parallelism and concurrency'],
 'concepts': ['Algorithms and data structures ¿ sorting algorithms, balanced trees, graph traversals.',
  'The Scala programming language will be used throughout the course. Programming experience in this language is strongly recommended.',
  'Basic knowledge or computer networking and distributed systems'],
 'p

In [4]:
# Sample record: specialization legend (`spec_key`) and per-course specialization codes (`courses`)
# stored for the "Civil Engineering" master's program
epfl_masterspecs['Civil Engineering']

{'spec_key': {'b': 'Geotechnical Engineering',
  'c': 'Transportation',
  'd': 'Structural Engineering',
  'e': 'Hydraulic Engineering and Energy'},
 'courses': {'CIVIL-597': [],
  'CIVIL-599': [],
  'ENG-445': [],
  'CIVIL-420': ['d'],
  'CIVIL-457': ['c'],
  'CIVIL-402': ['b'],
  'CIVIL-412': ['e'],
  'CIVIL-474': [],
  'CIVIL-491': [],
  'CIVIL-490': [],
  'PENS-490': [],
  'CIVIL-492': [],
  'PENS-491': [],
  'CIVIL-598': [],
  'CIVIL-443': ['d'],
  'CIVIL-422': ['b', 'd'],
  'CIVIL-414': ['d'],
  'CIVIL-435': ['d'],
  'CIVIL-476': ['d'],
  'ENV-508': [],
  'AR-535': ['d'],
  'CIVIL-423': ['b'],
  'CIVIL-430': ['d'],
  'CIVIL-525': ['d'],
  'ENG-470': ['b', 'd', 'e'],
  'ENG-471': ['b', 'd', 'e'],
  'CIVIL-411': ['b', 'd', 'e'],
  'CIVIL-557': ['c'],
  'CIVIL-459': ['c'],
  'ENG-466': [],
  'CIVIL-444': ['b', 'd', 'e'],
  'CIVIL-428': ['b'],
  'CIVIL-436': ['b', 'd', 'e'],
  'CIVIL-437': ['b', 'd', 'e'],
  'CIVIL-515': ['e'],
  'CIVIL-410': ['b', 'e'],
  'CIVIL-403': ['b', 'e'],
  

Let's start by noting that only 25 master's programs and only 13 bachelor's programs are currently offered at EPFL. So, we have some outdated programs in our database. Let's remove them.

In [5]:
# Number of programs stored for each study level
{level_name: len(epfl[level_name]) for level_name in epfl}

{'propedeutics': 14,
 'bachelor': 17,
 'master': 29,
 'minor': 20,
 'doctoral_school': 22}

In [6]:
# Programs present in the raw data that are not currently offered.

# Bachelor: only 13 programs are currently offered
#   + Humanities and Social Sciences Program
#   + Design Together ENAC
# Source: https://www.epfl.ch/education/bachelor/programs/
bachelor_not_current = [
    'Chemistry',
    'Chemical Engineering',
]

# Master: only 25 programs are currently offered
#   + Humanities and Social Sciences Program
# Source: https://www.epfl.ch/education/master/programs/
master_not_current = [
    'Bioengineering',
    'Life Sciences and Technologies - master program',
    'Micro- and Nanotechnologies for Integrated Systems',
]

# Known gap (no action taken for now): the following minors are missing
# from the data, among possibly others:
#   - Computational science and engineering
#   - Mechanical engineering

In [7]:
# Drop the programs that are not currently offered from the main json object.
# Only the 'bachelor' and 'master' levels have a removal list; every other
# level is left untouched.
outdated_by_level = {
    'bachelor': bachelor_not_current,
    'master': master_not_current,
}

for level_name, level_programs in epfl.items():
    outdated = outdated_by_level.get(level_name, ())
    # Snapshot the names to delete first, so the dict is not resized while it is being iterated.
    for program_name in [name for name in level_programs if name in outdated]:
        del level_programs[program_name]
        if level_name == 'master' and program_name in epfl_masterspecs:
            # a removed master's program must also leave the list of programs with specializations
            del epfl_masterspecs[program_name]

Let's now analyze the master's specializations (there are no specializations at other levels) and evaluate the accuracy of the data. We compare the data source (studyplan pages) with the studyplan brochures. Information about specializations is stored in a separate json object, `epfl_masterspecs` (loaded from `epfl_master_specs.json`).

In [8]:
"""
The studyplan page sometimes differs greatly from the up-to-date studyplan brochure
which might indicate that the studyplan page hasn't been updated.
We skip these specializations for now (we would have to manually type them in):

Architecture
https://www.epfl.ch/education/master/wp-content/uploads/2018/08/ENAC_ARCHI_MA-1.pdf
vs.
https://edu.epfl.ch/studyplan/en/master/architecture
"""
specs_to_remove = [
    'Architecture',
]

"""
The specializations legend on studyplan pages is sometimes obsolete (hasn't been removed)
We remove these programs from the list of specialization programs

Materials Science and Engineering (only one specialization now)
https://www.epfl.ch/education/master/wp-content/uploads/2018/08/STI_MX_MA-1.pdf
vs.
https://edu.epfl.ch/studyplan/en/master/materials-science-and-engineering

Microengineering
https://www.epfl.ch/education/master/wp-content/uploads/2018/08/STI_SMT_MA_RV-1.pdf
vs.
https://edu.epfl.ch/studyplan/en/master/microengineering
"""
# programs whose specialization legend is obsolete join the same removal list
specs_to_remove += [
    'Materials Science and Engineering',
    'Microengineering',
]

"""
The specializations legend on studyplan pages is sometimes obsolete, but the studyplan itself
contains references to specializations that correctly correspond to the studyplan brochure.
In this case, we update the legend manually and fix the data

Electrical and Electronics Engineering
https://www.epfl.ch/education/master/wp-content/uploads/2018/08/STI_EL_MA-1.pdf
vs.
https://edu.epfl.ch/studyplan/en/master/electrical-and-electronics-engineering
"""
# Manually maintained specialization legend (code -> specialization name) for
# "Electrical and Electronics Engineering"
electrical_electronics_eng_specs = dict(
    a="Microelectronics circuits and systems",
    b="Electronic technologies and device-circuit interactions",
    c="Bioelectronics",
    d="Internet of Things (IoT)",
    e="Data Science and Systems",
    f="Signal, Image, Video and Communication",
    g="Wireless and Photonics Circuits and Systems",
)

We remove all programs in `specs_to_remove`:

In [9]:
# Drop every program listed in `specs_to_remove` from the specialization data.
# pop() with a default is used instead of `del` so that the cell can be re-run
# (or run when a program is already absent) without raising a KeyError.
for program in specs_to_remove:
    epfl_masterspecs.pop(program, None)

... and fix "Electrical and Electronics Engineering" specializations key:

In [10]:
# Replace the legend of this program with the manually maintained `electrical_electronics_eng_specs`
epfl_masterspecs['Electrical and Electronics Engineering']['spec_key'] = electrical_electronics_eng_specs

We observe that some courses list specializations that are not present in the `spec_key` of the given program. We remove these references:

In [11]:
# For every program, strip from each course the specialization codes that do
# not appear in the program's `spec_key` legend, and log each change.
for program_name, program_spec in epfl_masterspecs.items():
    known_specs = program_spec['spec_key']
    program_courses = program_spec['courses']

    for code, course_spec in program_courses.items():
        new_course_spec = [spec for spec in course_spec if spec in known_specs]
        if len(new_course_spec) == len(course_spec):
            # every code of this course is in the legend: nothing to fix
            continue

        print(f"{program_name}, spec key: {list(program_spec['spec_key'].keys())}, {code}: {course_spec}")

        # overwriting the value of an existing key does not resize the dict, so this is safe while iterating
        program_courses[code] = new_course_spec

        print(f"Course spec changed from {course_spec} to {new_course_spec}")

Electrical and Electronics Engineering, spec key: ['a', 'b', 'c', 'd', 'e', 'f', 'g'], EE-588: ['h']
Course spec changed from ['h'] to []
Electrical and Electronics Engineering, spec key: ['a', 'b', 'c', 'd', 'e', 'f', 'g'], MATH-261: ['d', 'e', 'h']
Course spec changed from ['d', 'e', 'h'] to ['d', 'e']
Electrical and Electronics Engineering, spec key: ['a', 'b', 'c', 'd', 'e', 'f', 'g'], CS-423: ['d', 'f', 'h']
Course spec changed from ['d', 'f', 'h'] to ['d', 'f']
Electrical and Electronics Engineering, spec key: ['a', 'b', 'c', 'd', 'e', 'f', 'g'], EE-576: ['h']
Course spec changed from ['h'] to []
Electrical and Electronics Engineering, spec key: ['a', 'b', 'c', 'd', 'e', 'f', 'g'], ME-409: ['h']
Course spec changed from ['h'] to []
Electrical and Electronics Engineering, spec key: ['a', 'b', 'c', 'd', 'e', 'f', 'g'], EE-466: ['h']
Course spec changed from ['h'] to []
Electrical and Electronics Engineering, spec key: ['a', 'b', 'c', 'd', 'e', 'f', 'g'], MICRO-565: ['h']
Course spe

Next, we add a `specs` dictionary to each program in `epfl_masterspecs` with keys and values switching roles compared to the `courses` property. This will make it easier to look up all courses with a given specialization.

In [12]:
# `courses` property: first 10 (course code, list of specialization codes) pairs
# of the "Computer Science" program
list(epfl_masterspecs['Computer Science']['courses'].items())[:10]

[('CS-595', []),
 ('CS-599', []),
 ('CS-450', ['b', 'c', 'd', 'e', 'i']),
 ('CS-470', ['a', 'd', 'g']),
 ('CS-442', ['f']),
 ('COM-401', ['d', 'e', 'g', 'j']),
 ('CS-422', ['b', 'c', 'g', 'j']),
 ('CS-438', ['g']),
 ('CS-451', ['c', 'g', 'i', 'j']),
 ('CS-452', ['c', 'g'])]

In [13]:
# Build the inverted index `specs` (specialization code -> list of course codes)
# for every program, from its `courses` mapping (course code -> specialization codes).
for program_name, program_spec in epfl_masterspecs.items():
    # start with one empty course list per specialization code of the legend
    courses_by_spec = {spec_code: [] for spec_code in program_spec['spec_key']}
    program_spec['specs'] = courses_by_spec

    for course_code, course_spec in program_spec['courses'].items():
        # register the course under each specialization it is tagged with
        for spec_code in course_spec:
            courses_by_spec[spec_code].append(course_code)

In [14]:
# Course codes registered under specialization code 'g' of the "Computer Science" program
epfl_masterspecs['Computer Science']['specs']['g']

['CS-470',
 'COM-401',
 'CS-422',
 'CS-438',
 'CS-451',
 'CS-452',
 'COM-407',
 'CS-420',
 'CS-471',
 'CS-453',
 'COM-405',
 'COM-503',
 'CS-522']

Now, we have processed the specializations and can move on.

The next step is to create two new json objects by transforming the `epfl` object. The goal is to remove redundancies in the original object, in which many courses occur repeatedly, and to enable easy access not only to programs but also to _individual courses_. The idea is to create one object, `epfl_courses`, with all course details, and another object, `epfl_programs`, which only includes the course codes for each program.

In [15]:
# Flatten all levels and programs into a single {course code: course details} lookup.
# A course that appears in several programs is stored once; the record of the
# program visited last overwrites the earlier ones.
epfl_courses = {}

for level_programs in epfl.values():
    for program in level_programs.values():
        epfl_courses.update(program["courses"])

# persist the lookup as a json file
with open('../data/processed/epfl_courses.json', 'w') as json_file:
    json.dump(epfl_courses, json_file, indent=4)

In [24]:
# Scratch check of dict equality. Each literal repeats the key '1'; Python keeps only the
# last value given for a repeated key, so both sides are {'1': 2, '2': {'3': 3}}.
# NOTE(review): the recorded output below looks stale - this comparison should evaluate to True -
# and the execution count is out of sequence; re-run or remove this cell.
{'1': 1, '1': 2, '2': { '3': 3 }} == {'1': 1, '1': 2, '2': { '3': 3 }}

False

TODO: check whether the same course ever has a different number of coefficients in different sections.

TODO: should we also add the URL of the program description to `epfl_programs`?

In [18]:
# Build {education level: {program name: [course codes]}}: the same hierarchy as
# `epfl`, but keeping only the course codes of each program.
epfl_programs = {
    level_name: {
        program_name: list(program["courses"])
        for program_name, program in level_programs.items()
    }
    for level_name, level_programs in epfl.items()
}

# persist the result as a json file
with open('../data/processed/epfl_programs.json', 'w') as json_file:
    json.dump(epfl_programs, json_file, indent=4)

In [20]:
# epfl_programs