In [135]:
import requests
from bs4 import BeautifulSoup
import time
import sys
import json

from typing import *

## Initial request to start the scraping

In [2]:
# Fetch the student-set landing page once (no session) to inspect its markup.
# The timeout keeps the cell from blocking forever on an unresponsive server,
# and raise_for_status() surfaces HTTP errors instead of parsing an error page.
initial_req = requests.get(
    "https://splus.cumulus.vub.ac.be/SWS/v3/evenjr/NL/STUDENTSET/studentset.aspx",
    timeout=30,
)
initial_req.raise_for_status()
initial_html = BeautifulSoup(initial_req.text)

initial_html

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml"><head><script language="JavaScript" src="studentset.js" type="text/JavaScript"></script>
<title>
	VUB - LESROOSTER PER STUDENTENGROEP - 2.0.51
</title><link href="https://fonts.googleapis.com/css?family=Roboto" rel="stylesheet"/><link href="../../css/global.css" rel="stylesheet" type="text/css"/><link href="styles.css" rel="stylesheet" type="text/css"/></head>
<body onload="checkTags();">
    <table border="0" cellpadding="0" cellspacing="0" width="100%">
    <tbody><tr>
        <td><img class="logo-vub" src="../../images/VUB-LOGO-BLACK.svg"/><font class="studentsettitle">LESROOSTER 
		  </font><img class="studentsettitle_arrow" src="../../images/studentsettitle_arrow.png"/></td>
    </tr>
    <tr>
        <td><a class="language_link" href="../../NL/STUDENTSET/">NEDERLANDS</a> | <a class="language_link" href="../../EN/STUDENTSET/">ENGLISH</a></td>
    </tr>
    <tr>
        <td>
            <hr class="HorizontalRule"/>
        

In [3]:
# Read the value of the element whose id is __VIEWSTATE on the landing page
VIEWSTATE = initial_html.find(attrs={"id": "__VIEWSTATE"}).get("value")

VIEWSTATE

'/wEPDwUKMTkwNzE5MTYzNQ9kFgICBA9kFgQCAQ8PFgQeCENzc0NsYXNzBQZ0VGFibGUeBF8hU0ICAmRkAgMPDxYEHwAFBnRUYWJsZR8BAgJkZGTJEMT9VIl1dRnHEz7iurfJ9zkhtQ=='

## Proof of concept new request

`POST` request:
- `__VIEWSTATE`: from a hidden input (works as a kind of cookie system)
- `__EVENTTARGET`: often `tDepartmentClicked` (first argument `__doPostBack`)
- `__EVENTARGUMENT`: clicked element

JS function used for the `POST`:
```javascript
var theForm = document.forms['form1'];
if (!theForm) {
    theForm = document.form1;
}
function __doPostBack(eventTarget, eventArgument) {
    if (!theForm.onsubmit || (theForm.onsubmit() != false)) {
        theForm.__EVENTTARGET.value = eventTarget;
        theForm.__EVENTARGUMENT.value = eventArgument;
        theForm.submit();
    }
}
```

In [20]:
# Simulate a click on MASTER (MA) by replaying the page's form post-back
url = "https://splus.cumulus.vub.ac.be/SWS/v3/evenjr/NL/STUDENTSET/studentset.aspx"
data = dict([
    ("__VIEWSTATE", VIEWSTATE),
    ("__EVENTTARGET", "tTypeClicked"),  # Clicked event
    ("__EVENTARGUMENT", "MA"),  # MASTER
])
with requests.Session() as s:
    original_req = s.get(url)  # initial view
    master_req = s.post(url, data=data)  # click event
# The response body is already downloaded, so it can be parsed after the session closes
master_html = BeautifulSoup(master_req.text)

master_html

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml"><head><script language="JavaScript" src="studentset.js" type="text/JavaScript"></script>
<title>
	VUB - LESROOSTER PER STUDENTENGROEP - 2.0.51
</title><link href="https://fonts.googleapis.com/css?family=Roboto" rel="stylesheet"/><link href="../../css/global.css" rel="stylesheet" type="text/css"/><link href="styles.css" rel="stylesheet" type="text/css"/></head>
<body onload="checkTags();">
    <table border="0" cellpadding="0" cellspacing="0" width="100%">
    <tbody><tr>
        <td><img class="logo-vub" src="../../images/VUB-LOGO-BLACK.svg"/><font class="studentsettitle">LESROOSTER 
		  </font><img class="studentsettitle_arrow" src="../../images/studentsettitle_arrow.png"/></td>
    </tr>
    <tr>
        <td><a class="language_link" href="../../NL/STUDENTSET/">NEDERLANDS</a> | <a class="language_link" href="../../EN/STUDENTSET/">ENGLISH</a></td>
    </tr>
    <tr>
        <td>
            <hr class="HorizontalRule"/>
        

## Structure of HTML

Session cookie is required!!!

The name of the programme (opleiding) is contained in the id
- `html`:
    - `td.td-set`: type opleiding -> `tTypeClicked`
        - BA
        - MA
        - ...
    - `td.td-set`: Faculteit -> `tDepartmentClicked`
        - LW
        - IR
        - ...
    - `td.td-set`: Opleiding -> `tTagClicked`
        - BA
        - MA
        - ...

`tTagClicked` redirects to `Default.aspx`.

- `POST` request to get selection
```python
data = {
    "__EVENTVALIDATION": evt_validation, # from html
    "__EVENTTARGET": "",
    "__EVENTARGUMENT": "",
    "__LASTFOCUS": "",
    "__VIEWSTATE": evt_view_state, # from html
    "tLinkType": "setbytag",
    "tWildcard": "",
    "dlObject": "#SPLUS9826", # chosen cours, from html <select id="dlObject"> <option>...
    "lbWeeks": [
        "1;2;3;4;5;6;7;8;9;10;11;12;13;14",
        "22;23;24;25;26;27;28;29;32;33;34;35;36"
    ],
    "lbDays": "1;2;3;4;5;6",
    "dlPeriod": "2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23;24;25;26;27;28;29;30;31;32;33",
    "RadioType": "ical_set;ical_set;ical_set", # radio_2 from html (2 different types found)
    "bGetTimetable": "Bekijk+het+lesrooster",
}
```
- `GET` request to see url `https://splus.cumulus.vub.ac.be/SWS/v3/evenjr/NL/STUDENTSET/showtimetable.aspx`

ICal url in `<input id="ical_url"...`

- `GET` request to `studentset.aspx` to return to original list and preserve chosen state

In [62]:
# routines needed for next parts

def get_html(
    s: requests.Session,
    state_id: str,
    action_str: str,
    btn_id: str,
    url: str = "https://splus.cumulus.vub.ac.be/SWS/v3/evenjr/NL/STUDENTSET/studentset.aspx",
) -> Tuple[BeautifulSoup, str]:
    """Replay one form post-back ("click") and return the resulting page.

    Args:
        s: Session used to send the request.
        state_id: Value sent as ``__VIEWSTATE``.
        action_str: Value sent as ``__EVENTTARGET`` (e.g. ``"tTypeClicked"``).
        btn_id: Value sent as ``__EVENTARGUMENT`` (id of the clicked element).
        url: Page to post to; defaults to the student-set overview page.

    Returns:
        A ``(page, view_state)`` tuple: the parsed response and the value of
        its ``__VIEWSTATE`` element, to be passed to the next post-back.

    Raises:
        AttributeError: If the response contains no element with id
            ``__VIEWSTATE``.
    """
    print(f"Making req: {action_str}: {btn_id}")
    data = {
        "__VIEWSTATE": state_id,
        "__EVENTTARGET": action_str, # Clicked event
        "__EVENTARGUMENT": btn_id, # clicked btn
    }
    req = s.post(url, data=data)
    html = BeautifulSoup(req.text)

    return html, html.find(id='__VIEWSTATE').get("value")

def get_sub_ids(section_html: BeautifulSoup) -> List[str]:
    """Collect the ``id`` attribute of every selectable cell in one section.

    Cells matching class ``tCell`` come first, followed by those matching
    ``tCellSelected``; within each group the order is the one ``find_all``
    returns.
    """
    cells = []
    for css_class in ("tCell", "tCellSelected"):
        cells += section_html.find_all("td", class_=css_class)
    return [cell['id'] for cell in cells]

def get_sets(html: BeautifulSoup) -> List[BeautifulSoup]:
    """Return every ``<td>`` element of ``html`` that carries the CSS class ``td-set``."""
    set_cells = html.find_all("td", attrs={"class": "td-set"})
    return set_cells

In [80]:
# Proof of concept: click through type -> faculty -> programme (opleiding),
# request one timetable, open its iCal page and return to the overview.

def _dump_step(session: requests.Session, banner: str, page: BeautifulSoup) -> None:
    """Print the session cookies, then ``page`` underneath the given banner line."""
    print("===================== SESSION COOKIES ============")
    print(session.cookies)
    print(banner)
    print(page)


with requests.Session() as s:
    s.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"
    s.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"

    url = "https://splus.cumulus.vub.ac.be/SWS/v3/evenjr/NL/STUDENTSET/studentset.aspx"

    init_req = s.get(url)
    init_html = BeautifulSoup(init_req.text)
    # Read the view state from the page fetched in THIS session. The previous
    # code read it from `html`, a name this cell never defines.
    view_state = init_html.find(id='__VIEWSTATE').get("value")
    _dump_step(s, "==================== INIT HTML ===================", init_html)

    print("[*] Set Content Type to form validation")
    s.headers["Content-Type"] = "application/x-www-form-urlencoded"

    # First click: type MA
    ma_html, view_state = get_html(s, view_state, "tTypeClicked", "MA")
    _dump_step(s, "===================== MA HTML ====================", ma_html)

    # Second click: faculty IR
    ir_html, view_state = get_html(s, view_state, "tDepartmentClicked", "IR")
    _dump_step(s, "===================== IR HTML ====================", ir_html)

    # Third click: a programme tag (the response is the timetable selection form)
    ir_url_html, view_state = get_html(s, view_state, "tTagClicked", "SWS_BA_IR_NL_RS_Ingenieurswetenschappen_SET")
    _dump_step(s, "===================== TT HTML ====================", ir_url_html)

    # Select class and go to ical url
    url = "https://splus.cumulus.vub.ac.be/SWS/v3/evenjr/NL/STUDENTSET/Default.aspx"
    evt_validation = ir_url_html.find(id="__EVENTVALIDATION").get("value")
    evt_view_state = ir_url_html.find(id="__VIEWSTATE").get("value")

    data = {
        "__EVENTVALIDATION": evt_validation,
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__LASTFOCUS": "",
        "__VIEWSTATE": evt_view_state,
        "tLinkType": "setbytag",
        "tWildcard": "",
        "dlObject": "#SPLUS9826", # chosen entry of the dlObject drop-down
        "lbWeeks": [
            "1;2;3;4;5;6;7;8;9;10;11;12;13;14",
            "22;23;24;25;26;27;28;29;32;33;34;35;36"
        ],
        "lbDays": "1;2;3;4;5;6",
        "dlPeriod": "2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23;24;25;26;27;28;29;30;31;32;33",
        "RadioType": "ical_set;ical_set;ical_set",
        "bGetTimetable": "Bekijk+het+lesrooster",
    }

    select_req = s.post(url, data=data)
    select_html = BeautifulSoup(select_req.text)
    _dump_step(s, "===================== TT RET HTML ====================", select_html)

    # ical
    url = "https://splus.cumulus.vub.ac.be/SWS/v3/evenjr/NL/STUDENTSET/showtimetable.aspx"
    ical_req = s.get(url)
    ical_html = BeautifulSoup(ical_req.text)
    _dump_step(s, "===================== ICALL HTML ====================", ical_html)

    # Next calendar: return to the overview page and pick up its view state
    url = "https://splus.cumulus.vub.ac.be/SWS/v3/evenjr/NL/STUDENTSET/studentset.aspx?"
    back_req = s.get(url)
    back_html = BeautifulSoup(back_req.text)
    view_state = back_html.find(id="__VIEWSTATE").get("value")
    _dump_step(s, "===================== BACK HTML ====================", back_html)

    lw_html, view_state = get_html(s, view_state, "tDepartmentClicked", "LW")
    _dump_step(s, "===================== LW HTML ====================", lw_html)

    print("===================== SESSION COOKIES ============")
    print(s.cookies)

s = None

<RequestsCookieJar[<Cookie ASP.NET_SessionId=v0fqanfr0nbqa1mjunelakzk for splus.cumulus.vub.ac.be/>, <Cookie ScientiaSWS=12A7D6ECABA38FBDE3E270F28CFB16C8E388E278D259B886713107C9FFE98930C6F205E7B1EA0365E40077E07F65801FC6698FCA3FE23F8BE417B88544D42CAFC21F65718E962D484DC0309674CCEE46DDD0D719E3D71D1F6096D1E54A4A72B938CC99DF for splus.cumulus.vub.ac.be/>]>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml"><head><script language="JavaScript" src="studentset.js" type="text/JavaScript"></script>
<title>
	VUB - LESROOSTER PER STUDENTENGROEP - 2.0.51
</title><link href="https://fonts.googleapis.com/css?family=Roboto" rel="stylesheet"/><link href="../../css/global.css" rel="stylesheet" type="text/css"/><link href="styles.css" rel="stylesheet" type="text/css"/></head>
<body onload="checkTags();">
    <table border="0" cellpadding="0" cellspacing="0" width="100%">
    <tbody><tr>
        <td><img class="logo-vub" src="../../images/VUB-LOGO-BLACK.svg"/><font class="studentsettitle">LESROOST

In [133]:
def _timetable_form(page: BeautifulSoup, object_id: str) -> Dict[str, Any]:
    """Build the ``Default.aspx`` form payload that requests the timetable of ``object_id``."""
    return {
        "__EVENTVALIDATION": page.find(id="__EVENTVALIDATION").get("value"),
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__LASTFOCUS": "",
        "__VIEWSTATE": page.find(id="__VIEWSTATE").get("value"),
        "tLinkType": "setbytag",
        "tWildcard": "",
        "dlObject": object_id, # chosen entry of the dlObject drop-down
        "lbWeeks": [
            "1;2;3;4;5;6;7;8;9;10;11;12;13;14",
            "22;23;24;25;26;27;28;29;32;33;34;35;36"
        ],
        "lbDays": "1;2;3;4;5;6",
        "dlPeriod": "2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23;24;25;26;27;28;29;30;31;32;33",
        "RadioType": page.find(id="RadioType_2").get("value"),
        # NOTE(review): this value looks URL-encoded already ("+" for spaces);
        # kept unchanged because the crawl worked with it. Confirm what the form expects.
        "bGetTimetable": "Bekijk+het+lesrooster",
    }


def get_ical(s: requests.Session, initial_html: BeautifulSoup) -> Tuple[Dict[str, Dict[str, Any]], str]:
    """Collect the iCal URL for every entry of the ``dlObject`` drop-down.

    For each ``<option>`` inside the element with id ``dlObject`` the
    timetable form is posted to ``Default.aspx``; ``showtimetable.aspx`` is
    then fetched and the value of its ``ical_url`` element is stored. An
    option for which an expected element is missing is reported on stderr and
    skipped. Finally ``studentset.aspx`` is requested again so the caller can
    continue from the overview page.

    Args:
        s: Session used for all requests.
        initial_html: Page containing the ``dlObject`` drop-down and the
            hidden form fields.

    Returns:
        ``(tree, view_state)`` where ``tree`` maps each option's ``value``
        attribute to ``{"url": ..., "id": ..., "value": ...}`` and
        ``view_state`` is the ``__VIEWSTATE`` of the overview page fetched at
        the end.

    Raises:
        AttributeError: If ``initial_html`` has no ``dlObject`` element, or
            the overview page has no ``__VIEWSTATE`` element.
    """
    base_url = "https://splus.cumulus.vub.ac.be/SWS/v3/evenjr/NL/STUDENTSET/"
    tree = {}

    # get all classes
    dlObject = initial_html.find(id="dlObject")
    # get_text() yields a plain str: unlike contents[0] it does not fail on an
    # empty <option> and does not keep the parse tree alive through the label.
    opt_vals = [
        {"id": option.get("value"), "value": option.get_text()}
        for option in dlObject.find_all("option")
    ]
    print(f"({len(opt_vals)})", end="")

    # Select class
    for opt in opt_vals:
        ical_html = ""  # stays "" in the error dump if the failure happens before the iCal page is parsed

        try:
            data = _timetable_form(initial_html, opt["id"])

            print(".", end="")
            s.post(base_url + "Default.aspx", data=data) # Send request

            # ical
            ical_req = s.get(base_url + "showtimetable.aspx") # Go to ical screen
            ical_html = BeautifulSoup(ical_req.text)
            ical_url = ical_html.find(id="ical_url").get("value")

            tree[opt["id"]] = {
                "url": ical_url,
                **opt
            }
        except AttributeError as err:
            # find() returned None for an expected element: report and move on
            print(f"ICal could not be found for {opt}, error: {err}", file=sys.stderr)
            print(ical_html, file=sys.stderr)

    print() # esthetics
    # Next calendar
    print("Going back")
    back_req = s.get(base_url + "studentset.aspx?")
    back_html = BeautifulSoup(back_req.text)
    view_state = back_html.find(id="__VIEWSTATE").get("value")

    return tree, view_state

def get_opleid(s: requests.Session, state_id: str, initial_html: BeautifulSoup):
    """Crawl every programme (opleiding) listed in the third ``td-set`` of ``initial_html``.

    Each programme id is clicked with a ``tTagClicked`` post-back and the
    resulting page is handed to ``get_ical``. Returns ``(tree, state_id)``
    where ``tree`` maps programme id to the ``get_ical`` result and
    ``state_id`` is the most recent view state.
    """
    programme_ids = get_sub_ids(get_sets(initial_html)[2])
    tree = {}
    for programme_id in programme_ids:
        page, state_id = get_html(s, state_id, "tTagClicked", programme_id)
        tree[programme_id], state_id = get_ical(s, page)
    return tree, state_id

def get_fac(s: requests.Session, state_id: str, inital_html: BeautifulSoup, skip: int = 2) -> Tuple[Dict[str, Any], str]:
    """Crawl the faculties listed in the second ``td-set`` of ``inital_html``.

    Each faculty id is clicked with a ``tDepartmentClicked`` post-back and the
    resulting page is handed to ``get_opleid``.

    Args:
        s: Session used for all requests.
        state_id: View state to send with the first post-back.
        inital_html: Page whose second ``td-set`` lists the faculties.
        skip: Number of leading faculty ids (in ``get_sub_ids`` order) to
            leave out. The default of 2 keeps the previously hard-coded
            ``fac[2:]`` behaviour; pass 0 to crawl every faculty. Expected to
            be non-negative.
            NOTE(review): why the first two ids were skipped is not visible
            here - confirm whether it is still wanted.

    Returns:
        ``(tree, state_id)`` where ``tree`` maps faculty id to the
        ``get_opleid`` result and ``state_id`` is the most recent view state.
    """
    tree = {}
    sets = get_sets(inital_html)
    fac = get_sub_ids(sets[1])
    for f in fac[skip:]:
        html, state_id = get_html(s, state_id, "tDepartmentClicked", f)
        opl, state_id = get_opleid(s, state_id, html)
        tree[f] = opl
    return tree, state_id

def get_type(s: requests.Session, state_id: str, initial_html: BeautifulSoup):
    """Crawl every programme type listed in the first ``td-set`` of ``initial_html``.

    Each type id is clicked with a ``tTypeClicked`` post-back and the
    resulting page is handed to ``get_fac``. Returns ``(tree, state_id)``
    where ``tree`` maps type id to the ``get_fac`` result and ``state_id`` is
    the most recent view state.
    """
    type_ids = get_sub_ids(get_sets(initial_html)[0])
    tree = {}
    for type_id in type_ids:
        page, state_id = get_html(s, state_id, "tTypeClicked", type_id)
        tree[type_id], state_id = get_fac(s, state_id, page)
    return tree, state_id

In [134]:
# Building the tree
with requests.Session() as s:
    s.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"
    s.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"

    # Set explicitly so the cell does not depend on whichever earlier cell
    # last assigned `url`.
    url = "https://splus.cumulus.vub.ac.be/SWS/v3/evenjr/NL/STUDENTSET/studentset.aspx"
    req = s.get(url) # Initial req
    html = BeautifulSoup(req.text)
    view_state = html.find(id='__VIEWSTATE').get("value")

    print("[*] Set Content Type to form validation")
    s.headers["Content-Type"] = "application/x-www-form-urlencoded"

    # get_type returns (tree, view_state), so `tree` holds that whole tuple;
    # the crawled dictionary itself is tree[0].
    tree = get_type(s, view_state, html)

tree

[*] Set Content Type to form validation
Making req: tTypeClicked: BA
Making req: tDepartmentClicked: PE
Making req: tTagClicked: SWS_BA_PE_NL_RS_Agogische_Wetenschappen_SET
(9).........
Going back
Making req: tTagClicked: SWS_BA_PE_NL_RS_Psychologie_SET
(21).....................
Going back
Making req: tTagClicked: SWS_BA_PE_NL_WS_Psychologie_SET
(4)....
Going back
Making req: tDepartmentClicked: ES
Making req: tTagClicked: SWS_BA_ES_ENG_RS_Business_Economics_GRP
(4)....
Going back
Making req: tTagClicked: SWS_BA_ES_NL_RS_Communicatiewetenschappen_GRP
(3)...
Going back
Making req: tTagClicked: SWS_BA_ES_NL_RS_Toegepaste_Economische_Wetenschappen_Handelsingenieur_GRP
(3)...
Going back
Making req: tTagClicked: SWS_BA_ES_NL_RS_Politieke_Wetenschappen_GRP
(3)...
Going back
Making req: tTagClicked: SWS_BA_ES_ENG_RS_Social_Sciences_GRP
(5).....
Going back
Making req: tTagClicked: SWS_BA_ES_NL_RS_Sociologie_GRP
(24)........................
Going back
Making req: tTagClicked: SWS_BA_ES_NL_RS_To

({'BA': {'PE': {'SWS_BA_PE_NL_RS_Agogische_Wetenschappen_SET': {'BPS2257': {'url': 'http://splus.cumulus.vub.ac.be/sws/v3/evenjr/NL/XML/default.aspx?ical_set&p1=EF29CC48C18E1A2B440A71EC42FE8852',
      'id': 'BPS2257',
      'value': '1 BA Agogische Wetenschappen'},
     '#SPLUS82A90F': {'url': 'http://splus.cumulus.vub.ac.be/sws/v3/evenjr/NL/XML/default.aspx?ical_set&p1=0467517F8C3703143468F881EA82A90F',
      'id': '#SPLUS82A90F',
      'value': '1 BA Agogische wetenschappen - steunmodule'},
     '#SPLUS34CA71': {'url': 'http://splus.cumulus.vub.ac.be/sws/v3/evenjr/NL/XML/default.aspx?ical_set&p1=3F5C794BF0DA4D944A7151F93134CA71',
      'id': '#SPLUS34CA71',
      'value': '2 BA Agogische Wetenschappen'},
     '#SPLUS82A910': {'url': 'http://splus.cumulus.vub.ac.be/sws/v3/evenjr/NL/XML/default.aspx?ical_set&p1=0467517F8C3703143468F881EA82A910',
      'id': '#SPLUS82A910',
      'value': '2 BA Agogische wetenschappen - steunmodule'},
     '4G92212_1509': {'url': 'http://splus.cumulus.

In [141]:
# `tree` is the (tree, view_state) tuple returned by get_type, so index 0 is the crawled dictionary
data_tree = json.dumps(tree[0])

In [142]:
# Write the serialized tree to disk; the encoding is pinned so the file does
# not depend on the platform's default text encoding.
with open("data_tree_dummy.json", "w", encoding="utf-8") as f:
    f.write(data_tree)