In [155]:
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

Let's define the URL we start from and the base URL for our future requests.

In [156]:
# Root of the report site; relative links found in the pages are resolved against it.
BASE_URL = 'http://isa.epfl.ch/imoniteur_ISAP/'
# Entry page of the public reports, expressed relative to BASE_URL
# ("%21" is the percent-encoded "!").
URL = urljoin(BASE_URL, "%21gedpublicreports.htm")

We initialize a session and start from the "List of registered students by section and semester" page: we request URL with a GET, passing the single parameter (`ww_i_reportmodel`) that is defined as part of the page's href.

In [157]:
# Open a session that is reused for every request in this notebook
session = requests.Session()

# Browser-like User-Agent sent with this first request
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
}
# The single query-string parameter of the start page
payload = {'ww_i_reportmodel': '133685247'}

response = session.get(URL, params=payload, headers=browser_headers)

The search page is built from frames. Using Postman we can see that the frame we need is the one whose `src` contains `filter`. We extract that target frame and build its full address from BASE_URL and the frame's `src`.

In [158]:
# Parse the page returned by the start request and list its <frame> elements
soup = BeautifulSoup(response.text, 'html.parser')
frames = soup.find_all('frame')

# Absolute URLs of the frames whose src mentions "filter".
# `or ''` guards against a <frame> without a src attribute, for which
# frame.get('src') returns None.
filter_links = [urljoin(BASE_URL, frame.get('src'))
                for frame in frames if 'filter' in (frame.get('src') or '')]

# Fail with an explicit message instead of a bare "list index out of range"
if not filter_links:
    raise IndexError("no <frame> whose src contains 'filter' was found in the start page")

# The first matching frame is the one holding the search form
header_link = filter_links[0]

Finally, we can go to the page from which the targeted data is extracted.

In [159]:
# Request the filter frame, presenting the start page as the referer,
# then parse the returned HTML; `soup` now holds the filter page.
referer_headers = {'Referer': URL}
response = session.get(header_link, headers=referer_headers)
filter_page_html = response.text
soup = BeautifulSoup(filter_page_html, 'html.parser')

Check that we did it correctly

In [160]:
# Show the parsed filter page, indented, to check it by eye
filter_page_pretty = soup.prettify()
print(filter_page_pretty)

<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
   <div>
   </div>
   <title>
   </title>
   <script src="GEDPUBLICREPORTS.txt?ww_x_path=Gestac.Base.Palette_js&amp;ww_c_langue=fr" type="text/javascript">
   </script>
   <link href="GEDPUBLICREPORTS.css?ww_x_path=Gestac.Moniteur.Style" rel="stylesheet" type="text/css">
    <link href="GEDPUBLICREPORTS.css?ww_x_path=Gestac.Moniteur.StyleNavigator" rel="stylesheet" type="text/css"/>
   </link>
  </meta>
 </head>
 <body alink="#666666" bgcolor="#ffffff" link="#666666" marginheight="0" marginwidth="5" vlink="#666666">
  <div class="filtres">
   <form action="!GEDPUBLICREPORTS.filter" method="GET" name="f">
    <input name="ww_b_list" type="hidden" value="1">
     <input name="ww_i_reportmodel" type="hidden" value="133685247">
      <input name="ww_c_langue" type="hidden" value="">
       <h1 id="titre">
        Liste des étudiants inscrits par semestre
       </h1>
       <table border="0" id="format">
 

Now we define the target values we are looking for: the section and the academic and pedagogic periods.

In [161]:
# Sections whose student lists we want
sections = ["Informatique"]

# Academic periods "2007-2008" through "2016-2017"
acad_periods = ["{}-{}".format(year, year + 1) for year in range(2007, 2017)]

# Pedagogic periods, spelled exactly as the labels looked up in the filter page
pedag_periods = (
    ["Bachelor semestre " + level for level in ("1", "2", "3", "4", "5", "5b", "6", "6b")]
    + ["Master semestre " + level for level in ("1", "2", "3", "4")]
    + ["Mineur semestre " + level for level in ("1", "2")]
    + [
        "Mise à niveau",
        "Projet Master automne",
        "Projet Master printemps",
        "Semestre automne",
        "Semestre printemps",
        "Stage automne 3ème année",
        "Stage automne 4ème année",
        "Stage printemps 3ème année",
        "Stage printemps 4ème année",
        "Stage printemps master",
        "Semestre d'automne",
        "Semestre de printemps",
    ]
)

Let's check what our request looks like when we want to download the XLS data for the Informatique section, the 2007-2008 academic period and the Bachelor semestre 1 pedagogic period:
http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?
    ww_b_list=1&
    ww_i_reportmodel=133685247&
    ww_c_langue=&
    ww_i_reportModelXsl=133685271&
    zz_x_UNITE_ACAD=Informatique&
    ww_x_UNITE_ACAD=249847&
    zz_x_PERIODE_ACAD=2007-2008&
    ww_x_PERIODE_ACAD=978181&
    zz_x_PERIODE_PEDAGO=Bachelor+semestre+1&
    ww_x_PERIODE_PEDAGO=249108&
    zz_x_HIVERETE=&
    ww_x_HIVERETE=null&
    dummy=ok
    
We can see that some variables have plain-text values, while others are integers extracted from the corresponding `<input>` and `<option>` blocks.
Some others are not relevant for our case and can be kept fixed.
We extract the numerical variables using BeautifulSoup.

In [162]:
# Value of the <input> that BeautifulSoup matches on the text "xls";
# it is sent later as the ww_i_reportModelXsl request parameter.
xls_input = soup.find('input', text="xls")
ww_i_reportModelXsl = xls_input.get('value')

In [163]:
# For every section name, look up the <option> carrying that label and keep its value
ww_x_UNITE_ACADs = []
for section_label in sections:
    section_option = soup.find('option', text=section_label)
    ww_x_UNITE_ACADs.append(section_option.get('value'))

In [164]:
# For every academic period, look up the <option> carrying that label and keep its value
ww_x_PERIODE_ACADs = []
for acad_label in acad_periods:
    acad_option = soup.find('option', text=acad_label)
    ww_x_PERIODE_ACADs.append(acad_option.get('value'))

In [165]:
# For every pedagogic period, look up the <option> carrying that label and keep its value
ww_x_PERIODE_PEDAGOs = []
for pedag_label in pedag_periods:
    pedag_option = soup.find('option', text=pedag_label)
    ww_x_PERIODE_PEDAGOs.append(pedag_option.get('value'))

Now we can put all things together to run requests and download documents

In [182]:
# Endpoint the filter form submits to (its `action` attribute). The report
# parameters must be sent there, not to the top-level start page at URL.
filter_url = urljoin(BASE_URL, '!GEDPUBLICREPORTS.filter')

# Headers are the same for every request, so build them once
request_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}

# Responses keyed by (section, academic period, pedagogic period) labels, so
# the result of every combination stays available instead of being overwritten.
filter_responses = {}

# Each zip pairs a plain-text label (zz_x_*) with the numeric id (ww_x_*)
# extracted from the filter page for that label.
for zz_x_UNITE_ACAD, ww_x_UNITE_ACAD in zip(sections, ww_x_UNITE_ACADs):
    for zz_x_PERIODE_ACAD, ww_x_PERIODE_ACAD in zip(acad_periods, ww_x_PERIODE_ACADs):
        for zz_x_PERIODE_PEDAGO, ww_x_PERIODE_PEDAGO in zip(pedag_periods, ww_x_PERIODE_PEDAGOs):

            # Same parameters, in the same order, as the request observed in the browser
            payload = {
                'ww_b_list': '1',
                'ww_i_reportmodel': '133685247',
                'ww_c_langue': '',
                'ww_i_reportModelXsl': ww_i_reportModelXsl,
                'zz_x_UNITE_ACAD': zz_x_UNITE_ACAD,
                'ww_x_UNITE_ACAD': ww_x_UNITE_ACAD,
                'zz_x_PERIODE_ACAD': zz_x_PERIODE_ACAD,
                'ww_x_PERIODE_ACAD': ww_x_PERIODE_ACAD,
                'zz_x_PERIODE_PEDAGO': zz_x_PERIODE_PEDAGO,
                'ww_x_PERIODE_PEDAGO': ww_x_PERIODE_PEDAGO,
                'zz_x_HIVERETE': '',
                'ww_x_HIVERETE': 'null',
                'dummy': 'ok',
            }

            response = session.get(filter_url, headers=request_headers, params=payload)
            filter_responses[(zz_x_UNITE_ACAD, zz_x_PERIODE_ACAD, zz_x_PERIODE_PEDAGO)] = response
            

Bachelor semestre 1 249108
Bachelor semestre 2 249114
Bachelor semestre 3 942155
Bachelor semestre 4 942163
Bachelor semestre 5 942120
Bachelor semestre 5b 2226768
Bachelor semestre 6 942175
Bachelor semestre 6b 2226785
Master semestre 1 2230106
Master semestre 2 942192
Master semestre 3 2230128
Master semestre 4 2230140
Mineur semestre 1 2335667
Mineur semestre 2 2335676
Mise à niveau 2063602308
Projet Master automne 249127
Projet Master printemps 3781783
Semestre automne 953159
Semestre printemps 2754553
Stage automne 3ème année 953137
Stage automne 4ème année 2226616
Stage printemps 3ème année 983606
Stage printemps 4ème année 2226626
Stage printemps master 2227132
Semestre d'automne 2936286
Semestre de printemps 2936295
Bachelor semestre 1 249108
Bachelor semestre 2 249114
Bachelor semestre 3 942155
Bachelor semestre 4 942163
Bachelor semestre 5 942120
Bachelor semestre 5b 2226768
Bachelor semestre 6 942175
Bachelor semestre 6b 2226785
Master semestre 1 2230106
Master semestre 2 94

In [None]:
                        # NOTE(review): unexecuted scratch fragment. These are bare dict entries with
                        # no enclosing `{...}` or assignment, so they do not form a complete statement.
                        # The values are the parameters of the example request for Informatique /
                        # 2007-2008 / Bachelor semestre 1 described in the prose above.
                        # 'Bachelor+semestre+1' is the already URL-encoded form of the label
                        # ("+" stands for a space); presumably the plain label should be used if this
                        # is ever passed as `params=` to requests. TODO confirm.
                        'ww_b_list':'1',
                        'ww_i_reportmodel': '133685247',
                        'ww_c_langue': '',
                        'ww_i_reportModelXsl': '133685271',
                        'zz_x_UNITE_ACAD': 'Informatique',
                        'ww_x_UNITE_ACAD': '249847',
                        'zz_x_PERIODE_ACAD': '2007-2008',
                        'ww_x_PERIODE_ACAD': '978181',
                        'zz_x_PERIODE_PEDAGO': 'Bachelor+semestre+1',
                        'ww_x_PERIODE_PEDAGO': '249108',
                        'zz_x_HIVERETE': '',
                        'ww_x_HIVERETE': 'null',
                        'dummy': 'ok'

In [167]:
# Hand-written filter request for Informatique / 2007-2008 / Bachelor semestre 1,
# split one query parameter per line (adjacent literals are concatenated).
REQUEST_URL = (
    "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter"
    "?ww_b_list=1"
    "&ww_i_reportmodel=133685247"
    "&ww_c_langue="
    "&ww_i_reportModelXsl=133685271"
    "&zz_x_UNITE_ACAD=Informatique"
    "&ww_x_UNITE_ACAD=249847"
    "&zz_x_PERIODE_ACAD=2007-2008"
    "&ww_x_PERIODE_ACAD=978181"
    "&zz_x_PERIODE_PEDAGO=Bachelor+semestre+1"
    "&ww_x_PERIODE_PEDAGO=249108"
    "&zz_x_HIVERETE="
    "&ww_x_HIVERETE=null"
    "&dummy=ok"
)

In [170]:
# Send the example filter request, presenting the filter frame as the referer,
# and parse the returned HTML
download_headers = {'Referer': header_link}
response_download = session.get(REQUEST_URL, headers=download_headers)
download_html = response_download.text
soup_download = BeautifulSoup(download_html, 'html.parser')

In [171]:
# Show the parsed response of the example filter request, indented, to check it by eye
download_page_pretty = soup_download.prettify()
print(download_page_pretty)

<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
   <div>
   </div>
   <title>
   </title>
   <script src="GEDPUBLICREPORTS.txt?ww_x_path=Gestac.Base.Palette_js&amp;ww_c_langue=fr" type="text/javascript">
   </script>
   <link href="GEDPUBLICREPORTS.css?ww_x_path=Gestac.Moniteur.Style" rel="stylesheet" type="text/css">
    <link href="GEDPUBLICREPORTS.css?ww_x_path=Gestac.Moniteur.StyleNavigator" rel="stylesheet" type="text/css"/>
   </link>
  </meta>
 </head>
 <body alink="#666666" bgcolor="#ffffff" link="#666666" marginheight="0" marginwidth="5" vlink="#666666">
  <div class="filtres">
   <form action="!GEDPUBLICREPORTS.filter" method="GET" name="f">
    <input name="ww_b_list" type="hidden" value="1">
     <input name="ww_i_reportmodel" type="hidden" value="133685247">
      <input name="ww_c_langue" type="hidden" value="">
       <h1 id="titre">
        Liste des étudiants inscrits par semestre
       </h1>
       <table border="0" id="format">
 

In [177]:
# Look for a text node mentioning the ww_x_GPS parameter; find() yields None
# when no text node matches the pattern.
gps_pattern = re.compile(r'ww_x_GPS=\d*')
soup_download.find(text=gps_pattern)

AttributeError: 'NoneType' object has no attribute 'parent'

In [178]:
# Hand-written XLS document link for Informatique / 2007-2008 / Bachelor semestre 1,
# split one query parameter per line (adjacent literals are concatenated).
DOCUMENT_LINK = (
    "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.XLS"
    "?ww_x_GPS=-1"
    "&ww_i_reportModel=133685247"
    "&ww_i_reportModelXsl=133685271"
    "&ww_x_UNITE_ACAD=249847"
    "&ww_x_PERIODE_ACAD=978181"
    "&ww_x_PERIODE_PEDAGO=249108"
    "&ww_x_HIVERETE=null"
)

In [179]:
# Download the XLS document and save it as test.xls.
# The request is sent and its HTTP status checked before the file is opened,
# so a failed request neither creates an empty test.xls nor saves an error page.
response = session.get(DOCUMENT_LINK, stream=True)
try:
    response.raise_for_status()

    with open('test.xls', 'wb') as handle:
        # Stream the body in 1024-byte chunks
        for block in response.iter_content(1024):
            # Skip an empty chunk rather than stopping: breaking out of the
            # loop there would silently truncate the saved file.
            if block:
                handle.write(block)
finally:
    # A streamed response keeps its connection until it is closed
    response.close()