# Fetching the data from the IS Academia API

We start by getting the HTML response of the tabular student data from ISAcademia.
For this, we use the [Requests](http://docs.python-requests.org/en/master/) library.


In [149]:
# We are going to use requests to do the HTTP-calls for gathering data, and BeautifulSoup for parsing the 
# HTML that we recieve
import requests
from bs4 import BeautifulSoup

# re will help us parse the html by using regular expressions
import re

# Furthermore, we will use the normal stack of pandas, numpy, matplotlib and seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as pls
import seaborn as sb

# Statistical test library
import scipy.stats as stats

## Making the reqest

*Warning*: we are loading a lot of data, thus the loading takes quite a long time. Therefore, don't run this unless it's needed

To not spam the API too much, we collect all the data in one try, and filter it afterwords.

We use the following parameters:

## TODO: Update this
~~~~~~~~~~~~~~~~
- ww_x_GPS:-1
- ww_i_reportModel:133685247
- ww_i_reportModelXsl:133685270
- ww_x_UNITE_ACAD:249847
- ww_x_PERIODE_ACAD:null
- ww_x_PERIODE_PEDAGO:null
- ww_x_HIVERETE:null


Which leads to the folloring request:
http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=null&ww_x_PERIODE_PEDAGO=null&ww_x_HIVERETE=null
~~~~~~~~~~~~~~~~

In [150]:
DEBUG = True

# TODO: make the request by using parameters to the function call, instead of coding it in the URI.
# TODO: verify that the uri is correct, and that we get all the data that we want

if DEBUG:
    # For testing and development we use the test_uri, which only loads data from 2016-2017
    uri = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=355925344&ww_x_PERIODE_PEDAGO=null&ww_x_HIVERETE=null"
else:
    # For 'production', collect all the data available from ISAcademia, for students at the IC-section
    uri = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=null&ww_x_PERIODE_PEDAGO=null&ww_x_HIVERETE=null"

req = requests.get(uri)

## Parsing the result

In [151]:
# Defining some helper functions, for clarity
def clean(string):
    return string.strip().lower().replace(' ', '_')

def is_semester_info(data):
    return len(data) <= 2

def is_header(data):
    return not ((len(data) > 2) and data[-2].isdigit())

def parse_table(table):
    students = []
    header = ''
    semester = ''

    for tr in table:
        row_data = []
        for td in tr:
            value = td.get_text().strip().replace('\xa0', ' ')
            row_data.append(value)

                     
        if is_semester_info(row_data):
            info = [clean(value) for value in row_data[0].split(', ')]
            section = info[0]
            year = info[1]
            semester, wat = info[2].split('\n_')
        elif(is_header(row_data)):
            header = [ clean(val) for val  in  row_data] 
        else:
            person = {'year': year, 'semester': semester, 'section': section, 'wat': wat}
            for i, key in enumerate(header):
                val = row_data[i].strip()
                if val: 
                    person[key] = val
                    
            students.append(person)
    
    return students

In [152]:
soup = BeautifulSoup(req.text, 'html.parser')
students_table = soup.find('table')

students = parse_table(students_table)

df = pd.DataFrame(students)
df.set_index(['no_sciper'], inplace=True)

original = df.copy()

['echange_in', '2016-2017', 'semestre_automne\n_(54_ét.)']
['echange_in', '2016-2017', 'semestre_printemps\n_(29_ét.)']
['informatique', '2016-2017', 'admission_automne\n_(22_ét.)']
['informatique', '2016-2017', 'admission_printemps\n_(2_ét.)']
['informatique', '2016-2017', 'bachelor_semestre_1\n_(235_ét.)']
['informatique', '2016-2017', 'bachelor_semestre_2\n_(1_ét.)']
['informatique', '2016-2017', 'bachelor_semestre_3\n_(83_ét.)']
['informatique', '2016-2017', 'bachelor_semestre_4\n_(2_ét.)']
['informatique', '2016-2017', 'bachelor_semestre_5\n_(120_ét.)']
['informatique', '2016-2017', 'bachelor_semestre_6\n_(24_ét.)']
['informatique', '2016-2017', 'master_semestre_1\n_(139_ét.)']
['informatique', '2016-2017', 'master_semestre_2\n_(2_ét.)']
['informatique', '2016-2017', 'master_semestre_3\n_(117_ét.)']
['informatique', '2016-2017', 'mise_à_niveau\n_(0_ét.)']
['informatique', '2016-2017', 'projet_master_automne\n_(57_ét.)']
['informatique', '2016-2017', 'projet_master_printemps\n_(0_é

In [153]:
#this here is for debug only if I mess up df somewhere down below in the code
df = original
#Lets list some basic info about parsed data
print(df.shape)
print(df.dtypes)
df.head()

(895, 12)
civilité          object
ecole_echange     object
filière_opt.      object
mineur            object
nom_prénom        object
section           object
semester          object
spécialisation    object
statut            object
type_echange      object
wat               object
year              object
dtype: object


Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
272925,Monsieur,Aalto University,,,Aspelin Karl Oskar,echange_in,semestre_automne,,Présent,Erasmus,(54_ét.),2016-2017
272175,Monsieur,Norwegian University of Science and Technology...,,,Aurlien Kristian,echange_in,semestre_automne,,Présent,Erasmus,(54_ét.),2016-2017
272043,Monsieur,Universidade de Lisboa,,,Baptista Águas André,echange_in,semestre_automne,,Présent,Erasmus,(54_ét.),2016-2017
273845,Monsieur,"University of New South Wales, Sydney",,,Bernardi Michael,echange_in,semestre_automne,,Présent,Bilatéral,(54_ét.),2016-2017
276596,Monsieur,Technische Universität Berlin,,,Böhm Felix,echange_in,semestre_automne,,Présent,Erasmus,(54_ét.),2016-2017


In [154]:
# Well first of all we noticed that parsing all data without specifing date 
# also resulted in data of students from years before 2007 and we don't want that.

# Lets split year column into year_start and year_end
df['year_start'], df['year_end'] = df['year'].str.split('-', 1).str
# Cast from object to int
df[['year_start','year_end']] = df[['year_start','year_end']].apply(pd.to_numeric)
# Drop year column 
new_df = df.drop("year", axis=1)

# Verify
print(new_df.dtypes)
new_df.head()


civilité          object
ecole_echange     object
filière_opt.      object
mineur            object
nom_prénom        object
section           object
semester          object
spécialisation    object
statut            object
type_echange      object
wat               object
year_start         int64
year_end           int64
dtype: object


Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year_start,year_end
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
272925,Monsieur,Aalto University,,,Aspelin Karl Oskar,echange_in,semestre_automne,,Présent,Erasmus,(54_ét.),2016,2017
272175,Monsieur,Norwegian University of Science and Technology...,,,Aurlien Kristian,echange_in,semestre_automne,,Présent,Erasmus,(54_ét.),2016,2017
272043,Monsieur,Universidade de Lisboa,,,Baptista Águas André,echange_in,semestre_automne,,Présent,Erasmus,(54_ét.),2016,2017
273845,Monsieur,"University of New South Wales, Sydney",,,Bernardi Michael,echange_in,semestre_automne,,Présent,Bilatéral,(54_ét.),2016,2017
276596,Monsieur,Technische Universität Berlin,,,Böhm Felix,echange_in,semestre_automne,,Présent,Erasmus,(54_ét.),2016,2017


In [155]:
# Get bachelor students...
bachelor_df = new_df[new_df["semester"].str.contains("bachelor_semestre")]
print(bachelor_df.shape)
bachelor_df.head()

(465, 13)


Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year_start,year_end
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
235688,Monsieur,,,,Abbey Alexandre,informatique,bachelor_semestre_1,,Présent,,(235_ét.),2016,2017
274015,Monsieur,,,,Ahn Seongho,informatique,bachelor_semestre_1,,Présent,,(235_ét.),2016,2017
268410,Madame,,,,Alemanno Sara,informatique,bachelor_semestre_1,,Présent,,(235_ét.),2016,2017
271464,Monsieur,,,,Althaus Luca,informatique,bachelor_semestre_1,,Présent,,(235_ét.),2016,2017
274518,Monsieur,,,,Assi Karim,informatique,bachelor_semestre_1,,Présent,,(235_ét.),2016,2017


In [156]:
# ...from year 2007 and above
bachelor_from_2007_df = bachelor_df[bachelor_df["year_start"] >= 2007]
print(bachelor_from_2007_df.shape)
bachelor_from_2007_df.head()

(465, 13)


Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year_start,year_end
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
235688,Monsieur,,,,Abbey Alexandre,informatique,bachelor_semestre_1,,Présent,,(235_ét.),2016,2017
274015,Monsieur,,,,Ahn Seongho,informatique,bachelor_semestre_1,,Présent,,(235_ét.),2016,2017
268410,Madame,,,,Alemanno Sara,informatique,bachelor_semestre_1,,Présent,,(235_ét.),2016,2017
271464,Monsieur,,,,Althaus Luca,informatique,bachelor_semestre_1,,Présent,,(235_ét.),2016,2017
274518,Monsieur,,,,Assi Karim,informatique,bachelor_semestre_1,,Présent,,(235_ét.),2016,2017


In [157]:
# search for students(rows) who either have bachelor_semestre_1 or bachelor_semestre_6
searchfor = ['bachelor_semestre_1', 'bachelor_semestre_6']
first_and_last_sem_df = bachelor_from_2007_df[bachelor_from_2007_df["semester"].str.contains('|'.join(searchfor))]

# just to visualise
by_name = first_and_last_sem_df.groupby(['nom_prénom', 'semester'])
by_name.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,civilité,ecole_echange,filière_opt.,mineur,section,spécialisation,statut,type_echange,wat,year_start,year_end
nom_prénom,semester,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Abate Bryan Jeremy,bachelor_semestre_6,Monsieur,University of Bristol,,,informatique,,Congé,Erasmus,(24_ét.),2016,2017
Abbey Alexandre,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(235_ét.),2016,2017
Ahn Seongho,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(235_ét.),2016,2017
Alami-Idrissi Ali,bachelor_semestre_6,Monsieur,Linköping University,5 - Signal and Image Processing,,informatique,,Congé,Erasmus,(24_ét.),2016,2017
Alemanno Sara,bachelor_semestre_1,Madame,,,,informatique,,Présent,,(235_ét.),2016,2017
Althaus Luca,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(235_ét.),2016,2017
Aoun Leonardo,bachelor_semestre_6,Monsieur,"University of Washington, Seattle",,,informatique,,Congé,Bilatéral,(24_ét.),2016,2017
Assi Karim,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(235_ét.),2016,2017
Bachmann Roman Christian,bachelor_semestre_6,Monsieur,Norwegian University of Science and Technology...,,,informatique,,Congé,Erasmus,(24_ét.),2016,2017
Badoux Luc-Antoine,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(235_ét.),2016,2017


In [158]:
#WARNING: clear up this cell

# Order by year_start
unique_first_and_last_sem_df = first_and_last_sem_df.sort(['year_start'])
# unique_df = unique_first_and_last_sem_df.drop_duplicates(subset=['nom_prénom', 'semester'], take_last=False)

# count if df contains only 2 semesters (sem_1 and sem_6) how many semesters each student completed
counts = pd.DataFrame(unique_df.groupby(['nom_prénom']).size(), columns=['semester_count'])

# merge with previous dataframe (add semester_count column)
students_df = unique_df.merge(counts, left_on=['nom_prénom'], right_index=True)

# filter out students who didnt complete 6 semeters
students_df = students_df[students_df["semester_count"] == 2]

print(students_df.shape)
students_df.head()
# #235688 - Abbey Alexandre	

(794, 14)




Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year_start,year_end,semester_count
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
169569,Monsieur,,,,Arévalo Christian,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,2
169569,Monsieur,,,,Arévalo Christian,informatique,bachelor_semestre_6,,Présent,,(60_ét.),2009,2010,2
178682,Monsieur,,,,Zoller Roman,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,2
178682,Monsieur,"Royal Institute of Technology, (KTH) Stockholm",,,Zoller Roman,informatique,bachelor_semestre_6,,Congé,Erasmus,(60_ét.),2009,2010,2
180854,Monsieur,,,,Vautherin Jonas,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,2


In [159]:
by_name = students_df.groupby(['nom_prénom', 'semester'])
by_name.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,civilité,ecole_echange,filière_opt.,mineur,section,spécialisation,statut,type_echange,wat,year_start,year_end,semester_count
nom_prénom,semester,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Abate Bryan Jeremy,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(242_ét.),2014,2015,2
Abate Bryan Jeremy,bachelor_semestre_6,Monsieur,University of Bristol,,,informatique,,Congé,Erasmus,(24_ét.),2016,2017,2
Aiulfi Loris Sandro,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(153_ét.),2010,2011,2
Aiulfi Loris Sandro,bachelor_semestre_6,Monsieur,,,,informatique,,Présent,,(86_ét.),2013,2014,2
Alami-Idrissi Ali,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(242_ét.),2014,2015,2
Alami-Idrissi Ali,bachelor_semestre_6,Monsieur,Linköping University,5 - Signal and Image Processing,,informatique,,Congé,Erasmus,(24_ét.),2016,2017,2
Alfonso Peterssen Alfonso,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(206_ét.),2013,2014,2
Alfonso Peterssen Alfonso,bachelor_semestre_6,Monsieur,,6 - Visual computing,,informatique,,Présent,,(104_ét.),2015,2016,2
Alonso Seisdedos Florian,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(166_ét.),2011,2012,2
Alonso Seisdedos Florian,bachelor_semestre_6,Monsieur,,,,informatique,,Présent,,(86_ét.),2013,2014,2


In [160]:
# TO DO NEXT:
# - Compute how many months it took each student to go from the first to the sixth semester.
# - Partition the data between male and female students, and compute the average -- is the difference in average statistically significant?
# - Statistical test?

In [163]:
sem_1_df = students_df[students_df["semester"] == "bachelor_semestre_1"]
unique_sem_1_df = sem_1_df.drop_duplicates(subset=['nom_prénom', 'semester'], take_last=False)

sem_6_df = students_df[students_df["semester"] == "bachelor_semestre_6"]
unique_sem_6_df = sem_6_df.drop_duplicates(subset=['nom_prénom', 'semester'], take_last=True)

difference_df = pd.DataFrame(sem_6_df["year_end"]-sem_1_df["year_start"], columns=['year_count'])
difference_df["semester_total"] = difference_df["year_count"]*2
difference_df = difference_df.drop('year_count', 1)
difference_df

  from ipykernel import kernelapp as app


Unnamed: 0_level_0,semester_total
no_sciper,Unnamed: 1_level_1
169569,6
178682,6
180854,6
175280,6
180241,6
178684,6
169795,6
178948,6
171195,6
180959,6


In [175]:
# semesters_df = pd.concat([students_df, difference_df]) #students_df.merge(difference_df, right_index=True)
semesters_df = students_df
semesters_df["semester_total"] =  difference_df["semes"]

MergeError: Must pass right_on or right_index=True