# Fetching the data from the IS Academia API

We start by getting the HTML response of the tabular student data from ISAcademia.
For this, we use the [Requests](http://docs.python-requests.org/en/master/) library.


In [1]:
# We are going to use requests to do the HTTP-calls for gathering data, and BeautifulSoup for parsing the 
# HTML that we recieve
import requests
from bs4 import BeautifulSoup

# re will help us parse the html by using regular expressions
import re

# Furthermore, we will use the normal stack of pandas, numpy, matplotlib and seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as pls
import seaborn as sb

# Statistical test library
import scipy.stats as stats

## Making the reqest

*Warning*: we are loading a lot of data, thus the loading takes quite a long time. Therefore, don't run this unless it's needed

To not spam the API too much, we collect all the data in one try, and filter it afterwords.

We use the following parameters:

## TODO: Update this
~~~~~~~~~~~~~~~~
- ww_x_GPS:-1
- ww_i_reportModel:133685247
- ww_i_reportModelXsl:133685270
- ww_x_UNITE_ACAD:249847
- ww_x_PERIODE_ACAD:null
- ww_x_PERIODE_PEDAGO:null
- ww_x_HIVERETE:null


Which leads to the following request:
http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=null&ww_x_PERIODE_PEDAGO=null&ww_x_HIVERETE=null
fetching data for all Computer Science students (Informatique) for all available years and semesters.
Such querring technique might be problematic with larger datasets (would probably result in server timeout) but since it works for our problem we stick to it.
~~~~~~~~~~~~~~~~

In [2]:
DEBUG = False

# TODO: make the request by using parameters to the function call, instead of coding it in the URI.
# TODO: verify that the uri is correct, and that we get all the data that we want

if DEBUG:
    # For testing and development we use the test_uri, which only loads data from 2016-2017
    uri = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=355925344&ww_x_PERIODE_PEDAGO=null&ww_x_HIVERETE=null"
else:
    # For 'production', collect all the data available from ISAcademia, for students at the IC-section
    uri = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=null&ww_x_PERIODE_PEDAGO=null&ww_x_HIVERETE=null"

req = requests.get(uri)

## Parsing the result

In [3]:
# Defining some helper functions, for clarity
def clean(string):
    return string.strip().lower().replace(' ', '_')

def is_semester_info(data):
    return len(data) <= 2

def is_header(data):
    return not ((len(data) > 2) and data[-2].isdigit())

def parse_table(table):
    students = []
    header = ''
    semester = ''

    for tr in table:
        row_data = []
        for td in tr:
            value = td.get_text().strip().replace('\xa0', ' ')
            row_data.append(value)

                     
        if is_semester_info(row_data):
            info = [clean(value) for value in row_data[0].split(', ')]
            section = info[0]
            year = info[1]
            semester, wat = info[2].split('\n_')
        elif(is_header(row_data)):
            header = [ clean(val) for val  in  row_data] 
        else:
            person = {'year': year, 'semester': semester, 'section': section, 'wat': wat}
            for i, key in enumerate(header):
                val = row_data[i].strip()
                if val: 
                    person[key] = val
                    
            students.append(person)
    
    return students

In [4]:
soup = BeautifulSoup(req.text, 'html.parser')
students_table = soup.find('table')

students = parse_table(students_table)

df = pd.DataFrame(students)
df.set_index(['no_sciper'], inplace=True)

original = df.copy()

In [80]:
#this here is for debug only if I mess up df somewhere down below in the code
df = original
#Lets list some basic info about parsed data
print(df.shape)
print(df.dtypes)
df.head()

(20107, 14)
civilité          object
ecole_echange     object
filière_opt.      object
mineur            object
nom_prénom        object
section           object
semester          object
spécialisation    object
statut            object
type_echange      object
wat               object
year              object
year_start         int64
year_end           int64
dtype: object


Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year,year_start,year_end
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
170337,Monsieur,Universidad de Granada,,,Arco Arredondo Rafael,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005-2006,2005,2006
171585,Monsieur,Ecole Supérieure de Chimie Physique Electroniq...,,,Balas Marc,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005-2006,2005,2006
170200,Madame,Linköping University,,,Bergendal Anna,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005-2006,2005,2006
171880,Monsieur,Ecole Supérieure de Chimie Physique Electroniq...,,,Bret Jean-Sébastien,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005-2006,2005,2006
171505,Monsieur,Ecole Supérieure de Chimie Physique Electroniq...,,,Burret Nicolas,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005-2006,2005,2006


In [81]:
# Well first of all we noticed that parsing all data without specifing date 
# also resulted in data of students from years before 2007 and we don't want that.

# Lets split year column into year_start and year_end
df['year_start'], df['year_end'] = df['year'].str.split('-', 1).str
# Cast from object to int
df[['year_start','year_end']] = df[['year_start','year_end']].apply(pd.to_numeric)
# Drop year column 
new_df = df.drop("year", axis=1)

# Verify
print(new_df.dtypes)
new_df.head()


civilité          object
ecole_echange     object
filière_opt.      object
mineur            object
nom_prénom        object
section           object
semester          object
spécialisation    object
statut            object
type_echange      object
wat               object
year_start         int64
year_end           int64
dtype: object


Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year_start,year_end
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
170337,Monsieur,Universidad de Granada,,,Arco Arredondo Rafael,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005,2006
171585,Monsieur,Ecole Supérieure de Chimie Physique Electroniq...,,,Balas Marc,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005,2006
170200,Madame,Linköping University,,,Bergendal Anna,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005,2006
171880,Monsieur,Ecole Supérieure de Chimie Physique Electroniq...,,,Bret Jean-Sébastien,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005,2006
171505,Monsieur,Ecole Supérieure de Chimie Physique Electroniq...,,,Burret Nicolas,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005,2006


In [82]:
bachelor_df = new_df[new_df["semester"].str.contains("bachelor_semestre")]
print(bachelor_df.shape)
bachelor_df.head()

(7271, 13)


Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year_start,year_end
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
154168,Monsieur,,,,Aghamahdi Mohammad Hossein,informatique,bachelor_semestre_1,,Présent,,(107_ét.),2004,2005
160104,Monsieur,,,,Alves Sergio,informatique,bachelor_semestre_1,,Présent,,(107_ét.),2004,2005
154157,Madame,,,,Andriambololona Riana Miarantsoa,informatique,bachelor_semestre_1,,Présent,,(107_ét.),2004,2005
166876,Monsieur,,,,Aslan Unal,informatique,bachelor_semestre_1,,Présent,,(107_ét.),2004,2005
166258,Monsieur,,,,Balet Ken,informatique,bachelor_semestre_1,,Présent,,(107_ét.),2004,2005


In [83]:
# ...from year 2007 and above
bachelor_from_2007_df = bachelor_df[bachelor_df["year_start"] >= 2007]
print(bachelor_from_2007_df.shape)
bachelor_from_2007_df.head()

(5807, 13)


Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year_start,year_end
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
169569,Monsieur,,,,Arévalo Christian,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008
174905,Monsieur,,,,Aubelle Flavien,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008
173922,Monsieur,,,,Badoud Morgan,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008
179406,Monsieur,,,,Baeriswyl Jonathan,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008
179428,Monsieur,,,,Barroco Michael,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008


In [84]:
# search for students(rows) who either have bachelor_semestre_1 or bachelor_semestre_6
searchfor = ['bachelor_semestre_1', 'bachelor_semestre_6']
first_and_last_sem_df = bachelor_from_2007_df[bachelor_from_2007_df["semester"].str.contains('|'.join(searchfor))]

# just to visualise
by_name = first_and_last_sem_df.groupby(['nom_prénom', 'semester'])
by_name.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,civilité,ecole_echange,filière_opt.,mineur,section,spécialisation,statut,type_echange,wat,year_start,year_end
nom_prénom,semester,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Abate Bryan Jeremy,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(242_ét.),2014,2015
Abate Bryan Jeremy,bachelor_semestre_6,Monsieur,University of Bristol,,,informatique,,Congé,Erasmus,(24_ét.),2016,2017
Abbey Alexandre,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(216_ét.),2015,2016
Abboud Magaly,bachelor_semestre_1,Madame,,,,informatique,,Présent,,(242_ét.),2014,2015
Abdallah Jad,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(117_ét.),2009,2010
Achour Maher Ali,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(153_ét.),2010,2011
Adler Yves-Fredricq Samuel,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(206_ét.),2013,2014
Aeby Prisca,bachelor_semestre_6,Madame,University of Bristol,,,informatique,,Congé,Erasmus,(116_ét.),2014,2015
Ahluwalia Samit,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(96_ét.),2008,2009
Ahmed Fares,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(242_ét.),2014,2015


In [85]:
#WARNING: clear up this cell

# Order by year_start
year_start_order_df = first_and_last_sem_df.sort_values(by=['year_start'])
year_start_order_df

Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year_start,year_end
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
169569,Monsieur,,,,Arévalo Christian,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008
166344,Monsieur,,,,Blatter Jérémy,informatique,bachelor_semestre_6,,Présent,,(38_ét.),2007,2008
161279,Monsieur,,,,Biollay Jean Isaac Jamal Pachacutec,informatique,bachelor_semestre_6,,Présent,,(38_ét.),2007,2008
166701,Monsieur,,,,Beuret Thibaut,informatique,bachelor_semestre_6,,Présent,,(38_ét.),2007,2008
170220,Monsieur,,,,Barras Florian,informatique,bachelor_semestre_6,,Présent,,(38_ét.),2007,2008
178682,Monsieur,,,,Zoller Roman,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008
174340,Madame,,,,Wüthrich Nathalie,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008
175834,Monsieur,,,,Wicht Mathieu,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008
180185,Monsieur,,,,Vo Nhu-Hoài Robert,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008
174120,Monsieur,,,,Vlassov Nikita,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008


In [86]:
by_name = year_start_order_df.groupby(['nom_prénom', 'semester'])
by_name.first()
# 1839 rows/students

Unnamed: 0_level_0,Unnamed: 1_level_0,civilité,ecole_echange,filière_opt.,mineur,section,spécialisation,statut,type_echange,wat,year_start,year_end
nom_prénom,semester,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Abate Bryan Jeremy,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(242_ét.),2014,2015
Abate Bryan Jeremy,bachelor_semestre_6,Monsieur,University of Bristol,,,informatique,,Congé,Erasmus,(24_ét.),2016,2017
Abbey Alexandre,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(216_ét.),2015,2016
Abboud Magaly,bachelor_semestre_1,Madame,,,,informatique,,Présent,,(242_ét.),2014,2015
Abdallah Jad,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(117_ét.),2009,2010
Achour Maher Ali,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(153_ét.),2010,2011
Adler Yves-Fredricq Samuel,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(206_ét.),2013,2014
Aeby Prisca,bachelor_semestre_6,Madame,University of Bristol,,,informatique,,Congé,Erasmus,(116_ét.),2014,2015
Ahluwalia Samit,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(96_ét.),2008,2009
Ahmed Fares,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(242_ét.),2014,2015


In [87]:
# Warning. We are not EPFL students so it is extermly hard for us to tell how IS-Academia system really works.
# We assume that in order to consider bachelor studies to be completed student has to be registered for 
# both bachelor_semestre_1 and bachelor_semestre_6. Since during those six semesters there could be multiple different 
# situations as gap year, failed semeter, exchange semeter etc. we simplyfy our problem and assume that
# number of semeters spent @ EPFL is equal to (year of graduiation - year of bachelor start) * 2.
# Obviously in real life scenario this assumption is invalid but from this dataset there is really no possibility
# to tell what was the actual amount of semesters required for graduation. (Even getting to 6th semester doesn't imply
# that student succefully graduated!) Moreover it seams strange that student is required to retake whole year 
# if he fails only one semester (from data it seems that failing on 5th semester means you cannot attempt 6th 
# and have to wait one semester to retake 5th) - but thats what we assumed. 
#
# Thus our dataset becomes significantly chopped down - from 1839 IC students who attempted either semester 1 OR 6
# to 397 IC students who managed to attempt semester 1 AND 6.

sem_1_df = year_start_order_df[year_start_order_df["semester"] == "bachelor_semestre_1"]
unique_sem_1_df = sem_1_df.drop_duplicates(subset=['nom_prénom', 'semester'], keep='first')

sem_6_df = year_start_order_df[year_start_order_df["semester"] == "bachelor_semestre_6"]
unique_sem_6_df = sem_6_df.drop_duplicates(subset=['nom_prénom', 'semester'], keep='last')

difference_df = pd.DataFrame(unique_sem_6_df["year_end"]-unique_sem_1_df["year_start"], columns=['year_count'])
difference_df.dropna(inplace=True)
difference_df["semester_total"] = difference_df["year_count"]*2
difference_df = difference_df.drop('year_count', 1)
difference_df

Unnamed: 0_level_0,semester_total
no_sciper,Unnamed: 1_level_1
147008,6.0
169569,6.0
169731,8.0
169795,8.0
171195,6.0
171619,6.0
174905,10.0
175190,6.0
175280,8.0
175379,8.0


In [88]:
semesters_df = year_start_order_df
semesters_df["semester_total"] = difference_df["semester_total"]
semesters_df = semesters_df[pd.notnull(semesters_df['semester_total'])]
semesters_df

Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year_start,year_end,semester_total
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
169569,Monsieur,,,,Arévalo Christian,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,6.0
178682,Monsieur,,,,Zoller Roman,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,6.0
180854,Monsieur,,,,Vautherin Jonas,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,6.0
175280,Monsieur,,,,Uberti Quentin,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,8.0
180241,Monsieur,,,,Sondag Pierre-Antoine,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,6.0
178684,Monsieur,,,,Schwery Thomas,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,6.0
169795,Monsieur,,,,Scheiben Pascal,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,8.0
178948,Monsieur,,,,Schädeli Andreas,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,6.0
171195,Monsieur,,,,Richter Arnaud,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,6.0
180959,Monsieur,,,,Restani Stéphane,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,6.0


In [89]:
semesters_df.loc["174905"]

Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year_start,year_end,semester_total
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
174905,Monsieur,,,,Aubelle Flavien,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,10.0
174905,Monsieur,,,,Aubelle Flavien,informatique,bachelor_semestre_6,,Présent,,(52_ét.),2010,2011,10.0
174905,Monsieur,,,,Aubelle Flavien,informatique,bachelor_semestre_6,,Présent,,(52_ét.),2011,2012,10.0


In [90]:
# TO DO NEXT:
# - Partition the data between male and female students, and compute the average -- is the difference in average statistically significant?
# - Statistical test - two-sided hypothesis testing