# Fetching the data from the IS Academia API

We start by getting the HTML response of the tabular student data from ISAcademia.
For this, we use the [Requests](http://docs.python-requests.org/en/master/) library.


In [10]:
# We are going to use requests to do the HTTP-calls for gathering data, and BeautifulSoup for parsing the 
# HTML that we recieve
import requests
from bs4 import BeautifulSoup

# re will help us parse the html by using regular expressions
import re

# Furthermore, we will use the normal stack of pandas, numpy, matplotlib and seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as pls
import seaborn as sb

# Statistical test library
import scipy.stats as stats

## Making the reqest

*Warning*: we are loading a lot of data, thus the loading takes quite a long time. Therefore, don't run this unless it's needed

To not spam the API too much, we collect all the data in one try, and filter it afterwords.

We use the following parameters:

## TODO: Update this
~~~~~~~~~~~~~~~~
- ww_x_GPS:-1
- ww_i_reportModel:133685247
- ww_i_reportModelXsl:133685270
- ww_x_UNITE_ACAD:249847
- ww_x_PERIODE_ACAD:null
- ww_x_PERIODE_PEDAGO:null
- ww_x_HIVERETE:null


Which leads to the folloring request:
http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=null&ww_x_PERIODE_PEDAGO=null&ww_x_HIVERETE=null
~~~~~~~~~~~~~~~~

In [6]:
DEBUG = True

# TODO: make the request by using parameters to the function call, instead of coding it in the URI.
# TODO: verify that the uri is correct, and that we get all the data that we want

if DEBUG:
    # For testing and development we use the test_uri, which only loads data from 2016-2017
    uri = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=355925344&ww_x_PERIODE_PEDAGO=null&ww_x_HIVERETE=null"
else:
    # For 'production', collect all the data available from ISAcademia, for students at the IC-section
    uri = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=null&ww_x_PERIODE_PEDAGO=null&ww_x_HIVERETE=null"

req = requests.get(uri)

## Parsing the result

In [7]:
# Defining some helper functions, for clarity
def clean(string):
    return string.strip().lower().replace(' ', '_')

def is_semester_info(data):
    return len(data) <= 2

def is_header(data):
    return not ((len(data) > 2) and data[-2].isdigit())

def parse_table(table):
    students = []
    header = ''
    semester = ''

    for tr in table:
        row_data = []
        for td in tr:
            value = td.get_text().strip().replace('\xa0', ' ')
            row_data.append(value)

                     
        if is_semester_info(row_data):
            info = [clean(value) for value in row_data[0].split(', ')]
            section = info[0]
            year = info[1]
            semester, wat = info[2].split('\n_')
        elif(is_header(row_data)):
            header = [ clean(val) for val  in  row_data] 
        else:
            person = {'year': year, 'semester': semester, 'section': section, 'wat': wat}
            for i, key in enumerate(header):
                val = row_data[i].strip()
                if val: 
                    person[key] = val
                    
            students.append(person)
    
    return students

In [48]:
soup = BeautifulSoup(req.text, 'html.parser')
students_table = soup.find('table')

students = parse_table(students_table)

df = pd.DataFrame(students)
df.set_index(['no_sciper'], inplace=True)

original = df.copy()

In [94]:
#this here is for debug only if I mess up df somewhere down below in the code
df = original
#Lets list some basic info about parsed data
print(df.shape)
print(df.dtypes)
df.head()

(20102, 14)
civilité          object
ecole_echange     object
filière_opt.      object
mineur            object
nom_prénom        object
section           object
semester          object
spécialisation    object
statut            object
type_echange      object
wat               object
year              object
year_start         int64
year_end           int64
dtype: object


Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year,year_start,year_end
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
170337,Monsieur,Universidad de Granada,,,Arco Arredondo Rafael,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005-2006,2005,2006
171585,Monsieur,Ecole Supérieure de Chimie Physique Electroniq...,,,Balas Marc,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005-2006,2005,2006
170200,Madame,Linköping University,,,Bergendal Anna,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005-2006,2005,2006
171880,Monsieur,Ecole Supérieure de Chimie Physique Electroniq...,,,Bret Jean-Sébastien,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005-2006,2005,2006
171505,Monsieur,Ecole Supérieure de Chimie Physique Electroniq...,,,Burret Nicolas,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005-2006,2005,2006


In [95]:
# Well first of all we noticed that parsing all data without specifing date 
# also resulted in data of students from years before 2007 and we don't want that.

# Lets split year column into year_start and year_end
df['year_start'], df['year_end'] = df['year'].str.split('-', 1).str
# Cast from object to int
df[['year_start','year_end']] = df[['year_start','year_end']].apply(pd.to_numeric)
# Drop year column 
new_df = df.drop("year", axis=1)

# Verify
print(new_df.dtypes)
new_df.head()


civilité          object
ecole_echange     object
filière_opt.      object
mineur            object
nom_prénom        object
section           object
semester          object
spécialisation    object
statut            object
type_echange      object
wat               object
year_start         int64
year_end           int64
dtype: object


Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year_start,year_end
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
170337,Monsieur,Universidad de Granada,,,Arco Arredondo Rafael,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005,2006
171585,Monsieur,Ecole Supérieure de Chimie Physique Electroniq...,,,Balas Marc,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005,2006
170200,Madame,Linköping University,,,Bergendal Anna,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005,2006
171880,Monsieur,Ecole Supérieure de Chimie Physique Electroniq...,,,Bret Jean-Sébastien,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005,2006
171505,Monsieur,Ecole Supérieure de Chimie Physique Electroniq...,,,Burret Nicolas,echange_in,semestre_automne,,Présent,Erasmus,(24_ét.),2005,2006


In [96]:
# Get bachelor students...
bachelor_df = new_df[new_df["semester"].str.contains("bachelor_semestre")]
print(bachelor_df.shape)
bachelor_df.head()

(7272, 13)


Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year_start,year_end
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
154168,Monsieur,,,,Aghamahdi Mohammad Hossein,informatique,bachelor_semestre_1,,Présent,,(107_ét.),2004,2005
160104,Monsieur,,,,Alves Sergio,informatique,bachelor_semestre_1,,Présent,,(107_ét.),2004,2005
154157,Madame,,,,Andriambololona Riana Miarantsoa,informatique,bachelor_semestre_1,,Présent,,(107_ét.),2004,2005
166876,Monsieur,,,,Aslan Unal,informatique,bachelor_semestre_1,,Présent,,(107_ét.),2004,2005
166258,Monsieur,,,,Balet Ken,informatique,bachelor_semestre_1,,Présent,,(107_ét.),2004,2005


In [97]:
# ...from year 2007 and above
bachelor_from_2007_df = bachelor_df[bachelor_df["year_start"] >= 2007]
print(bachelor_from_2007_df.shape)
bachelor_from_2007_df.head()

(5808, 13)


Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year_start,year_end
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
169569,Monsieur,,,,Arévalo Christian,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008
174905,Monsieur,,,,Aubelle Flavien,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008
173922,Monsieur,,,,Badoud Morgan,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008
179406,Monsieur,,,,Baeriswyl Jonathan,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008
179428,Monsieur,,,,Barroco Michael,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008


In [98]:
# count how many semesters each student completed
counts = pd.DataFrame(bachelor_from_2007_df.groupby(['nom_prénom']).size(), columns=['semester_count'])

# merge with previous dataframe (add semester_count column)
students_df = bachelor_from_2007_df.merge(counts, left_on=['nom_prénom'], right_index=True)

# filter out students who didnt complete 6 semeters
students_df = students_df[students_df["semester_count"] == 6]

print(students_df.shape)
students_df.head()

(1554, 14)


Unnamed: 0_level_0,civilité,ecole_echange,filière_opt.,mineur,nom_prénom,section,semester,spécialisation,statut,type_echange,wat,year_start,year_end,semester_count
no_sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
169569,Monsieur,,,,Arévalo Christian,informatique,bachelor_semestre_1,,Présent,,(90_ét.),2007,2008,6
169569,Monsieur,,,,Arévalo Christian,informatique,bachelor_semestre_2,,Présent,,(72_ét.),2007,2008,6
169569,Monsieur,,,,Arévalo Christian,informatique,bachelor_semestre_3,,Présent,,(53_ét.),2008,2009,6
169569,Monsieur,,,,Arévalo Christian,informatique,bachelor_semestre_4,,Présent,,(51_ét.),2008,2009,6
169569,Monsieur,,,,Arévalo Christian,informatique,bachelor_semestre_5,,Présent,,(75_ét.),2009,2010,6


In [99]:
by_name = students_df.groupby(['nom_prénom', 'semester'])
by_name.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,civilité,ecole_echange,filière_opt.,mineur,section,spécialisation,statut,type_echange,wat,year_start,year_end,semester_count
nom_prénom,semester,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Abate Bryan Jeremy,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(242_ét.),2014,2015,6
Abate Bryan Jeremy,bachelor_semestre_2,Monsieur,,,,informatique,,Présent,,(164_ét.),2014,2015,6
Abate Bryan Jeremy,bachelor_semestre_3,Monsieur,,,,informatique,,Présent,,(92_ét.),2015,2016,6
Abate Bryan Jeremy,bachelor_semestre_4,Monsieur,,,,informatique,,Présent,,(89_ét.),2015,2016,6
Abate Bryan Jeremy,bachelor_semestre_5,Monsieur,University of Bristol,6 - Visual computing,,informatique,,Congé,Erasmus,(120_ét.),2016,2017,6
Abate Bryan Jeremy,bachelor_semestre_6,Monsieur,University of Bristol,,,informatique,,Congé,Erasmus,(24_ét.),2016,2017,6
Alami-Idrissi Ali,bachelor_semestre_1,Monsieur,,,,informatique,,Présent,,(242_ét.),2014,2015,6
Alami-Idrissi Ali,bachelor_semestre_2,Monsieur,,,,informatique,,Présent,,(164_ét.),2014,2015,6
Alami-Idrissi Ali,bachelor_semestre_3,Monsieur,,,,informatique,,Présent,,(92_ét.),2015,2016,6
Alami-Idrissi Ali,bachelor_semestre_4,Monsieur,,,,informatique,,Présent,,(89_ét.),2015,2016,6


In [100]:
# TO DO NEXT:
# - Compute how many months it took each student to go from the first to the sixth semester.
# - Partition the data between male and female students, and compute the average -- is the difference in average statistically significant?
# - Statistical test?