# Interactive Viz

In [229]:
import googlemaps
import folium
import pandas as pd
import requests
import os.path
import json
import html5lib
from bs4 import BeautifulSoup
import re
import numpy as np

First things first: let's import the data.

In [252]:
# Load the P3 grant export. The file starts with a UTF-8 byte-order mark;
# decoding with 'utf-8-sig' strips it so it does not end up inside the name
# of the first column (the project number), which we use as the index.
data = pd.read_csv('Data/P3_GrantExport.csv', sep=';', encoding='utf-8-sig', index_col=0)
print(data.shape)
# A short preview is enough to see the structure of the table.
data.head(10)

(63969, 14)


Unnamed: 0_level_0,Project Title,Project Title English,Responsible Applicant,Funding Instrument,Funding Instrument Hierarchy,Institution,University,Discipline Number,Discipline Name,Discipline Name Hierarchy,Start Date,End Date,Approved Amount,Keywords
"﻿""Project Number""",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,Schlussband (Bd. VI) der Jacob Burckhardt-Biog...,,Kaegi Werner,Project funding (Div. I-III),Project funding,,Nicht zuteilbar - NA,10302,Swiss history,Human and Social Sciences;Theology & religious...,01.10.1975,30.09.1976,11619.00,
4,Batterie de tests à l'usage des enseignants po...,,Massarenti Léonard,Project funding (Div. I-III),Project funding,Faculté de Psychologie et des Sciences de l'Ed...,Université de Genève - GE,10104,Educational science and Pedagogy,"Human and Social Sciences;Psychology, educatio...",01.10.1975,30.09.1976,41022.00,
5,"Kritische Erstausgabe der ""Evidentiae contra D...",,Kommission für das Corpus philosophorum medii ...,Project funding (Div. I-III),Project funding,Kommission für das Corpus philosophorum medii ...,"NPO (Biblioth., Museen, Verwalt.) - NPO",10101,Philosophy,Human and Social Sciences;Linguistics and lite...,01.03.1976,28.02.1985,79732.00,
6,Katalog der datierten Handschriften in der Sch...,,Burckhardt Max,Project funding (Div. I-III),Project funding,Abt. Handschriften und Alte Drucke Bibliothek ...,Universität Basel - BS,10302,Swiss history,Human and Social Sciences;Theology & religious...,01.10.1975,30.09.1976,52627.00,
7,Wissenschaftliche Mitarbeit am Thesaurus Lingu...,,Schweiz. Thesauruskommission,Project funding (Div. I-III),Project funding,Schweiz. Thesauruskommission,"NPO (Biblioth., Museen, Verwalt.) - NPO",10303,Ancient history and Classical studies,Human and Social Sciences;Theology & religious...,01.01.1976,30.04.1978,120042.00,
8,Die schweizerische Wirtschaftspolitik seit dem...,,Kleinewefers Henner,Project funding (Div. I-III),Project funding,"Séminaire de politique économique, d'économie ...",Université de Fribourg - FR,10203,Economics,"Human and Social Sciences;Economics, law",01.01.1976,31.12.1978,53009.00,
9,Theologische Forschungen zur Oekumene (Studien...,,Stirnimann Heinrich,Project funding (Div. I-III),Project funding,Institut für ökumenische Studien Université de...,Université de Fribourg - FR,10102,"Religious sciences, Theology",Human and Social Sciences;Theology & religious...,01.01.1976,31.12.1976,25403.00,
10,Konfuzianische Kulturwerte in der sozialen Ent...,,Deuchler Martina,Project funding (Div. I-III),Project funding,Ostasiatisches Seminar Universität Zürich,Universität Zürich - ZH,10301,History in general,Human and Social Sciences;Theology & religious...,01.10.1975,31.03.1977,47100.00,
11,Edizione degli scritti di Aurelio de' Giorgi B...,,Stäuble Antonio,Project funding (Div. I-III),Project funding,,Université de Lausanne - LA,10502,Romance languages and literature,Human and Social Sciences;Linguistics and lite...,01.10.1975,31.03.1977,25814.00,
13,La construction de nouveautés au sein des morp...,,Piaget Jean,Project funding (Div. I-III),Project funding,Laboratoire de Didactique et Epistémologie des...,Université de Genève - GE,10105,Psychology,"Human and Social Sciences;Psychology, educatio...",01.10.1975,30.09.1978,360000.00,


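Then we keep only the columns we need, namely the university and the approved amount.
We also remove the rows where no university is specified, as well as the rows whose "university" is not an actual university (not assignable, public institutions and private companies).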

In [253]:
# Values of the 'University' column that do not designate an actual university.
NON_UNIVERSITIES = [
    'Nicht zuteilbar - NA',                     # NA university
    'NPO (Biblioth., Museen, Verwalt.) - NPO',  # public institution
    'Firmen/Privatwirtschaft - FP',             # private society
]

# Keep only the two columns we need and drop rows without a university.
data = data[['University', 'Approved Amount']].dropna(subset=['University'])
# Boolean-mask selection replaces the removed `.ix` indexer; the explicit copy
# makes `data` an independent frame so later column assignments are safe.
data = data.loc[~data['University'].isin(NON_UNIVERSITIES)].copy()
print(data.shape)
data.head()

(46428, 2)


Unnamed: 0_level_0,University,Approved Amount
"﻿""Project Number""",Unnamed: 1_level_1,Unnamed: 2_level_1
4,Université de Genève - GE,41022.0
6,Universität Basel - BS,52627.0
8,Université de Fribourg - FR,53009.0
9,Université de Fribourg - FR,25403.0
10,Universität Zürich - ZH,47100.0


A small adjustment: some amounts are not provided (the value is the text "data not included in P3"), so we drop those rows and convert the column to numbers.

In [254]:
# Drop the rows whose amount is the placeholder text instead of a number.
# `.loc` replaces the removed `.ix` indexer; `.copy()` avoids assigning into a
# slice of the previous frame.
data = data.loc[data['Approved Amount'] != 'data not included in P3'].copy()

# Convert the whole column at once rather than element by element.
data['Approved Amount'] = pd.to_numeric(data['Approved Amount'])
data['Approved Amount'].dtype

dtype('float64')

The next step is to map each university to its canton. We first tried GeoNames and then Google, but neither is reliable enough to return a canton for every university. So whenever GeoNames doesn't find a university, we fall back to P3. In more detail: we take the first project linked to the university, find the institute attached to it, fetch that institute's details page on P3, and read the postal code from its address. We then ask GeoNames for the canton matching that postal code.

In [160]:
# GeoNames account used for the web-service calls. It can be overridden with
# the GEONAMES_USERNAME environment variable so the account name does not have
# to be edited in the notebook; the default keeps the previous behaviour.
GEONAMES_USERNAME = os.environ.get('GEONAMES_USERNAME', 'luc4s')

# GeoNames endpoints, restricted to Switzerland and to the single best match.
GEONAMES_NPA_URL = 'http://api.geonames.org/postalCodeSearchJSON?country=CH&maxRows=1&username=' + GEONAMES_USERNAME
BASE_URL_SEARCH = 'http://api.geonames.org/searchJSON?country=CH&maxRows=1&username=' + GEONAMES_USERNAME

# P3 pages: a project or institute page is the prefix followed by its id.
P3_BASE_URL = 'http://p3.snf.ch'
P3_PROJECT = P3_BASE_URL + '/project-'
P3_INSTITUTE = P3_BASE_URL + '/institute-'

# Cache file for the university -> canton mapping.
UNIVERSITIES_CANTONS = 'Data/universities_cantons.xls'

def get_canton_from_p3(idx):
    """Resolve a canton from the institute attached to a P3 project.

    Fetches the P3 page of project ``idx``, follows the link to its institute,
    extracts the first ``CH-NNNN`` postal code found in the institute's address
    table and asks GeoNames which canton that postal code belongs to.

    Parameters
    ----------
    idx : int or str
        P3 project number; appended to ``P3_PROJECT`` to build the page URL.

    Returns
    -------
    str or None
        The ``adminCode1`` value returned by GeoNames for the postal code, or
        ``None`` if any step of the lookup yields nothing (no institute link,
        no address table, no postal code, no GeoNames match).
    """
    r = requests.get(P3_PROJECT + str(idx))
    soup = BeautifulSoup(r.text, 'html.parser')
    # Element id found manually on the project page (spelling kept as-is).
    institute = soup.find(id="ctl00_MainContent_urlInstitite")

    if institute is None:
        return None

    # Some projects have the element but no link; read the attribute itself
    # instead of searching the markup text, so a missing attribute cannot
    # raise a KeyError on the subscript.
    href = institute.get('href')
    if not href:
        return None

    r = requests.post(P3_BASE_URL + href)
    soup = BeautifulSoup(r.text, "html5lib")
    institute_blocks = soup.find_all('div', attrs={'class': 'institute'})
    if not institute_blocks or institute_blocks[0].table is None:
        return None  # page without the expected address table

    address_table = institute_blocks[0].table
    address_soup = address_table.tbody or address_table

    # Raw string: the pattern no longer contains an invalid "\-" escape.
    match = re.search(r"CH-([0-9]{4})", address_soup.prettify())
    if match is None:
        return None  # shouldn't happen, but you never know

    postal_code = int(match.group(1))

    # Now that we have the postal code, GeoNames gives us the canton.
    payload = {'postalcode': postal_code}
    r = requests.get(GEONAMES_NPA_URL, params=payload)
    res = r.json()
    # A payload without a 'postalCodes' list is treated like "no match".
    postal_codes = res.get('postalCodes') or []
    if not postal_codes:
        return None  # just in case

    return postal_codes[0].get('adminCode1')

def get_canton(idx, university):
    """Return the canton of a university, trying GeoNames first, then P3.

    This is the entry point of the lookup. The ``University`` values have the
    form ``"<name> - <abbreviation>"``; only the name part is sent to the
    GeoNames search. (Previously the whole list produced by ``split`` was
    passed as the query, which made requests send two ``q`` parameters.)

    Parameters
    ----------
    idx : int or str
        P3 project number of a project linked to the university; used for the
        fallback lookup through ``get_canton_from_p3``.
    university : str
        Value of the ``University`` column.

    Returns
    -------
    str or None
        The ``adminCode1`` of the first GeoNames result, otherwise whatever
        ``get_canton_from_p3(idx)`` returns (possibly ``None``).
    """
    name = university.split(" - ")[0]
    r = requests.get(BASE_URL_SEARCH, params={'q': name})
    response = r.json()

    # Payloads without the expected keys are treated like an empty result,
    # so we fall through to the P3 lookup instead of raising.
    results = response.get('geonames') or []
    if response.get('totalResultsCount', 0) > 0 and results:
        canton = results[0].get('adminCode1')
        if canton:
            return canton

    return get_canton_from_p3(idx)
    
def get_uni_cantons(update=False):
    """Return the university -> canton table, computing it only when needed.

    The mapping requires one or more web requests per university, so the
    result is cached in ``UNIVERSITIES_CANTONS`` and reloaded from there on
    later runs.

    Parameters
    ----------
    update : bool, default False
        Recompute the mapping even if the cache file already exists.

    Returns
    -------
    pandas.DataFrame
        Columns ``University`` and ``Canton``, indexed 0..n-1. The freshly
        computed frame and the cached one now share that index; previously the
        fresh frame was still indexed by project number while the cache file
        was written with a reset index.
    """
    if update or not os.path.isfile(UNIVERSITIES_CANTONS):
        # One row per distinct university; the index of each row is the number
        # of the first project of that university, which get_canton uses for
        # its P3 fallback.
        universities = data['University'].drop_duplicates()
        cantons = [get_canton(project_id, name)
                   for project_id, name in universities.items()]

        uni_cantons = pd.DataFrame({'University': universities.values,
                                    'Canton': cantons},
                                   columns=['University', 'Canton'])
        uni_cantons.to_excel(UNIVERSITIES_CANTONS)
        return uni_cantons

    # The first column of the cache file is the saved 0..n-1 index.
    return pd.read_excel(UNIVERSITIES_CANTONS, index_col=0)

# Load the university -> canton table from the cache file if it exists,
# otherwise compute it (update defaults to False).
uni_cantons = get_uni_cantons()


Now let's see which universities are still missing a canton.

In [161]:
# Rows for which no canton could be resolved.
canton_missing = uni_cantons['Canton'].isnull()
uni_cantons[canton_missing]

Unnamed: 0,University,Canton
41,Interkant. Hochschule für Heilpädagogik ZH - HfH,
49,Haute école pédagogique du canton de Vaud - HEPL,
56,Istituto Svizzero di Roma - ISR,


We only missed 3 of them; let's complete them by hand.

In [162]:
# Cantons filled in by hand, keyed by the row label shown in the table of
# missing values.
MANUAL_CANTONS = {
    41: 'ZH',  # Interkant. Hochschule für Heilpädagogik ZH - HfH
    49: 'VD',  # Haute école pédagogique du canton de Vaud - HEPL
    56: 'TI',  # Istituto Svizzero di Roma - ISR
}

# A single .loc[row, column] assignment writes into the frame itself; the
# chained form `frame['Canton'].loc[row] = ...` may write to a temporary copy.
for row_label, canton in MANUAL_CANTONS.items():
    uni_cantons.loc[row_label, 'Canton'] = canton

# Should now be empty.
uni_cantons[uni_cantons.Canton.isnull()]

Unnamed: 0,University,Canton


Great, with this done we can start the actual work. First, a small adjustment: we turn the table into a university → canton lookup.

In [163]:
# Each university must appear once, otherwise the conversion to a dict would
# silently keep only one of the duplicated entries.
assert uni_cantons['University'].is_unique, 'duplicate university names in the canton table'

# From here on `uni_cantons` is a plain {university: canton} dict.
uni_cantons = uni_cantons.set_index('University')['Canton'].to_dict()

Let's begin by mapping the university field of the grants DataFrame to its canton.

In [256]:
# Attach the canton of each grant's university and renumber the rows.
# `assign` builds a new frame instead of writing into a possibly sliced one.
data = data.assign(Canton=data['University'].map(uni_cantons)).reset_index(drop=True)

# Only the amount and the canton are needed from here on. `axis` is passed by
# keyword: the positional form `drop(label, 1)` is no longer accepted by
# recent pandas versions.
grants = data.drop('University', axis=1)
grants.head()

Unnamed: 0,Approved Amount,Canton
0,41022.0,GE
1,52627.0,BS
2,53009.0,FR
3,25403.0,FR
4,47100.0,ZH


Marvelous, now we can do the real work: summing all the grants per canton.

In [285]:
# Total approved amount per canton, as a frame and as a {canton: total} dict.
grants_sum = grants.groupby('Canton').sum()
grant_per_canton = grants_sum['Approved Amount'].to_dict()

# The preview has to be the last expression of the cell to be rendered.
grants_sum.head()

These per-canton totals let us fill our map:

In [301]:
# TopoJSON file with the canton outlines.
ch_geo = r'ch-cantons.topojson.json'

# Number of edges of the colour scale (the original scale had 7 entries, one
# of them out of order).
N_SCALE_EDGES = 6

# One row per canton, so that cantons without any grant still get a value.
p3data = pd.DataFrame({'Canton': ['JU', 'ZH', 'BE', 'LU', 'SG', 'NE', 'VD', 'VS', 'GE', 'TI',
                                  'SO', 'UR', 'OW', 'NW', 'SZ', 'GL', 'ZG', 'BL', 'BS', 'FR',
                                  'SH', 'AR', 'AI', 'GR', 'AG', 'TG']})

# Cantons absent from `grant_per_canton` received no grant: give them 0
# instead of NaN (the earlier `Grants = 0` default was overwritten by `map`).
p3data['Grants'] = p3data['Canton'].map(grant_per_canton).fillna(0)

# The scale must be increasing and cover the data. The previous hard-coded
# list ended with 500 after 2000000 and topped out far below the summed
# amounts, so it is now derived from the data: evenly spaced edges from 0 to
# the largest per-canton total.
threshold_scale = np.linspace(0, p3data['Grants'].max(), N_SCALE_EDGES).tolist()

# Named `swiss_map` so the builtin `map` is not shadowed.
swiss_map = folium.Map(location=[46.82244, 8.22410], zoom_start=8)
swiss_map.choropleth(data=p3data,
                     columns=['Canton', 'Grants'],
                     geo_path=ch_geo,
                     topojson='objects.cantons',
                     threshold_scale=threshold_scale,
                     fill_color='YlOrRd', fill_opacity=0.7, line_opacity=0.2,
                     legend_name='Total approved grants per canton',
                     key_on='feature.id')

swiss_map.save('ch_map.html')
swiss_map