# 03 - Interactive Viz

In [3]:
import folium
import pandas as pd
import numpy as np
import requests
import json

The goal is to build a Cloropleth showing how much grant money goes to each Swiss canton.

To do so, we start by defining a function which will rely on the GeoNames API to retrieve information about each University, namely the canton, the latitude and the longitude. Besides restricting the results to Switzerland, we also try to restrict the search to Universities ('fcodeName'). We keep looking for a result that has a canton ('adminCode1') associated.

In [4]:
def getData(place):
    base_url = 'http://api.geonames.org/searchJSON'
    payload = {'username': 'adaisp',
               'country' : 'CH',
               'q' : place,
               'fcodeName' : 'university',
                }
    response = requests.get(base_url, params=payload)
    data = json.loads(response.text)
    try:
        i = 0
        results = {
                'canton': data['geonames'][i]['adminCode1'],
                'lat' : data['geonames'][i]['lat'],
                'long' : data['geonames'][i]['lng'],
            }
        while (data['geonames'][i]['adminName1'] == ''):
            i += 1
            results = {
                'canton': data['geonames'][i]['adminCode1'],
                'lat' : data['geonames'][i]['lat'],
                'long' : data['geonames'][i]['lng'],
            }
    except:
        results = {
            'canton': 'No Match',
            'lat' : 'No Match',
            'long' : 'No Match',
        }
        
    return results

We then build a DataFrame keeping the information about the University and the Approved Ammount of each grant.

In [5]:
grant_export = 'P3_GrantExport.csv'
raw_data = pd.read_csv(grant_export, sep=';', error_bad_lines=False)

uni_data = raw_data.loc[:, ['University', 'Approved Amount']]
uni_data[['Approved Amount']] = uni_data[['Approved Amount']].apply(pd.to_numeric, errors='coerce')

By reading the documentation, we see that the 'University' field is only filled if the research is carried out at a Swiss institution. Therefore, we can do some filtering and discard the records which have that field blank ('Nicht zuteilbar - NA').
We keep filtering the data and we also discard records corresponding to companies and institutions from the private sector, *NPOs (Non-Profit Organisation)* and records falling into the category of '*other hospitals*' (too generic).

In [6]:
# drop the records where the field University is blank
uni_data = uni_data[uni_data.University != 'Nicht zuteilbar - NA']

# drop the records corresponding to companies or private sector
uni_data = uni_data[uni_data.University != 'Firmen/Privatwirtschaft - FP']

# drop the records corresponding to NPO
uni_data = uni_data[uni_data.University != 'NPO (Biblioth., Museen, Verwalt.) - NPO']

# drop the entry 'Weitere Spitäler' that means 'Other Hospitals' and so should be everywhere in the country
uni_data = uni_data[uni_data.University != 'Weitere Spitäler - ASPIT']

We can now group the results by University and get the grant money given to each one of them.

In [7]:
# compute the approved amount by University
uni_data = uni_data.groupby(['University'])['Approved Amount'].sum()

uni_data.sort_values(ascending=False, inplace=True)

Let's now retrieve the canton information (using the function previously described). We add four columns to the DataFrame: number of records for each University, canton code, latitude and longitude. These last three columns are filled with the retrieved information from the GeoNames API.

Note that we split the string corresponding to the institution name. We first try to get a matching with the full name and in case the search isn't successful we try with the acronym.

In [11]:
data = pd.DataFrame(uni_data)
data['Records number'] = raw_data['University'].value_counts()
data['Canton'], data['Latitude'], data['Longitude'] = (['']*len(data),)*3

for univ,row in data.iterrows():
    info = getData(univ.split(' - ')[0])
    
    if (info.get('canton') == 'No Match' and len(univ.split(' - ')) > 1):
        info = getData(univ.split(' - ')[1])
    
    data.set_value(univ, 'Canton', info.get('canton'))
    data.set_value(univ, 'Latitude', info.get('lat'))
    data.set_value(univ, 'Longitude', info.get('long'))

Since we want to cover 95% of the records and the GeoNames API doesn't provide information for most of the Universities, we try to look for patterns in the University/Institution name and fill those records manually.

In [12]:
keywords = {
    "Luzern": "LU",
    "Zürcher": "ZH",
    "Zürich": "ZH",
    "Svizzera italiana": "TI",
    "Bern": "BE",
    "St. Gallen": "SG",
    "Vaud": "VD",
    "Wallis": "VS",
    "Thurgau": "TG",
}

for kw in keywords:
    for univ, row in data[data.index.str.contains(kw)].iterrows():
        data.set_value(univ, 'Canton', keywords.get(kw))

After manually filling the DataFrame, we actually have more than 95% of the records covered, so we stop gathering data here.

In [13]:
canton_found = data[data.Canton != 'No Match']
nbr_records = data['Records number'].sum()
nbr_records_found = canton_found['Records number'].sum()
print('We mapped ' + str(round(100 * nbr_records_found / nbr_records)) + "% of the University records.")

We mapped 97% of the University records.


Let's take a look at the first entries of our DataFrame:

In [14]:
data.head(10)

Unnamed: 0_level_0,Approved Amount,Records number,Canton,Latitude,Longitude
University,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Université de Genève - GE,1838237000.0,6394,GE,46.20222,6.14569
Universität Zürich - ZH,1826843000.0,6774,ZH,47.37092,8.53434
ETH Zürich - ETHZ,1635597000.0,6153,ZH,47.3763,8.54805
Universität Bern - BE,1519373000.0,5473,BE,46.95096,7.43548
Universität Basel - BS,1352251000.0,4746,BS,47.55832,7.58403
Université de Lausanne - LA,1183291000.0,4092,VD,46.52376,6.58409
EPF Lausanne - EPFL,1175316000.0,4428,VD,46.51939,6.56673
Université de Fribourg - FR,457526200.0,2079,FR,46.80683,7.15317
Université de Neuchâtel - NE,383204600.0,1596,NE,46.99385,6.93789
Paul Scherrer Institut - PSI,115269000.0,538,AG,47.5385,8.23028


In order for the map to work as expected, we must have an entry for each canton found in the TopoJSON file. So we group the information from the previous DataFrame by canton (to get the total grant money) and then create a new DataFrame, filling with 0 the cantons which don't have any Universities mapped.

We use a **logarithmic scale** for the approved grant money which goes to each canton to make the visualization more readable later on.

In [15]:
all_cantons = np.array(['AG','AI','AR','BE','BL','BS','FR','GE','GL','GR','LU',
                        'NE','NW','OW','SG','SO','SZ','TG','TI','UR','VD','VS','ZG','ZH','SH','JU'])
amounts = canton_found.groupby(['Canton'])['Approved Amount'].sum()

cantons_amount = []
for canton in all_cantons:
    if amounts.get(canton):
        cantons_amount.append(np.log10(amounts.get(canton)))
    else:
        cantons_amount.append(0)

cantons_amount_df = pd.DataFrame()
cantons_amount_df['Canton'] = all_cantons
cantons_amount_df['Amount'] = cantons_amount

To place markers on the map, we also create a DataFrame containing the universities / institutions whose coordinates could be retrieved with the GeoNames API. We can see that only 20 places were actually found by the API.

In [16]:
univ_found_api = canton_found[canton_found.Latitude != 'No Match']
print('The Geonames API have only found the location of ' + str(len(univ_found_api)) + ' universities.')

The Geonames API have only found the location of 20 universities.


We can now build the Choropleth map. We plot the markers, add the TopoJSON overlay and use the geographical information from the DataFrame to do so.

In [17]:
cantons_geo = 'ch-cantons.topojson.json'

swiss_map = folium.Map(location=[46.801111,8.226667], zoom_start=8)

for univ, row in univ_found_api.iterrows():
     folium.Marker([row['Latitude'], row['Longitude']],
                   popup=univ, 
                   icon = folium.Icon(color = 'red', icon = 'glyphicon-book')).add_to(swiss_map)

swiss_map.choropleth(geo_path=cantons_geo, 
                     topojson='objects.cantons', 
                     data=cantons_amount_df,columns=['Canton', 'Amount'],
                     key_on='feature.id',
                     threshold_scale=[4, 5, 6, 7, 8, 9],
                     line_opacity=0.2, fill_opacity=0.7, fill_color='YlOrBr',
                     legend_name='Approved amount (logarithmic scale)',
                    )

swiss_map.save("grant_money_map.html")

***NB:*** we use a **logarithmic scale** in the legend of the map to make the visualization more readable.

In [18]:
from IPython.display import IFrame
IFrame('grant_money_map.html', width=900, height=650)

## Bonus

We define the cantons which belongs to the french part of switzerland as the following cantons: berne,fribourg, genève, vaud ,neuchatel , valais ,jura.
The rest belongs to the german part :D

In [20]:
german_part = np.array(['AG','AI','AR','BL','BS','GL','LU','NW','OW','SG','SO','SZ','TG','UR','ZG','ZH','SH'])
french_part = np.array(['GE', 'JU', 'NE', 'VD'])
italian_part = np.array(['TS'])
french_german = np.array(['BE', 'FR', 'VS'])
italian_german = np.array(['GR'])

boundaries = {'GR' : ['lat', 46.49],
              'BE': ['lat', 47.10],
              'FR': ['lng', 7.14],
              'VS': ['lng', 7.57]
             }

swiss_french_population = sum([105378, 213636, 481868, 72597, 178059, 242463, 767294])
swiss_italian_population = sum([335720, 14307])
swiss_german_population = 8.341e6 - swiss_french_population - swiss_german_population

In [24]:
total_amount = univ_found_api['Approved Amount'].sum()
french_amount = univ_found_api[univ_found_api['Canton'].isin(french_part)]['Approved Amount'].sum()
german_amount = univ_found_api[univ_found_api['Canton'].isin(german_part)]['Approved Amount'].sum()
italian_amount = univ_found_api[univ_found_api['Canton'].isin(italian_part)]['Approved Amount'].sum()

for ct in boundaries:
    if ct in french_german:
        if boundaries.get(ct)[0] == 'lat':
            french_amount += univ_found_api[univ_found_api.Canton == ct and univ_found_api.Latitude >= boundaries.get(ct)[1]]['Approved Amount'].sum()
            german_amount += univ_found_api[univ_found_api['Canton'] == ct and univ_found_api['Latitude'] < boundaries.get(ct)[1]]['Approved Amount'].sum()
        else:
            french_amount += univ_found_api[univ_found_api['Canton'] == ct and univ_found_api['Longitude'] <= boundaries.get(ct)[1]]['Approved Amount'].sum()
            german_amount += univ_found_api[univ_found_api['Canton'] == ct and univ_found_api['Latitude'] > boundaries.get(ct)[1]]['Approved Amount'].sum()
    else:
        german_amount += univ_found_api[univ_found_api['Canton'] == ct and univ_found_api['Latitude'] >= boundaries.get(ct)[1]]['Approved Amount'].sum()
        italian_amount += univ_found_api[univ_found_api['Canton'] == ct and univ_found_api['Latitude'] < boundaries.get(ct)[1]]['Approved Amount'].sum()
    

#found_french_part_perinh = univ_found_api[univ_found_api['Canton'].isin(french_part)]['Approved Amount'].sum()/swiss_french_population
#found_german_part_perinh = (found_total -found_french_part)/swiss_german_population

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
print(found_french_part_perinh,found_german_part_perinh)

There is much more money in the french part. It is hard to compute a test because each canton has 