# Interactive Visualization

In [1]:
# NOTE: folium must be installed before this notebook can run:
#     pip install folium
# (run the command above in your terminal).
import folium

#Probably gonna use this:

# Normal stack of pandas, numpy, matplotlib and seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# For handling the web requests
import requests
from collections import defaultdict
from pprint import *

# Statistical test library
import scipy.stats as stats
from helpers import *

%matplotlib inline

Folium: https://github.com/python-visualization/folium
 
 Documentation:
 1. https://folium.readthedocs.io/en/latest/
 2. https://media.readthedocs.org/pdf/folium/latest/folium.pdf

In [2]:
# Smoke test: check that folium renders a base map inside the notebook
map_osm = folium.Map(
    location=[46.8076878, 7.1004592],
    zoom_start=10,
)
map_osm

# Data wrangling

Data source (P3_GrantExport.csv)
and description of the columns: http://p3.snf.ch/Pages/DataAndDocumentation.aspx

In [3]:
# Load only the columns we need: Institution, University and Approved Amount.
columns = ['Institution', 'University', 'Approved Amount']
p3 = pd.read_csv(
    "P3_GrantExport.csv",
    delimiter=';',
    usecols=columns,
    # Both markers are read as missing values
    na_values=['data not included in P3', 'nan'],
)
p3.describe()



Unnamed: 0,Approved Amount
count,53059.0
mean,245487.7
std,318341.6
min,0.0
25%,
50%,
75%,
max,15487750.0


To be able to work with the data, we have to keep only the rows for which we have enough information.

We start by keeping the projects for which we know at least the university or the institution related to the project.

In [4]:
# A project is dropped when both the institution and the university are missing
lacks_both = p3[['Institution', 'University']].isnull().all(axis=1)
missing__info = p3.loc[lacks_both].index
has_info = p3.drop(missing__info, axis=0)
print(has_info.shape)
print('Dropped from original:', get_dropped_perc(p3, has_info))


(62481, 3)
Dropped from original: 2.3261267176288514


As we can see, this leaves out 2.33% of the original data.

In [5]:
# Keep the projects for which an approved amount is recorded
grant_info = has_info[has_info['Approved Amount'].notnull()]
grants_data = has_info.loc[grant_info.index]
grants_data.head()

Unnamed: 0,Institution,University,Approved Amount
0,,Nicht zuteilbar - NA,11619.0
1,Faculté de Psychologie et des Sciences de l'Ed...,Université de Genève - GE,41022.0
2,Kommission für das Corpus philosophorum medii ...,"NPO (Biblioth., Museen, Verwalt.) - NPO",79732.0
3,Abt. Handschriften und Alte Drucke Bibliothek ...,Universität Basel - BS,52627.0
4,Schweiz. Thesauruskommission,"NPO (Biblioth., Museen, Verwalt.) - NPO",120042.0


In [6]:
# Share of rows lost relative to the raw data and to the previous filtering step
print('Dropped from original: {}'.format(get_dropped_perc(p3, grants_data)))
print('Dropped from last step: {}'.format(get_dropped_perc(has_info, grants_data)))

Dropped from original: 18.340133502165113
Dropped from last step: 16.395384196795824


So, for about 16.4% of the data, we don't have any information about the grant given. 

# Get the geo info

From the description of the dataset, we have the following description:
> ### Institution
> According to the information submitted by the responsible applicant, this is the research institution where the project will largely be carried out. Typically, this is the institution or specific lab where the responsible applicant works. The institution is therefore linked to the application. This field is not structured, but its content is consolidated internally. There is no link in the database to the field University.

> ### University
> This is the institution where the project will largely be carried out according to the application. Pick list. This field is only filled if the research is carried out at a Swiss institution, otherwise the field remains blank. In the case of mobility fellowships, it is generally left empty.


So, we know that for the rows where the university is given, the grant was awarded to a Swiss university. However, we do not know much about the cases where we only have the institution information. We decided to look into these cases further.

In [7]:
# Projects for which the University field is filled in
has_uni = has_info[has_info['University'].notnull()]
has_uni.head()

Unnamed: 0,Institution,University,Approved Amount
0,,Nicht zuteilbar - NA,11619.0
1,Faculté de Psychologie et des Sciences de l'Ed...,Université de Genève - GE,41022.0
2,Kommission für das Corpus philosophorum medii ...,"NPO (Biblioth., Museen, Verwalt.) - NPO",79732.0
3,Abt. Handschriften und Alte Drucke Bibliothek ...,Universität Basel - BS,52627.0
4,Schweiz. Thesauruskommission,"NPO (Biblioth., Museen, Verwalt.) - NPO",120042.0


In [8]:
# NOTE(review): has_uni is filtered from has_info, not from grants_data, so the
# "last step" figure only compares the two frames and is not a strict subset drop.
print('Dropped from original: {}'.format(get_dropped_perc(p3, has_uni)))
print('Dropped from last step: {}'.format(get_dropped_perc(grants_data, has_uni)))

Dropped from original: 20.292641748346853
Dropped from last step: 2.3910255183107743


In [9]:
# Split "<university name> - <code>" into its two parts.
# The pattern is a raw string so that no backslash escapes are needed, and the
# separator " - " is matched literally: the lazy name group stops at the first
# " - ", and everything after it becomes the code.
has_code = has_uni['University'].str.extract(r'(?P<University>.*?) - (?P<Code>.*)', expand=True)
has_code.head()

Unnamed: 0,University,Code
0,Nicht zuteilbar,
1,Université de Genève,GE
2,"NPO (Biblioth., Museen, Verwalt.)",NPO
3,Universität Basel,BS
4,"NPO (Biblioth., Museen, Verwalt.)",NPO


In [10]:
# Drop the rows whose code is 'NPO' or 'NA' (not available and NGOs)
has_code = has_code[~has_code['Code'].isin(['NPO', 'NA'])]
has_code.head()

Unnamed: 0,University,Code
1,Université de Genève,GE
3,Universität Basel,BS
5,Université de Fribourg,FR
6,Université de Fribourg,FR
7,Universität Zürich,ZH


In [11]:
# These rows already carry a canton code (as judged by is_canton_code)
has_canton_code = has_code.loc[has_code['Code'].apply(is_canton_code), 'Code'].rename('Canton code')

has_canton_code.index

Int64Index([    1,     3,     5,     6,     7,     9,    10,    12,    13,
               14,
            ...
            63918, 63919, 63921, 63922, 63927, 63928, 63932, 63934, 63955,
            63958],
           dtype='int64', length=27702)

In [12]:
# The remaining codes are not canton codes; for those we will try to get the
# canton by requesting the GeoNames web service
has_uni_code = has_code.loc[has_code['Code'].apply(is_canton_code).eq(False), 'Code']

# Recover the full rows of those projects
has_uni_no_cantoncode = has_uni.loc[has_uni_code.index]
has_uni_no_cantoncode.head()

Unnamed: 0,Institution,University,Approved Amount
8,,Université de Lausanne - LA,25814.0
19,,Université de Lausanne - LA,14138.0
29,"Eidg. Forschungsanstalt für Wald, Schnee und L...","Eidg. Forschungsanstalt für Wald,Schnee,Land -...",445198.0
47,Chair of English Literature ETH-Zentrum,ETH Zürich - ETHZ,46200.0
51,Faculté des lettres Université de Lausanne,Université de Lausanne - LA,74617.0


In [13]:
# Query the lookup service once per distinct university label.
for place in has_uni_no_cantoncode['University'].unique():
    if not place:
        # Nothing to look up; skipping here also guarantees that `p` below
        # always belongs to the current `place` (never unbound or stale).
        continue
    p = geo_lookup(place)
    if p:
        print(place)
        pprint(p)

For qurery parmas:
'name=Universit%C3%A9+de+Lausanne+-+LA&country=ch&username=username&maxRows=1'
{'status': {'message': 'user account not enabled to use the free webservice. '
                       'Please enable it on your account page: '
                       'http://www.geonames.org/manageaccount ',
            'value': 10}}
For qurery parmas:
'name=Eidg.+Forschungsanstalt+f%C3%BCr+Wald%2CSchnee%2CLand+-+WSL&country=ch&username=username&maxRows=1'
{'status': {'message': 'user account not enabled to use the free webservice. '
                       'Please enable it on your account page: '
                       'http://www.geonames.org/manageaccount ',
            'value': 10}}
For qurery parmas:
'name=ETH+Z%C3%BCrich+-+ETHZ&country=ch&username=username&maxRows=1'
{'status': {'message': 'user account not enabled to use the free webservice. '
                       'Please enable it on your account page: '
                       'http://www.geonames.org/manageaccount ',
          

UnboundLocalError: local variable 'r' referenced before assignment

In [14]:
def map_institution_canton(df):
    """Look up every distinct institution name with ``geo_lookup`` and record the outcome.

    Missing names and names starting with 'Uni' are skipped. A truthy lookup
    result is stored in the global ``canton_dict`` under the institution name;
    otherwise the name is flagged in the global ``has_looked_up``.
    """
    for place in df['Institution'].unique():
        label = str(place)
        if label == 'nan' or label.startswith('Uni'):
            continue
        result = geo_lookup(place)
        if not result:
            # Remember the miss and show what was tried
            has_looked_up[place] = True
            print(place, result)
            continue
        print('found', result, 'in the api')
        canton_dict[place] = result


# Run the lookup over the institutions of every row that has location info.
# NOTE(review): the recorded output shows this call aborting with UnboundLocalError
# after the web service rejected the request — confirm the service account is enabled.
map_institution_canton(has_info)

For qurery parmas:
'name=Facult%C3%A9+de+Psychologie+et+des+Sciences+de+l%27Education+Universit%C3%A9+de+Gen%C3%A8ve&country=ch&username=username&maxRows=1'
{'status': {'message': 'user account not enabled to use the free webservice. '
                       'Please enable it on your account page: '
                       'http://www.geonames.org/manageaccount ',
            'value': 10}}
Faculté de Psychologie et des Sciences de l'Education Université de Genève None
For qurery parmas:
'name=Kommission+f%C3%BCr+das+Corpus+philosophorum+medii+aevi+der+SGG&country=ch&username=username&maxRows=1'


UnboundLocalError: local variable 'r' referenced before assignment

In [15]:
# Load the Swiss cities together with the canton each one is listed under
columns = ['Name', 'Kanton']
swiss_cities = pd.read_csv(
    "swiss_cities.csv",
    delimiter='\t',
    usecols=columns,
    na_values=['data not included in P3', 'nan'],
)
# Keep only the third whitespace-separated token of the raw 'Kanton' text.
# NOTE(review): multi-word cantons get truncated this way (the displayed rows
# show 'Gallen' for St. Gallen) — confirm this is intended.
swiss_cities['Kanton'] = swiss_cities['Kanton'].str.split().str.get(2)
swiss_cities.head(10)

Unnamed: 0,Name,Kanton
0,Zürich,Zürich
1,Genf,Genf
2,Basel,Basel-Stadt
3,Lausanne,Waadt
4,Bern,Bern
5,Winterthur,Zürich
6,Luzern,Luzern
7,St. Gallen,Gallen
8,Lugano,Tessin
9,Biel/Bienne,Bern


In [16]:
# Load the canton table (canton code and canton label)
columns = ['Code', 'Kanton']
swiss_cantons_full = pd.read_csv(
    "swiss_cantons_full.csv",
    delimiter='\t',
    usecols=columns,
    na_values=['data not included in P3', 'nan'],
)
# Same token extraction as for the city list, so both 'Kanton' columns can be joined
swiss_cantons_full['Kanton'] = swiss_cantons_full['Kanton'].str.split().str.get(2)
swiss_cantons_full.head(10)

Unnamed: 0,Code,Kanton
0,ZH,Zürich
1,BE,Bern
2,LU,Luzern
3,UR,Uri
4,SZ,Schwyz
5,OW,Obwalden
6,NW,Nidwalden
7,GL,Glarus
8,ZG,Zug
9,FR,Freiburg


In [17]:
# Attach the canton code to every city, then discard the label used for the join
swiss_cities_german = pd.merge(swiss_cities, swiss_cantons_full, on='Kanton').drop('Kanton', axis=1)
swiss_cities_german.columns = ['Name', 'Canton']
swiss_cities_german.head(20)

Unnamed: 0,Name,Canton
0,Zürich,ZH
1,Winterthur,ZH
2,Uster,ZH
3,Dübendorf,ZH
4,Dietikon,ZH
5,Wetzikon,ZH
6,Wädenswil,ZH
7,Horgen,ZH
8,Bülach,ZH
9,Adliswil,ZH


In [18]:
# # Clean Kantons up
# swiss_cities['Kanton'] = swiss_cities['Kanton'].apply(lambda x:get_canton_code(x.rsplit()[-1])) #str.rsplit(None, 1)[0]) # = swiss_cities['Kanton'].map(.str.rsplit(None, 1)[0]
# swiss_cities.head(20)

In [19]:
# Load the list of Swiss towns with their canton
columns = ['Town', 'Canton']
swiss_cities_english = pd.read_csv(
    "swiss_towns.csv",
    delimiter='\t',
    usecols=columns,
    na_values=['data not included in P3', 'nan'],
)
# Use the same column names as the other city lists
swiss_cities_english.columns = ['Name', 'Canton']
swiss_cities_english.head(10)

Unnamed: 0,Name,Canton
0,Aarau,AG
1,Aarberg,BE
2,Aarburg,AG
3,Adliswil,ZH
4,Aesch (BL),BL
5,Affoltern am Albis,ZH
6,Agno,TI
7,Aigle,VD
8,Allschwil,BL
9,Altdorf (UR),UR


In [20]:
# Load the French city list; each 'Ville' entry is split on ', ' into name and canton
columns = ['Ville']
swiss_cities_french = pd.read_csv(
    "swiss_cities_french.csv",
    delimiter='\t',
    usecols=columns,
    na_values=['french nan', 'nan'],
)['Ville'].str.split(', ', expand=True)
swiss_cities_french.columns = ['Name', 'Canton']
swiss_cities_french.head(10)

Unnamed: 0,Name,Canton
0,Zurich,ZH
1,Genève,GE
2,Bâle,BS
3,Lausanne,VD
4,Berne,BE
5,Winterthour,ZH
6,Lucerne,LU
7,Saint-Gall,SG
8,Lugano,TI
9,Bienne,BE


In [21]:
# One lookup table, indexed by city name, built from the three city lists.
# When a name occurs more than once, the first occurrence is kept.
swiss_cities_dict = (
    pd.concat([swiss_cities_english, swiss_cities_german, swiss_cities_french])
    .drop_duplicates('Name')
    .set_index('Name')
)
swiss_cities_dict.head(10)

Unnamed: 0_level_0,Canton
Name,Unnamed: 1_level_1
Aarau,AG
Aarberg,BE
Aarburg,AG
Adliswil,ZH
Aesch (BL),BL
Affoltern am Albis,ZH
Agno,TI
Aigle,VD
Allschwil,BL
Altdorf (UR),UR


In [22]:
# Test for particular town
#swiss_towns.loc[swiss_towns.Town == 'Basel']
#swiss_towns.loc['Basel']['Canton']
# Size of the frame before the canton column is added
has_info.shape

(62481, 3)

In [23]:
# Direct mapping of canton with Switzerland town list from wikipedia https://en.wikipedia.org/wiki/List_of_cities_in_Switzerland

# Start with an all-missing 'Canton' column
has_info['Canton'] = np.nan

def map_inst_canton(df, column, cities=None):
    """Fill ``df['Canton']`` by spotting known city names inside the text of ``column``.

    Every distinct non-missing value of ``column`` is split on whitespace and
    each word is looked up in the index of ``cities``. When several words match,
    the last match wins. The canton is then written to all rows holding that value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; modified in place.
    column : str
        Name of the text column to scan.
    cities : pandas.DataFrame, optional
        Lookup table indexed by city name with a 'Canton' column. Defaults to
        the notebook-level ``swiss_cities_dict``.
    """
    if cities is None:
        cities = swiss_cities_dict
    if 'Canton' not in df.columns:
        df['Canton'] = np.nan
    # Canton codes are strings: make sure the target column can hold them
    if df['Canton'].dtype != object:
        df['Canton'] = df['Canton'].astype(object)

    for place in df[column].dropna().unique():
        canton = None
        for word in place.split():
            if word in cities.index:
                canton = cities.loc[word, 'Canton']
                print(place, " -> ", canton)
        if canton is not None:
            # A single .loc assignment writes to df itself; chained indexing
            # (df['Canton'][mask] = ...) may write to a temporary copy instead.
            df.loc[df[column] == place, 'Canton'] = canton

# Annotate from the institution text first, then from the university text;
# a match found in the second pass overwrites the canton set by the first.
map_inst_canton(has_info, 'Institution')
map_inst_canton(has_info, 'University')

Faculté de Psychologie et des Sciences de l'Education Université de Genève  ->  GE
Abt. Handschriften und Alte Drucke Bibliothek der Universität Basel  ->  BS
Institut für ökumenische Studien Université de Fribourg  ->  FR


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Ostasiatisches Seminar Universität Zürich  ->  ZH
Laboratoire de Didactique et Epistémologie des Sciences Université de Genève  ->  GE
Klinische Psychologie und Psychotherapie Institut für Psychologie Universität Bern  ->  BE
Schweizerische Rechtsquellen c/o Universität Zürich / RWI  ->  ZH
Département de Sociologie Faculté des Sciences de la Société Université de Genève  ->  GE
Institut für Volkswirtschaft WWZ / FGS Universität Basel  ->  BS
Soziologisches Institut Universität Zürich  ->  ZH
Département des langues et des littératures françaises et latines médiévales Faculté des Lettres / Université de Genève  ->  GE
Schweizerisches Nationalmuseum Landesmuseum Zürich  ->  ZH
Institut für Iberoromanistik Universität Basel  ->  BS
Historische und Antiquarische Gesellschaft zu Basel  ->  BS
Faculté de Théologie Université de Genève  ->  GE
Bibliothèque de Genève Département des périodiques  ->  GE
Unité d'anthropologie Département de génétique et évolution Université de Genève  ->  GE
Un

In [25]:
# Keep only the projects that could be assigned to a canton
clean_cantons = has_info[has_info['Canton'].notnull()]
clean_cantons.shape

(43540, 4)

In [26]:
# Share of rows without a canton, followed by the sizes of both frames
print("Dropped from has_info", get_dropped_perc(has_info, clean_cantons))
print(clean_cantons.shape, has_info.shape, sep='\n')

Dropped from has_info 30.314815703974006
(43540, 4)
(62481, 4)


In [27]:
# Distinct canton codes that were assigned
clean_cantons['Canton'].unique()

array(['GE', 'BS', 'FR', 'ZH', 'VD', 'BE', 'NE', 'SG', 'LU', 'AI', 'AG',
       'TI', 'ZG', 'SZ', 'SH', 'GR', 'VS', 'BL', 'SO'], dtype=object)

In [28]:
# Total approved amount per canton, as a two-column frame (Canton, Approved Amount)
grouped_cantons = clean_cantons.groupby('Canton')['Approved Amount'].sum()
grants_cantons = grouped_cantons.to_frame().reset_index()
grants_cantons.head(26)

Unnamed: 0,Canton,Approved Amount
0,AG,2924342.0
1,AI,140000.0
2,BE,1567640000.0
3,BL,219518.0
4,BS,1375805000.0
5,FR,461595500.0
6,GE,1863555000.0
7,GR,12778460.0
8,LU,54715420.0
9,NE,386342000.0


In [29]:
# Load the list of Swiss cantons (code and name)
columns = ['Name', 'Canton']
swiss_cantons = (
    pd.read_csv(
        "swiss_cantons.csv",
        delimiter='\t',
        usecols=columns,
        na_values=['Not there', 'nan'],
    )
    # Round trip through the index so that 'Canton' becomes the first column
    .set_index('Canton')
    .reset_index()
)
swiss_cantons.head(26)

Unnamed: 0,Canton,Name
0,ZH,Zürich
1,BE,Bern
2,LU,Luzern
3,UR,Uri
4,SZ,Schwyz
5,OW,Obwalden
6,NW,Nidwalden
7,GL,Glarus
8,ZG,Zug
9,FR,Fribourg


In [30]:
# Left join on the shared columns so that every canton in swiss_cantons keeps a row,
# including cantons without any grant
grants_cantons = pd.merge(swiss_cantons, grants_cantons, how='left')
grants_cantons.head(26)

Unnamed: 0,Canton,Name,Approved Amount
0,ZH,Zürich,3504244000.0
1,BE,Bern,1567640000.0
2,LU,Luzern,54715420.0
3,UR,Uri,
4,SZ,Schwyz,1074747.0
5,OW,Obwalden,
6,NW,Nidwalden,
7,GL,Glarus,
8,ZG,Zug,868915.0
9,FR,Fribourg,461595500.0


In [31]:
# Missing values (cantons without any grant after the left join) become 0
grants_cantons = grants_cantons.fillna(0)

In [32]:
# Frame handed to the choropleth; reset_index() moves the current index into a column
grants_cantons_df = grants_cantons.reset_index()

In [33]:
# Base map on which the canton choropleth is drawn
main_map = folium.Map(
    location=[46.50, 8.20],
    zoom_start=8,
)
main_map

In [34]:
#topo_json = pd.read_json('ch-cantons.topojson.json') might not be needed
# Path to the TopoJSON file passed to the choropleth call
cantons_topo = 'ch-cantons.topojson.json'

In [35]:
# Colour each canton by its total approved grant amount.
# `columns` gives the key column ('Canton') and the value column ('Approved Amount');
# `key_on` names the TopoJSON feature field the key is matched against.
main_map.choropleth(geo_path=cantons_topo,
                     data=grants_cantons_df,
                     columns=['Canton', 'Approved Amount'],
                     key_on='feature.id',
                     topojson='objects.cantons',
                     fill_color='YlGnBu',
                     # The legend must describe the plotted column
                     legend_name='Total approved amount per canton'
                    )
main_map



In [36]:
# Show what the web-service lookups collected.
# NOTE(review): canton_dict is not assigned in the cells shown here; presumably it
# comes from `helpers` — confirm.
pprint(canton_dict)

''


In [37]:
# NOTE(review): `university_grants` is not assigned in any cell shown in this notebook,
# and the recorded output is a NameError — define it or remove this cell.
grants_with_has_code = university_grants.join(has_canton_code)
grants_with_has_code.head()

NameError: name 'university_grants' is not defined

In [38]:
# NOTE(review): fails with NameError (see recorded output); `university_grants` is undefined here.
university_grants.head()

NameError: name 'university_grants' is not defined

In [39]:
# NOTE(review): this cell depends on the undefined `university_grants` (NameError in the
# recorded output). If it did run, it would also overwrite the `has_code` frame built earlier.
# Split the university label into the part up to the last '-' and the code after it
extracts = university_grants['University'].str.extract('(?P<University>.*-)(?P<Code>.*)', expand=True)
has_code = pd.DataFrame( extracts['Code'] )
# The result of this check is neither stored nor displayed
has_code.Code.apply(is_canton_code)
grouped_has_code = has_code.groupby('Code')

NameError: name 'university_grants' is not defined

In [40]:
# Index labels of the grant rows that have no university
only_institution_data = grants_data.loc[grants_data['University'].isnull()].index

## TODO:

1. clean up this mess (:

1. make one function for mapping a row to a canton
   Suggested strategy: 
   
   (we can only make 200 requests per hour to the API, so we should try to keep the number down)
   
   1. Unique institutions, unique universities
   2. Lookup institution name, place result in hashmap: institution -> canton
   3. Parse university. If no canton code is given, lookup. Place result in the hashmap
   4. In the function, do a query in our hashmap for each column. If conflict, return (?)

1. apply the function to all rows in the data, add canton as a column
1. make a df of Canton, Approved Amount
1. Groupby canton, sum approved_amount, add a count column
1. Plot the results on the map etc
1. Bonus
