# Interactive Visualization

In [1]:
import folium

# Normal stack of pandas, numpy, matplotlib and seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# For handling the web requests
import requests
from collections import defaultdict
from pprint import *

# Statistical test library
import scipy.stats as stats
from helpers import *

# Output handling
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

Folium: https://github.com/python-visualization/folium
 
 Documentation:
 1. https://folium.readthedocs.io/en/latest/
 2. https://media.readthedocs.org/pdf/folium/latest/folium.pdf

In [2]:
# Folium smoke test: render a plain map to confirm the library displays inline.
test_center = [46.8076878, 7.1004592]
map_osm = folium.Map(location=test_center, zoom_start=10)
map_osm

# Data wrangling

Data source (P3_GrantExport.csv)
and description of the columns: http://p3.snf.ch/Pages/DataAndDocumentation.aspx

In [3]:
# Load only the columns we need (Institution, University, Approved Amount).
# The placeholder text 'data not included in P3' is read as a missing value.
columns = ['Institution', 'University', 'Approved Amount']
p3 = pd.read_csv(
    "P3_GrantExport.csv",
    sep=';',
    usecols=columns,
    na_values=['data not included in P3', 'nan'],
)

To be able to work with the data, we keep only the rows for which we have enough information.

We start by keeping the projects where we know at least the University or the Institution related to the project, and dropping the rest.

In [4]:
# Rows where both Institution and University are missing carry no location hint at all.
no_location_mask = p3[['Institution', 'University']].isnull().all(axis=1)
missing__info = p3.index[no_location_mask]
has_info = p3.drop(missing__info, axis=0)

print('Dropped from original:', get_dropped_perc(p3, has_info))
print(has_info.shape)

Dropped from original: 2.3261267176288514
(62481, 3)


As we can see, this leaves out 2.33% of the original data.

In [5]:
# Keep only the rows that have an approved amount recorded.
grant_info = has_info.copy()
grant_info = grant_info[grant_info['Approved Amount'].notnull()]
grants_data = grant_info.copy()
grants_data.head()

Unnamed: 0,Institution,University,Approved Amount
0,,Nicht zuteilbar - NA,11619.0
1,Faculté de Psychologie et des Sciences de l'Ed...,Université de Genève - GE,41022.0
2,Kommission für das Corpus philosophorum medii ...,"NPO (Biblioth., Museen, Verwalt.) - NPO",79732.0
3,Abt. Handschriften und Alte Drucke Bibliothek ...,Universität Basel - BS,52627.0
4,Schweiz. Thesauruskommission,"NPO (Biblioth., Museen, Verwalt.) - NPO",120042.0


In [6]:
# Report how much data is lost by requiring an approved amount, relative to the raw
# export (p3) and to the previous filtering step (has_info).
# get_dropped_perc comes from helpers; judging by the recorded outputs it returns a percentage.
print('Dropped from original:', get_dropped_perc(p3, grants_data))
print('Dropped from last step:', get_dropped_perc(has_info, grants_data))

Dropped from original: 18.340133502165113
Dropped from last step: 16.395384196795824


So, for about 16.4% of the data, we don't have any information about the grant given. 

# Get the geo info

From the description of the dataset, we have the following description:
> ### Institution
> According to the information submitted by the responsible applicant, this is the research institution where the project will largely be carried out. Typically, this is the institution or specific lab where the responsible applicant works. The institution is therefore linked to the application. This field is not structured, but its content is consolidated internally. There is no link in the database to the field University.

> ### University
> This is the institution where the project will largely be carried out according to the application. Pick list. This field is only filled if the research is carried out at a Swiss institution, otherwise the field remains blank. In the case of mobility fellowships, it is generally left empty.


So, we know that for the rows where the university is given, the grant was awarded to a Swiss university. However, we do not know much about the cases where we only have the institution information. We decided to look into these cases further.

In [7]:
# Keep the rows where the University pick-list field is filled in.
# (A leftover debugging expression that filtered for the EMPA rows and discarded the
# result has been removed; it had no effect on has_uni.)
has_uni = has_info[has_info['University'].notnull()]

print(has_uni.shape)
has_uni.head()

(50988, 3)


Unnamed: 0,Institution,University,Approved Amount
0,,Nicht zuteilbar - NA,11619.0
1,Faculté de Psychologie et des Sciences de l'Ed...,Université de Genève - GE,41022.0
2,Kommission für das Corpus philosophorum medii ...,"NPO (Biblioth., Museen, Verwalt.) - NPO",79732.0
3,Abt. Handschriften und Alte Drucke Bibliothek ...,Universität Basel - BS,52627.0
4,Schweiz. Thesauruskommission,"NPO (Biblioth., Museen, Verwalt.) - NPO",120042.0


In [8]:
# Report how much data is lost by requiring a University value.
# NOTE(review): has_uni is derived from has_info, not from grants_data, so the
# "last step" figure below compares two independent filters — confirm this is intended.
print('Dropped from original:', get_dropped_perc(p3, has_uni))
print('Dropped from last step:', get_dropped_perc(grants_data, has_uni))

Dropped from original: 20.292641748346853
Dropped from last step: 2.3910255183107743


In [9]:
# Group the rows by University so that each distinct university is geolocated once.
uni_grouped = has_uni.groupby(['University'])

In [10]:
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'get_geo_dict' is not defined". The function is not defined in this
# notebook; presumably it is meant to come from `helpers` — confirm it is exported there.
# Downstream cells treat `res` as a dict of per-university records; `err` is unused here.
res, err = get_geo_dict(uni_grouped)

NameError: name 'get_geo_dict' is not defined

In [None]:
# Build one row per entry of `res`, then convert the coordinate columns to numbers.
coord_cols = ['lat', 'long']
geolocalized_df = pd.DataFrame.from_dict(res).T
geolocalized_df[coord_cols] = geolocalized_df[coord_cols].apply(pd.to_numeric)
geolocalized_df.describe()

Standard deviation for latitude and longitude is less than 1 so we are still operating within Swiss borders ;)

In [None]:
# Move the index into a regular column and give all four columns their final names.
# The names are assigned by position — assumes the frame holds exactly
# (index, canton, lat, long) in that order; TODO confirm against get_geo_dict's output.
# NOTE: this cell is not idempotent. Running it twice adds a second index column and the
# four-name assignment then fails with a length mismatch.
geolocalized_df = geolocalized_df.reset_index()
geolocalized_df.columns = ['University', 'Canton', 'Lat', 'Lon']
geolocalized_df.head()

In [None]:
# Split "<name> - <suffix>" University values into the name and the trailing suffix.
# A raw string is used so that the backslash escapes reach the regex engine unchanged
# (in a normal string literal `\ ` is an invalid escape sequence and triggers a warning).
has_code = has_uni['University'].str.extract(
    r'(?P<University_name>.*?(?= -))(?P<delim>\ -\ )(?P<Canton>.*)',
    expand=True,
).drop('delim', axis=1)
combined = has_uni.join(has_code, how="left")

# Keep the rows whose extracted suffix is accepted by the is_canton_code helper and add
# their (University, Canton) pairs to the geolocated universities. Lat/Lon stay empty
# for these added rows.
combined_is_canton_code = combined[combined['Canton'].apply(is_canton_code) == True]
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the row order is
# the same, so the geolocated entry still wins in drop_duplicates below.
geolocalized_1 = pd.concat([geolocalized_df, combined_is_canton_code[['University', 'Canton']]])
geolocalized_final = geolocalized_1.drop_duplicates(['University'])

In [None]:
# Attach canton and coordinates to every row that has a University value.
# Note: merge() returns a new 0..n-1 index, so geo_uni no longer carries has_uni's
# original row labels.
geo_uni = has_uni.merge(geolocalized_final, on='University', how='left')
geo_uni

In [None]:
# Keep only the geo columns produced by the merge.
geo_uni = geo_uni[['Canton','Lat','Lon']].copy()

# DataFrame.merge returns a fresh 0..n-1 index, whereas has_info still carries the
# original p3 row labels (with gaps where rows were dropped). Joining on the index
# without realigning would attach cantons to the wrong projects. The left merge keeps
# has_uni's row order, so its labels can be restored one-to-one.
assert len(geo_uni) == len(has_uni), "merge changed the row count; cannot realign with has_uni"
geo_uni.index = has_uni.index

# Rows of has_info without a University value get NaN in the geo columns.
geo_all = has_info.join(geo_uni, how='left')
print(geo_all.shape)
geo_all.head()

### For the rest, we will try to get the canton code by parsing city name from university/institution column

In [None]:
# Load data for Swiss cities (German names)
columns = ['Name', 'Kanton']
swiss_cities = pd.read_csv(
    "swiss_cities.csv",
    sep='\t',
    usecols=columns,
    na_values=['data not included in P3', 'nan'],
)
# Keep only the third whitespace-separated token of the Kanton field.
swiss_cities['Kanton'] = swiss_cities['Kanton'].str.split().str.get(2)
swiss_cities.head(10)

In [None]:
# Load data for Swiss cantons (code, German name, language)
columns = ['Code', 'Kanton', 'Sprache']
swiss_cantons_full = pd.read_csv(
    "swiss_cantons_full.csv",
    sep='\t',
    usecols=columns,
    na_values=['data not included in P3', 'nan'],
)
# Keep only the third whitespace-separated token of the Kanton field so that it can be
# matched against the same token extracted from the city list.
swiss_cantons_full['Kanton'] = swiss_cantons_full['Kanton'].str.split().str.get(2)
swiss_cantons_full.head(10)

In [None]:
# Load data for Swiss towns (English names)
columns = ['Town', 'Canton']
swiss_cities_english = pd.read_csv(
    "swiss_towns.csv",
    sep='\t',
    usecols=columns,
    na_values=['data not included in P3', 'nan'],
)
# Rename by position so the frame shares the (Name, Canton) layout of the other lists.
swiss_cities_english.columns = ['Name', 'Canton']
swiss_cities_english.head(10)

In [None]:
# Add the canton code to the German-named cities.
# swiss_cantons_full carries three columns (Code, Kanton, Sprache), so after merging on
# 'Kanton' and removing it there would still be three columns left, and assigning two
# column names would raise a length-mismatch ValueError. Select the two columns that
# are needed explicitly and rename the code column instead.
swiss_cities_german = (
    swiss_cities
    .merge(swiss_cantons_full, on='Kanton')[['Name', 'Code']]
    .rename(columns={'Code': 'Canton'})
)
swiss_cities_german.head(20)

In [None]:
# Load data for Swiss towns (French names)
columns = ['Ville']
french_raw = pd.read_csv(
    "swiss_cities_french.csv",
    sep='\t',
    usecols=columns,
    na_values=['french nan', 'nan'],
)
# Each 'Ville' entry is split on ', ' into two parts, used as town name and canton.
swiss_cities_french = french_raw['Ville'].str.split(', ', expand=True)
swiss_cities_french.columns = ['Name', 'Canton']
swiss_cities_french.head(10)

In [None]:
# Build a single town -> canton lookup from all available languages
# (English, German and French). When a name appears more than once, the first
# occurrence in that order is kept.
swiss_cities_dict = (
    pd.concat([swiss_cities_english, swiss_cities_german, swiss_cities_french])
    .drop_duplicates('Name')
    .set_index('Name')
)
swiss_cities_dict.head(10)

In [None]:
# Work on a copy so the town-name mapping below does not modify geo_all.
geo_all_copy = geo_all.copy()

In [None]:
# Direct mapping of canton with our Switzerland town list

def map_inst_canton(df, column, cities=None):
    """Fill ``df['Canton']`` in place by spotting known town names in ``column``.

    Every distinct value of ``df[column]`` is split on whitespace. Each word that is
    a label of ``cities.index`` sets 'Canton' for all rows holding that value; when
    several words match, the last one wins. Existing canton values on matching rows
    are overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update in place. Must contain ``column``.
    column : str
        Name of the text column to scan (e.g. 'Institution' or 'University').
    cities : pandas.DataFrame, optional
        Lookup indexed by town name with a 'Canton' column. Defaults to the
        notebook-level ``swiss_cities_dict``.

    Returns
    -------
    None
    """
    if cities is None:
        cities = swiss_cities_dict

    for place in df[column].unique():
        # Skip missing values. The previous check compared str(place) with np.nan,
        # which is never true, so NaN was processed as the literal text 'nan'.
        if pd.isnull(place):
            continue
        for word in str(place).split():
            if word in cities.index:
                # Single .loc assignment instead of chained indexing, which may
                # write to a temporary copy and leave df unchanged.
                df.loc[df[column] == place, 'Canton'] = cities.loc[word, 'Canton']

# Apply the town-name lookup to both text columns. University runs second, so where
# both columns yield a match the University-based canton is the one that remains.
map_inst_canton(geo_all_copy, 'Institution')
map_inst_canton(geo_all_copy, 'University')

In [None]:
# Check how many entries still don't have a canton assigned after the town-name mapping.
# The mapping was applied to geo_all_copy, so that is the frame to inspect; geo_all
# still holds the state from before the mapping.
geo_all_copy[geo_all_copy['Canton'].isnull()].shape

In [None]:
# Collect the entries that still have no canton (these are the rows we would drop).
canton_is_missing = geo_all_copy['Canton'].isnull()
dropped = geo_all_copy[canton_is_missing]


In [None]:
# `dropped` holds the rows WITHOUT a canton, so comparing it against has_info is used
# here as a coverage figure.
# NOTE(review): relies on get_dropped_perc (from helpers) returning the percentage by
# which the second frame is smaller than the first — confirm.
print('Coverage so far from original data containing set: ', get_dropped_perc(has_info, dropped))

In [None]:
# List the University values that are still unmapped, most frequent first, to see
# which ones are worth assigning by hand.
dropped['University'].value_counts()

## Manual labour

In [None]:
# Manually assigned cantons for institutions the automatic steps could not place.
# Keys are matched exactly against the 'University' column.
# NOTE(review): some keys have no " - CODE" suffix and one ends with a trailing space;
# verify them against the actual values in the data.
MANUAL_CANTONS = {
    'Paul Scherrer Institut': 'AG',
    'Università della Svizzera italiana - USI': 'TI',
    'Eidg. Anstalt für Wasserversorgung': 'ZH',
    'Zürcher Fachhochschule (ohne PH) - ZFH': 'ZH',
    'Eidg. Material und Prüfungsanstalt - EMPA': 'ZH',
    'Institut Universitaire Kurt Bösch - IUKB': 'VS',
    'Allergie- und Asthmaforschung - SIAF': 'GR',
    'Swiss Center for Electronics & Microtech. - CSEM': 'NE',
    'Pädagogische Hochschule Nordwestschweiz - PHFHNW': 'AG',
    'Kantonsspital St. Gallen - KSPSG': 'SG',
    'Berner Fachhochschule - BFH': 'BE',
    'SUP della Svizzera italiana - SUPSI': 'TI',
    'Idiap Research Institute - IDIAP': 'VS',
    'Inst. de Hautes Etudes Internat. et du Dév - IHEID': 'GE',
    'Friedrich Miescher Institute - FMI ': 'BS',
    'Forschungsanstalten Agroscope - AGS': 'VD',
    'HES de Suisse occidentale - HES-SO': 'JU',
    'Fachhochschule Nordwestschweiz (ohne PH) - FHNW': 'SO',
    'Universität St. Gallen': 'SG',
    'Eidg. Forschungsanstalt für Wald,Schnee,L& - WSL': 'ZH',
}

# The previous version compared `geo_all['Canton'] == np.nan` (always False, because
# NaN never compares equal) and assigned through chained indexing
# (`geo_all[mask]['Canton'] = ...`), which writes to a temporary copy. As a result
# nothing was ever filled in. Use isnull() and one .loc assignment instead.
canton_missing = geo_all['Canton'].isnull()
manual_canton = geo_all['University'].map(MANUAL_CANTONS)  # NaN where no manual entry exists
fill_mask = canton_missing & manual_canton.notnull()
geo_all.loc[fill_mask, 'Canton'] = manual_canton[fill_mask]

In [None]:
# Keep only the rows for which a canton is known.
canton_known = geo_all['Canton'].notnull()
clean_cantons = geo_all[canton_known]
clean_cantons.head()

In [None]:
# Compare the size of the canton-labelled subset with the data we started from.
print(clean_cantons.shape)
print(has_info.shape)
print('Dropped from original:', get_dropped_perc(has_info, clean_cantons))

We are going to parse more results than that, but let's work on this small dataset for now.

In [None]:
# List of parsed cantons (Note that some are missing! We will handle that!)
# Cantons without any row here are added further below with an amount of 0.
clean_cantons['Canton'].unique()

In [None]:
# Total approved amount per canton, as a two-column frame (Canton, Approved Amount).
grouped_cantons = clean_cantons.groupby('Canton')['Approved Amount'].sum()
grants_cantons = grouped_cantons.to_frame().reset_index()
grants_cantons.head(26)

In [None]:
# Load the reference list of all Swiss cantons.
columns = ['Name', 'Canton']
swiss_cantons = (
    pd.read_csv("swiss_cantons.csv", sep='\t', usecols=columns, na_values=['Not there', 'nan'])
    # The set_index/reset_index round-trip moves 'Canton' to the first column.
    .set_index('Canton')
    .reset_index()
)
swiss_cantons.head(26)

In [None]:
# Let's merge our parsed cantons with all cantons in CH.
# No `on=` is given, so pandas joins on the columns the two frames have in common.
# The left join keeps every canton from swiss_cantons; cantons without grants get NaN.
# NOTE: grants_cantons is overwritten, so re-running this cell merges the already
# merged frame again.
grants_cantons = swiss_cantons.merge(grants_cantons, how='left')
grants_cantons.head(26)

In [None]:
# Cantons without any parsed grant get 0 instead of NaN (applies to every column).
grants_cantons = grants_cantons.fillna(0)
grants_cantons

In [None]:
# And make a first sketch on the map: colour each canton by its total approved amount.
cantons_topo = 'ch-cantons.topojson.json'

main_map = folium.Map(location=[46.50,8.20], zoom_start=8)
main_map.choropleth(geo_path=cantons_topo,
                     data=grants_cantons,
                     # First column is the key matched against `key_on`, second is the value.
                     columns=['Canton', 'Approved Amount'],
                     key_on='feature.id',
                     topojson='objects.cantons',
                     fill_color='YlGnBu',
                     # The legend previously read 'Random numbers', a placeholder that
                     # mislabelled the plotted values.
                     legend_name = 'Total approved amount per canton'
                    )
main_map

## TODO's
* merge geolocalized_df with has_uni
* supply it to chris's method
* display on map
* do a bonus exercise

In [None]:
# Show the per-canton totals that feed the choropleth above.
grants_cantons