# Interactive Visualization

In [1]:
import folium

# Normal stack of pandas, numpy, matplotlib and seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# For handeling the web requests
import requests
from collections import defaultdict
from pprint import *

# Statistical test library
import scipy.stats as stats
from helpers import *

%matplotlib inline

Folium: https://github.com/python-visualization/folium
 
 Documentation:
 1. https://folium.readthedocs.io/en/latest/
 2. https://media.readthedocs.org/pdf/folium/latest/folium.pdf

In [2]:
#Folium test
map_osm = folium.Map(location=[46.8076878,7.1004592], zoom_start=10)
map_osm

# Data wrangling

Data source (P3_GrantExport.csv)
and description of the columns: http://p3.snf.ch/Pages/DataAndDocumentation.aspx

In [3]:
# Load the data we need. (Columns: Insti)
columns = ['Institution', 'University','Approved Amount']
p3 = pd.read_csv("P3_GrantExport.csv", delimiter=';', usecols=columns, na_values=['data not included in P3', 'nan'])

To be able to work with the data, we have to filter out the values where we have enough information.

We start by filtering out the projects where we know at least the University or the institution related to the project

In [4]:
missing__info = p3[p3[['Institution', 'University']].isnull().all(axis=1)].index
has_info = p3.drop(missing__info, axis=0)

print('Dropped from original:', get_dropped_perc(p3, has_info))
print(has_info.shape)

Dropped from original: 2.3261267176288514
(62481, 3)


As we can see, this leaves out 2.33% of the original data.

In [5]:
grant_info =  has_info.copy()
grant_info = grant_info[grant_info[['Approved Amount']].isnull().all(axis=1) == False]
grants_data = grant_info.loc[grant_info.index]
grants_data.head()

Unnamed: 0,Institution,University,Approved Amount
0,,Nicht zuteilbar - NA,11619.0
1,Faculté de Psychologie et des Sciences de l'Ed...,Université de Genève - GE,41022.0
2,Kommission für das Corpus philosophorum medii ...,"NPO (Biblioth., Museen, Verwalt.) - NPO",79732.0
3,Abt. Handschriften und Alte Drucke Bibliothek ...,Universität Basel - BS,52627.0
4,Schweiz. Thesauruskommission,"NPO (Biblioth., Museen, Verwalt.) - NPO",120042.0


In [6]:
print('Dropped from original:', get_dropped_perc(p3, grants_data))
print('Dropped from last step:', get_dropped_perc(has_info, grants_data))

Dropped from original: 18.340133502165113
Dropped from last step: 16.395384196795824


So, for about 16.4% of the data, we don't have any information about the grant given. 

# Get the geo info

From the description of the dataset, we have the folloring description:
> ### Institution
> According to the information submitted by the responsible applicant, this is the research institution where the project will largely be carried out. Typically, this is the institution or specific lab where the responsible applicant works. The institution is therefore linked to the application. This field is not structured, but its content is consolidated internally. There is no link in the database to the field University.

> ### University
> This is the institution where the project will largely be carried out according to the application. Pick list. This field is only filled if the research is carried out at a Swiss institution, otherwise the field remains blank. In the case of mobility fellowships, it is generally left empty.


So, we know that for the rows where the university is given, the grant was awarded to a swizz university. However, we do not know much about the cases where we only have the institution information. We descided to look into these cases further.

In [7]:
has_uni = has_info[has_info['University'].isnull() == False]
print(has_uni.shape)
has_uni.head()

(50988, 3)


Unnamed: 0,Institution,University,Approved Amount
0,,Nicht zuteilbar - NA,11619.0
1,Faculté de Psychologie et des Sciences de l'Ed...,Université de Genève - GE,41022.0
2,Kommission für das Corpus philosophorum medii ...,"NPO (Biblioth., Museen, Verwalt.) - NPO",79732.0
3,Abt. Handschriften und Alte Drucke Bibliothek ...,Universität Basel - BS,52627.0
4,Schweiz. Thesauruskommission,"NPO (Biblioth., Museen, Verwalt.) - NPO",120042.0


In [8]:
print('Dropped from original:', get_dropped_perc(p3, has_uni))
print('Dropped from last step:', get_dropped_perc(grants_data, has_uni))

Dropped from original: 20.292641748346853
Dropped from last step: 2.3910255183107743


In [9]:
uni_grouped = has_uni.groupby(['University'])

In [None]:
USERNAME = 'user1'
URL = r'http://api.geonames.org/searchJSON?'


geo_cache = defaultdict(str)
def geo_query(name):
    """Do a lookup on geonames.org for name. Returns the first query result."""
    try:
        # Encode the arguments to avoid problems with special characters, spaces etc. 
        encoded_args = urlencode({'name': name, 'country':'ch', 'maxRows':'1', 'username': USERNAME} )
        
        # Request the geonames API
        r = requests.get(URL + encoded_args)
        
        # Parse the result as json
        result = r.json()
        
        if result['totalResultsCount'] == 1:
            # If there was a positive result, get the info from the result
            geonames = result['geonames']
            geo = geonames[0]
            
            canton = geo['adminCode1']
            lat = geo['lat']
            lng = geo['lng']
            
            return {'canton': canton, 'lat': lat, 'long': lng}
        else:
            # Else, we return false
            return False
        
    except BaseException as e:
        # Sometimes we get some strange results back, which leads our parsing to crash.
        # This is allso returned as False
        print('For name:', name, e)
        return False
    
def geo_lookup(name):
    """Cached version of geo_query """
    if geo_cache[name] != '':
        result = geo_cache[name]
    else:
        result = geo_query(name)
        geo_cache[name] = result
    return result

def get_geo_dict(group):
    """
    Returns a dictionary of name -> {canton info} mapping, achieved by querying geonames.org
    Input is a pandas.groupby object. """
    
    # Initialize result sets
    geo_res = {}
    geo_err = set()
    
    
    for ind, group in group:
        # The group name is the first element of the splitted index
        name = ind.split(' - ')[0]
        
        # Do a lookup with our cached API-function, using the whole name
        res = geo_lookup(name)
        
        
        if res:
            # If there is a result, save it
            geo_res[name] = res
        else:
            # If the result is negative, try to do a query with only the last word,
            # which is often the name of the city
            name = name.split(' ')[-1].strip(')')
            res = geo_lookup(name)
            print(name)
            
            if res:
                # If positive, save the result
                geo_res[name] = res
            else:
                # Else, save it as an error to be handeled later
                geo_err.add(ind)
    
    return geo_res, geo_err

In [10]:
## Just for debugging
res, err = get_geo_dict(uni_grouped)

count = 0
for e in err:
    try:
        print(e, has_info[has_info['University'] == e].size)
    except:
        count += 1
#has_info['University'].value_counts()

Schweiz. Institut für Kunstwissenschaft - SIK-ISEA 60
Schweizer Paraplegiker Forschung - SPF 30
Weitere Spitäler - ASPIT 243
Facoltà di Teologia di Lugano - FTL 6
Haute école pédagogique BE, JU, NE - HEPBEJUNE 21
Fachhochschule Kalaidos - FHKD 9
Université de Fribourg - FR 6237
Forschungskommission SAGW 3
Pädagogische Hochschule Zug - PHZG 21
Schweizer Kompetenzzentrum Sozialwissensch. - FORS 93
Interkant. Hochschule für Heilpädagogik ZH - HfH 66
Biotechnologie Institut Thurgau - BITG 24
Eidg. Forschungsanstalt für Wald,Schnee,Land - WSL 669
Physikal.-Meteorolog. Observatorium Davos - PMOD 144
EPF Lausanne - EPFL 13284
Berner Fachhochschule - BFH 408
Pädagogische Hochschule Bern - PHBern 39
Universität Luzern - LU 642
Université de Genève - GE 19182
Pädagogische Hochschule Schaffhausen - PHSH 3
Pädagogische Hochschule St. Gallen - PHSG 39
Hochschule Luzern - HSLU 186
Swiss Institute of Bioinformatics - SIB 93
Nicht zuteilbar - NA 7785
Instituto Ricerche Solari Locarno - IRSOL 15
Instit

In [11]:
geo_res = {}

for name, group in uni_grouped:
    name = name.split(' - ')[0]
    
    res = geo_lookup(name)
    
    if res != -1:
        geo_res[name] = res
    else:
        second_try = name.split(' ')[-1].strip(')')
        res = geo_lookup(second_try)
        if res != -1:
            geo_res[name] = res
        else:
            print('-', name, '/', second_try)
               
# for key in geo_res.keys():
#     print(key, geo_res[key])

In [12]:
geo_res

{'AO Research Institute': False,
 'Allergie- und Asthmaforschung': False,
 'Berner Fachhochschule': False,
 'Biotechnologie Institut Thurgau': False,
 "Centre de rech. sur l'environnement alpin": False,
 'EPF Lausanne': False,
 'ETH Zürich': False,
 'Eidg. Anstalt für Wasserversorgung': False,
 'Eidg. Forschungsanstalt für Wald,Schnee,Land': False,
 'Eidg. Hochschulinstitut für Berufsbildung': False,
 'Eidg. Material und Prüfungsanstalt': False,
 'Ente Ospedaliero Cantonale': False,
 'Fachhochschule Kalaidos': False,
 'Fachhochschule Nordwestschweiz (ohne PH)': False,
 'Fachhochschule Ostschweiz': False,
 'Facoltà di Teologia di Lugano': False,
 'Fernfachhochschule Schweiz (Mitglied SUPSI)': False,
 'Firmen/Privatwirtschaft': False,
 'Forschungsanstalten Agroscope': False,
 'Forschungsinstitut für Opthalmologie': False,
 'Forschungsinstitut für biologischen Landbau': False,
 'Forschungskommission SAGW': False,
 'Franklin University Switzerland': False,
 'Friedrich Miescher Institute': 

In [13]:
geolocalized_df = pd.DataFrame.from_dict(geo_res).T
geolocalized_df[['lat','long']] = geolocalized_df[['lat','long']].apply(pd.to_numeric)
geolocalized_df.describe()

Unnamed: 0,lat,long
count,77.0,77.0
mean,3.062769,0.513327
std,11.699011,1.976302
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,47.55832,9.38826


Standard deviation for latitude and longitude is less than 1 so we are still operating within Swiss boarders ;)

In [14]:
geolocalized_df = geolocalized_df.reset_index()
geolocalized_df.columns = ['name', 'canton', 'lat', 'long']
geolocalized_df.head()

Unnamed: 0,name,canton,lat,long
0,AO Research Institute,False,0.0,0.0
1,Allergie- und Asthmaforschung,False,0.0,0.0
2,Berner Fachhochschule,False,0.0,0.0
3,Biotechnologie Institut Thurgau,False,0.0,0.0
4,Centre de rech. sur l'environnement alpin,False,0.0,0.0


In [15]:
has_uni.head()

Unnamed: 0,Institution,University,Approved Amount
0,,Nicht zuteilbar - NA,11619.0
1,Faculté de Psychologie et des Sciences de l'Ed...,Université de Genève - GE,41022.0
2,Kommission für das Corpus philosophorum medii ...,"NPO (Biblioth., Museen, Verwalt.) - NPO",79732.0
3,Abt. Handschriften und Alte Drucke Bibliothek ...,Universität Basel - BS,52627.0
4,Schweiz. Thesauruskommission,"NPO (Biblioth., Museen, Verwalt.) - NPO",120042.0


## TODO's
* merge geolocalized_df with has_uni
* supply it to chris's method
* display on map
* do a bonus exercise

In [16]:
# TO BE VERIFIED IF STILL NEEDED
has_code = has_uni['University'].str.extract('(?P<University>.*?(?= -))(?P<delim>\ -\ )(?P<Code>.*)', expand=True).drop('delim', axis=1)
has_code.head()

Unnamed: 0,University,Code
0,Nicht zuteilbar,
1,Université de Genève,GE
2,"NPO (Biblioth., Museen, Verwalt.)",NPO
3,Universität Basel,BS
4,"NPO (Biblioth., Museen, Verwalt.)",NPO


In [17]:
# TO BE VERIFIED IF STILL NEEDED

# Drop NAs and NPOs in Canton Codes (not avaialable and NGOs)
has_code = has_code[has_code.Code != 'NPO']
has_code = has_code[has_code.Code != 'NA']
has_code.head()

Unnamed: 0,University,Code
1,Université de Genève,GE
3,Universität Basel,BS
5,Université de Fribourg,FR
6,Université de Fribourg,FR
7,Universität Zürich,ZH


In [18]:
# TO BE VERIFIED IF STILL NEEDED

# For these rows, we allready have the canton code
has_canton_code = has_code['Code'][has_code['Code'].apply(is_canton_code)].rename('Canton code')

print(has_canton_code.index.shape)

(27702,)


In [19]:
# For the rest, we will try to get the canton code by parsing city name from university/institution column

# Load data for swiss cities (german names)
columns = ['Name', 'Kanton']
swiss_cities = pd.read_csv("swiss_cities.csv", delimiter='\t', usecols=columns, na_values=['data not included in P3', 'nan'])
swiss_cities['Kanton'] = swiss_cities['Kanton'].str.split().str[2]
swiss_cities.head(10)

Unnamed: 0,Name,Kanton
0,Zürich,Zürich
1,Genf,Genf
2,Basel,Basel-Stadt
3,Lausanne,Waadt
4,Bern,Bern
5,Winterthur,Zürich
6,Luzern,Luzern
7,St. Gallen,Gallen
8,Lugano,Tessin
9,Biel/Bienne,Bern


In [20]:
# Load data for swiss cantons
columns = ['Code', 'Kanton']
swiss_cantons_full = pd.read_csv("swiss_cantons_full.csv", delimiter='\t', usecols=columns, na_values=['data not included in P3', 'nan'])
swiss_cantons_full['Kanton'] = swiss_cantons_full['Kanton'].str.split().str[2]
swiss_cantons_full.head(10)

Unnamed: 0,Code,Kanton
0,ZH,Zürich
1,BE,Bern
2,LU,Luzern
3,UR,Uri
4,SZ,Schwyz
5,OW,Obwalden
6,NW,Nidwalden
7,GL,Glarus
8,ZG,Zug
9,FR,Freiburg


In [21]:
#Add canton code to our loaded cities
swiss_cities_german = swiss_cities.merge(swiss_cantons_full, on='Kanton')
del swiss_cities_german['Kanton']
swiss_cities_german.columns = ['Name', 'Canton']
swiss_cities_german.head(20)

Unnamed: 0,Name,Canton
0,Zürich,ZH
1,Winterthur,ZH
2,Uster,ZH
3,Dübendorf,ZH
4,Dietikon,ZH
5,Wetzikon,ZH
6,Wädenswil,ZH
7,Horgen,ZH
8,Bülach,ZH
9,Adliswil,ZH


In [22]:
# Load data for swiss towns (english names)
columns = ['Town', 'Canton']
swiss_cities_english = pd.read_csv("swiss_towns.csv", delimiter='\t', usecols=columns, na_values=['data not included in P3', 'nan'])
#swiss_towns.set_index('Town', inplace=True)
swiss_cities_english.columns = ['Name', 'Canton']
swiss_cities_english.head(10)

Unnamed: 0,Name,Canton
0,Aarau,AG
1,Aarberg,BE
2,Aarburg,AG
3,Adliswil,ZH
4,Aesch (BL),BL
5,Affoltern am Albis,ZH
6,Agno,TI
7,Aigle,VD
8,Allschwil,BL
9,Altdorf (UR),UR


In [23]:
# Load data for swiss towns (French names)
columns = ['Ville']
swiss_cities_french = pd.read_csv("swiss_cities_french.csv", delimiter='\t', usecols=columns, na_values=['french nan', 'nan'])
#swiss_towns.set_index('Ville', inplace=True)
swiss_cities_french = swiss_cities_french['Ville'].str.split(', ', expand=True)
swiss_cities_french.columns = ['Name', 'Canton']
swiss_cities_french.head(10)

Unnamed: 0,Name,Canton
0,Zurich,ZH
1,Genève,GE
2,Bâle,BS
3,Lausanne,VD
4,Berne,BE
5,Winterthour,ZH
6,Lucerne,LU
7,Saint-Gall,SG
8,Lugano,TI
9,Bienne,BE


In [None]:
# Create data frame containing city names in all available languages:
# (german, english and french)
swiss_cities_dict = pd.concat([swiss_cities_english, swiss_cities_german, swiss_cities_french])
swiss_cities_dict.drop_duplicates('Name', inplace=True)
swiss_cities_dict.set_index('Name', inplace=True)
swiss_cities_dict.head(10)

Unnamed: 0_level_0,Canton
Name,Unnamed: 1_level_1
Aarau,AG
Aarberg,BE
Aarburg,AG
Adliswil,ZH
Aesch (BL),BL
Affoltern am Albis,ZH
Agno,TI
Aigle,VD
Allschwil,BL
Altdorf (UR),UR


In [None]:
# Direct mapping of canton with our Switzerland town list 

has_info['Canton'] = np.nan

def map_inst_canton(df, column):
    for place in df[column].unique():
        if str(place) == 'nan':
            continue
        for word in place.split():
            if word in swiss_cities_dict.index:
                p = swiss_cities_dict.loc[word]['Canton']
                #print(place, " -> ", p)
                df['Canton'][df[column] == place] = p

map_inst_canton(has_info, 'Institution')
map_inst_canton(has_info, 'University')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
clean_cantons = has_info.dropna(subset = ['Canton'])
clean_cantons.head()

In [None]:
print('Dropped from original:', get_dropped_perc(has_info, clean_cantons))

We are going to parse more results than that but lets work on this small dataset for now.

In [None]:
# List of parsed cantons (Note that some are missing! We will handle that!) 
clean_cantons['Canton'].unique()

In [None]:
#Lets get the amount of subsidy for each canton
grouped_cantons = clean_cantons.groupby('Canton')['Approved Amount'].sum()
grants_cantons = pd.DataFrame(grouped_cantons)
grants_cantons = grants_cantons.reset_index()
grants_cantons.head(26)

In [None]:
# Load data for all swiss cantons
columns = ['Name', 'Canton']
swiss_cantons = pd.read_csv("swiss_cantons.csv", delimiter='\t', usecols=columns, na_values=['Not there', 'nan'])
swiss_cantons.set_index('Canton', inplace=True)
swiss_cantons = swiss_cantons.reset_index()
swiss_cantons.head(26)

In [None]:
# Lets merge our parsed cantons with all cantons in CH
grants_cantons = swiss_cantons.merge(grants_cantons, how='left')
grants_cantons.head(26)

In [None]:
#Fill out those NaN's
grants_cantons.fillna(0, inplace=True)
grants_cantons

In [None]:
# And make a first sketch on the map
cantons_topo = 'ch-cantons.topojson.json'

main_map = folium.Map(location=[46.50,8.20], zoom_start=8)
main_map.choropleth(geo_path=cantons_topo, 
                     data=grants_cantons,
                     columns=['Canton', 'Approved Amount'],
                     key_on='feature.id',
                     topojson='objects.cantons',
                     fill_color='YlGnBu',
                     legend_name = 'Random numbers'
                    )
main_map

## TODO's
* merge geolocalized_df with has_uni
* supply it to chris's method
* display on map
* do a bonus exercise