# Interactive Visualization

In [3]:
#WARNING guys! In order to use folium you are gonna need to run
# pip install folium
#in your terminal.
import folium

#Probably gonna use this:
ext
# Normal stack of pandas, numpy, matplotlib and seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# For handeling the web requests
import requests
from collections import defaultdict
from pprint import *

# Statistical test library
import scipy.stats as stats
from helpers import *

%matplotlib inline

Folium: https://github.com/python-visualization/folium
 
 Documentation:
 1. https://folium.readthedocs.io/en/latest/
 2. https://media.readthedocs.org/pdf/folium/latest/folium.pdf

In [2]:
#Folium test
map_osm = folium.Map(location=[46.8076878,7.1004592], zoom_start=10)
map_osm

# Data wrangeling

Data source (P3_GrantExport.csv)
and description of the columns: http://p3.snf.ch/Pages/DataAndDocumentation.aspx

In [4]:
# Load the data we need. (Columns: Insti)
columns = ['Institution', 'University','Approved Amount']
p3 = pd.read_csv("P3_GrantExport.csv", delimiter=';', usecols=columns, na_values=['data not included in P3', 'nan'])

To be able to work with the data, we have to filter out the values where we have enough information.

We start by filtering out the projects where we know at least the University or the institution related to the project

In [5]:
missing__info = p3[p3[['Institution', 'University']].isnull().all(axis=1)].index
has_info = p3.drop(missing__info, axis=0)

print('Dropped from original:', get_dropped_perc(p3, has_info))


Dropped from original: 2.3261267176288514


As we can see, this leaves out 2.33% of the original data.

In [6]:
grant_info = has_info[has_info[['Approved Amount']].isnull().all(axis=1) == False]
grants_data = has_info.loc[grant_info.index]
grants_data.head()

Unnamed: 0,Institution,University,Approved Amount
0,,Nicht zuteilbar - NA,11619.0
1,Faculté de Psychologie et des Sciences de l'Ed...,Université de Genève - GE,41022.0
2,Kommission für das Corpus philosophorum medii ...,"NPO (Biblioth., Museen, Verwalt.) - NPO",79732.0
3,Abt. Handschriften und Alte Drucke Bibliothek ...,Universität Basel - BS,52627.0
4,Schweiz. Thesauruskommission,"NPO (Biblioth., Museen, Verwalt.) - NPO",120042.0


In [7]:
print('Dropped from original:', get_dropped_perc(p3, grants_data))
print('Dropped from last step:', get_dropped_perc(has_info, grants_data))

Dropped from original: 18.340133502165113
Dropped from last step: 16.395384196795824


So, for about 16.4% of the data, we don't have any information about the grant given. 

# Get the geo info

From the description of the dataset, we have the folloring description:
> ### Institution
> According to the information submitted by the responsible applicant, this is the research institution where the project will largely be carried out. Typically, this is the institution or specific lab where the responsible applicant works. The institution is therefore linked to the application. This field is not structured, but its content is consolidated internally. There is no link in the database to the field University.

> ### University
> This is the institution where the project will largely be carried out according to the application. Pick list. This field is only filled if the research is carried out at a Swiss institution, otherwise the field remains blank. In the case of mobility fellowships, it is generally left empty.


So, we know that for the rows where the university is given, the grant was awarded to a swizz university. However, we do not know much about the cases where we only have the institution information. We descided to look into these cases further.

In [8]:
has_uni = has_info[has_info['University'].isnull() == False]
has_uni.head()
has_uni.shape

(50988, 3)

defaultdict(str, {})

In [10]:
uni_grouped = has_uni.groupby(['University'])
for group in uni_grouped:
    print(group)

('AO Research Institute - AORI',                                              Institution  \
4900                         AO Forschungsinstitut (ARI)   
5026                         AO Forschungsinstitut (ARI)   
5103                         AO Forschungsinstitut (ARI)   
5554                         AO Forschungsinstitut (ARI)   
5629                         AO Forschungsinstitut (ARI)   
6777                         AO Forschungsinstitut (ARI)   
7621                         AO Forschungsinstitut (ARI)   
13833                        AO Forschungsinstitut (ARI)   
21745                        AO Forschungsinstitut (ARI)   
36018                        AO Forschungsinstitut (ARI)   
39633  Biomaterials & Tissue Engineering Program AO R...   
39648  Biomaterials & Tissue Engineering Program AO R...   
47171  Biomaterials & Tissue Engineering Program AO R...   
48067  Biomaterials & Tissue Engineering Program AO R...   
54679  Biomaterials & Tissue Engineering Program AO R...   
57368  

In [106]:
username = 'user1'
url = r'http://api.geonames.org/searchJSON?'
geo_cache = defaultdict(str)

''' Do a lookup on geonames.org for name. Returns the first query result. '''
def query_geonames(name):
    # Insert username
    try:
        encoded_args = urlencode({'name': name, 'country':'ch', 'maxRows':'10', 'username': username} )
        r = requests.get(url + encoded_args)
        result = r.json()
        
        if result['totalResultsCount'] > 0:
            geonames = result['geonames']
            geo = geonames[0]
            
            canton = geo['adminCode1']
            lat = geo['lat']
            long = geo['lng']
            
            return {'canton': canton, 'lat': lat, 'long': long}
        else:
            return -1
    except BaseException as e:
        print('For name:', name)
        pprint(e)
        return -1
        
def geo_lookup(name):
    if geo_cache[name] != '':
        result = geo_cache[name]
    else:
        result = query_geonames(name)
        geo_cache[name] = result
    return result

def get_geo_dict(group):
    geo_res = {}
    geo_err = set()
    
    for ind, group in group:
        name = ind.split(' - ')[0]

        res = geo_lookup(name)

        if res != -1:
            geo_res[name] = res
        else:
            second_try = name.split(' ')[-1].strip(')')
            res = geo_lookup(second_try)
            if res != -1:
                geo_res[name] = res
            else:
                geo_err.add(ind)
    
    return geo_res, geo_err

In [114]:
res, err = get_geo_dict(uni_grouped)

count = 0
for e in err:
    try:
        print(e, has_info[has_info['University'] == e].size)
    except:
        count += 1
#has_info['University'].value_counts()

Allergie- und Asthmaforschung - SIAF 132
Istituto Svizzero di Roma - ISR 18
Forschungsinstitut für biologischen Landbau - FIBL 51
Schweizer Kompetenzzentrum Sozialwissensch. - FORS 93
Inst. universit. romand de Santé au Travail - IST 105
NPO (Biblioth., Museen, Verwalt.) - NPO 4419
HES de Suisse occidentale - HES-SO 813
Eidg. Anstalt für Wasserversorgung - EAWAG 999
Schweizer Paraplegiker Forschung - SPF 30
Eidg. Material und Prüfungsanstalt - EMPA 714
Swiss Center for Electronics and Microtech. - CSEM 84
Forschungsinstitut für Opthalmologie - IRO 42
Forschungskommission SAGW 3
Weitere Spitäler - ASPIT 243
Nicht zuteilbar - NA 7785
Firmen/Privatwirtschaft - FP 1476
Schweiz. Institut für Kunstwissenschaft - SIK-ISEA 60
Inst. Suisse de Spéléologie et Karstologie - ISSKA 9
Inst. de Hautes Etudes Internat. et du Dév - IHEID 594
Eidg. Forschungsanstalt für Wald,Schnee,Land - WSL 669
Fachhochschule Kalaidos - FHKD 9
Haute école pédagogique fribourgeoise - HEPFR 21
Swiss Institute of Bioinfor

In [80]:
geo_res = {}

for name, group in uni_grouped:
    name = name.split(' - ')[0]
    
    res = geo_lookup(name)
    
    if res != -1:
        geo_res[name] = res
    else:
        second_try = name.split(' ')[-1].strip(')')
        res = geo_lookup(second_try)
        if res != -1:
            geo_res[name] = res
        else:
            print('-', name, '/', second_try)
               
for key in geo_res.keys():
    print(key, geo_res[key])

- Allergie- und Asthmaforschung / Asthmaforschung
- Eidg. Anstalt für Wasserversorgung / Wasserversorgung
- Eidg. Forschungsanstalt für Wald,Schnee,Land / Wald,Schnee,Land
- Eidg. Hochschulinstitut für Berufsbildung / Berufsbildung
- Eidg. Material und Prüfungsanstalt / Prüfungsanstalt
- Fachhochschule Kalaidos / Kalaidos
- Firmen/Privatwirtschaft / Firmen/Privatwirtschaft
- Forschungsinstitut für Opthalmologie / Opthalmologie
- Forschungsinstitut für biologischen Landbau / Landbau
- Forschungskommission SAGW / SAGW
For name: occidentale
KeyError('adminCode1',)
- HES de Suisse occidentale / occidentale
- Haute école pédagogique fribourgeoise / fribourgeoise
- Inst. Suisse de Spéléologie et Karstologie / Karstologie
- Inst. de Hautes Etudes Internat. et du Dév / Dév
- Inst. universit. romand de Santé au Travail / Travail
- Istituto Svizzero di Roma / Roma
- NPO (Biblioth., Museen, Verwalt.) / Verwalt.
- Nicht zuteilbar / zuteilbar
- Schweiz. Institut für Kunstwissenschaft / Kunstwissens

In [74]:

df = pd.DataFrame.from_dict(geo_res).T
df.describe()

df[['lat','long']] = df[['lat','long']].apply(pd.to_numeric)
df.describe()



Unnamed: 0,lat,long
count,53.0,53.0
mean,46.979823,8.183785
std,0.530679,0.900651
min,46.01008,6.14569
25%,,
50%,,
75%,,
max,47.69732,9.83723


In [None]:
df.pivot(index='')

In [None]:
has_uni.head(100)

In [None]:
p = geo_lookup('University of Geneve')

In [None]:
print(p)

In [None]:
print('Dropped from original:', get_dropped_perc(p3, has_uni))
print('Dropped from last step:', get_dropped_perc(grants_data, has_uni))

Lets extract country code from University name and make a column out of it

In [None]:
has_code = has_uni['University'].str.extract('(?P<University>.*?(?= -))(?P<delim>\ -\ )(?P<Code>.*)', expand=True).drop('delim', axis=1)
has_code.head()

In [None]:
# Drop NAs and NPOs in Canton Codes (not avaialable and NGOs)
has_code = has_code[has_code["Code"] != 'NPO']
has_code = has_code[has_code["Code"] != 'NA']
has_code.head()

In [None]:
# For these rows, we allready have the canton code
has_canton_code = has_code['Code'][has_code['Code'].apply(is_canton_code)].rename('Canton code')

has_canton_code.index

In [None]:
# For the rest, we will try to get the canton code by requesting the GeoNames web service
has_uni_code = has_code['Code'][has_code['Code'].apply(is_canton_code) == False]

has_uni_no_cantoncode = has_uni.loc[has_uni_code.index]
has_uni_no_cantoncode.head()

We don't want to be banned from GeoLookup so first lets check how many missing values are we considering:

In [None]:
has_uni_no_cantoncode['University'].unique().shape

In [None]:
## Whats the point of this loop?

for place in has_uni_no_cantoncode['University'].unique():
    if place:
        p = geo_lookup(place)
    if p:
        print(place)
        pprint(p)

In [None]:
has_info['Institution'].unique().shape

In [None]:
'''
def map_institution_canton(df):
    for place in df['Institution'].unique():
        if str(place) == 'nan':
            continue
        if str(place).startswith('Uni'):
            continue
        #c = get_canton(place)
        #if c == '':
        p = geo_lookup(place)
        if p:
            print('found', p, 'in the api')
            canton_dict[place] = p
        else:
            has_looked_up[place] = True
            print(place, p)


map_institution_canton(has_info)
'''

canton_dict['Université de Genève'] = 'GE'

In [None]:
# Load data for swiss cities
columns = ['Name', 'Kanton']
swiss_cities = pd.read_csv("swiss_cities.csv", delimiter='\t', usecols=columns, na_values=['data not included in P3', 'nan'])
swiss_cities.head(10)

In [None]:
# Clean Kantons up
#swiss_cities['Kanton'] = swiss_cities['Kanton'].apply(lambda x:get_canton_code(x.rsplit()[-1])) #str.rsplit(None, 1)[0]) # = swiss_cities['Kanton'].map(.str.rsplit(None, 1)[0]
#swiss_cities.head(20)
# Broken since German names

In [None]:
# Load data for swiss towns
columns = ['Town', 'Canton']
swiss_towns = pd.read_csv("swiss_towns.csv", delimiter='\t', usecols=columns, na_values=['data not included in P3', 'nan'])
swiss_towns.set_index('Town', inplace=True)
swiss_towns.head(10)

In [None]:
# Test for particular town
#swiss_towns.loc[swiss_towns.Town == 'Basel']
swiss_towns.loc['Basel']['Canton']

In [None]:
# Direct mapping of canton with Switzerland town list from wikipedia https://en.wikipedia.org/wiki/List_of_cities_in_Switzerland

def map_inst_canton(df):
    for place in df['Institution'].unique():
        if str(place) == 'nan':
            continue
        for word in place.split():
            #print(word)
            if word in swiss_towns.index:
                p = swiss_towns.loc[word]['Canton']
                print(place, " -> ", p)
                df['Canton'] = p

map_inst_canton(has_info)

In [None]:
has_info.head(200)

In [None]:
pprint(canton_dict)

In [None]:
grants_with_has_code = university_grants.join(has_canton_code)
grants_with_has_code.head()

In [None]:
university_grants.head()

In [None]:
extracts = university_grants['University'].str.extract('(?P<University>.*-)(?P<Code>.*)', expand=True)
has_code = pd.DataFrame( extracts['Code'] )
has_code.Code.apply(is_canton_code)
grouped_has_code = has_code.groupby('Code')

In [None]:
only_institution_data = grants_data[grants_data[['University']].isnull().all(axis=1)].index

## TODO:

1. clean up this mess (:

1. make one function for mapping a row to a canton
   Suggested strategy: 
   
   (we can only make 200 req. pr hour to the api, so we should try to keep the number down)
   
   1. Unique institutions, unique universities
   2. Lookup institution name, place result in hashmap: institution -> canton
   3. Parse university. If no canton code is given, lookup. Place result in the hashmap
   4. In the function, do a query in our hashmap for each column. If conflict, return (?)

1. apply the function to all rows in the data, add canton as a column
1. make a df of Canton, Approved Amount
1. Groupby canton, sum approved_amount, add a count column
1. Plot the results on the map etc
1 Bonus
