# 1. Setup

In [34]:
# importing modules
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
import fiona
import gdal

# setting up paths
# NOTE: every directory keeps its trailing slash because file names are appended
# by plain f-string concatenation further down (e.g. f'{path_languages}{fname_master}').
path_languages = 'data/language_data/'   # folder holding the Excel master list
path_boundaries = 'data/boundaries/'     # folder holding the canton shapefile
path_output = 'data/preprocessed/'       # target folder for the exported GeoJSON files

# setting up file names
fname_master = '1_Masterlist_161019.xlsx'        # workbook read with pd.read_excel
fname_cantons = 'boundaries_swiss_cantons.shp'   # shapefile read with gpd.read_file

# 2. Loading data

Loading all language data (Bildung, Sprachschulen and Botschaften) and adding geometry columns

In [35]:
# Sheets of the master workbook, in the order they are unpacked below.
sheet_names = ['Bildung','Sprachschulen','Botschaften']

# Passing the whole list of sheet names lets pandas open and parse the workbook
# once and hand back a {sheet name: DataFrame} dict, instead of re-reading the
# same file once per sheet.
sheets_by_name = pd.read_excel(f'{path_languages}{fname_master}', sheet_name=sheet_names)

# Keep `dfs` as a list in `sheet_names` order so existing code that indexes it still works.
dfs = [sheets_by_name[name] for name in sheet_names]
df_bildung, df_sprachschulen, df_botschaften = dfs

In [36]:
# Build one Point per row from the coordinate columns.
# NOTE: the columns are addressed as 'x ' and 'y ' -- with a trailing space.
df_bildung['geometry'] = [
    Point(x_value, y_value)
    for x_value, y_value in zip(df_bildung['x '], df_bildung['y '])
]

# Short codes for the school types found in the 'Typ' column.
# The keys must match the spreadsheet text exactly (including the trailing space
# in the last key). The insertion order of this dict also fixes the order of the
# per-type total columns derived from it later in the notebook.
dict_type_abbreviations = {
    'Agrupación de lengua y cultura española': 'ALCE',
    'Diploma de español lengua extranjera': 'DELE',
    'Instituto de ensenãnza media': 'IEM',
    'Universidad': 'U',
    'Centro universitário de idiomas': 'CUI',
    'Escuela de enseñanza primaria ': 'EEP',
}

# Plain indexing (not .get) on purpose: an unknown 'Typ' value raises KeyError
# instead of silently producing a missing abbreviation.
df_bildung['type_abbr'] = [
    dict_type_abbreviations[school_type] for school_type in df_bildung['Typ']
]

gdf_bildung = gpd.GeoDataFrame(df_bildung, geometry='geometry')

gdf_bildung.head()

Unnamed: 0,ID,Typ,Name,x,y,Total,geometry,type_abbr
0,#1,Agrupación de lengua y cultura española,ALCE,8.045701,47.390434,30,POINT (8.0457015 47.390434),ALCE
1,#2,Agrupación de lengua y cultura española,ALCE,7.600514,47.474485,38,POINT (7.600514 47.474485),ALCE
2,#3,Agrupación de lengua y cultura española,ALCE,8.309816,47.471368,71,POINT (8.309816 47.471368),ALCE
3,#4,Agrupación de lengua y cultura española,ALCE,7.605457,47.563638,66,POINT (7.605457 47.563638),ALCE
4,#5,Agrupación de lengua y cultura española,ALCE,7.588968,47.569927,67,POINT (7.588968 47.569927),ALCE


In [37]:
# One Point(x, y) per language school. In the preview of this sheet 'x' holds
# values around 9 and 'y' values around 47, so the points come out in the same
# axis order as the Bildung layer.
df_sprachschulen['geometry'] = [
    Point(x_value, y_value)
    for x_value, y_value in zip(df_sprachschulen['x'], df_sprachschulen['y'])
]
gdf_sprachschulen = gpd.GeoDataFrame(df_sprachschulen, geometry='geometry')
gdf_sprachschulen.head()

Unnamed: 0,ID,Typ,Name,y,x,Adresse,geometry
0,#230,Escuela de idiomas,Klubschule Migros St. Gallen,47.425059,9.376588,"Marktgasse 17, 9000 St. Gallen",POINT (9.376587799999999 47.4250593)
1,#231,Escuela de idiomas,Klubschule Migros Arbon,47.514926,9.430403,"Rebenstrasse 20, 9320 Arbon",POINT (9.430402600000001 47.5149257)
2,#232,Escuela de idiomas,Klubschule Migros Buchs,7.164124,9.47369,"Churerstrasse 7, 9470 Buchs",POINT (9.47369 7.164124)
3,#233,Escuela de idiomas,Klubschule Migros Chur,46.852501,9.525666,"Gartenstrasse 5, 7001 Chur",POINT (9.525665999999999 46.852501)
4,#234,Escuela de idiomas,Klubschule Migros Frauenfeld,47.556236,8.896414,"Rheinstrasse 10, 8500 Frauenfeld",POINT (8.896413900000001 47.5562359)


In [38]:
# Geometry column.
# NOTE: the arguments are deliberately Point(y, x) here. In this sheet the
# headers are swapped relative to the other two sheets (the preview shows
# x around 46-47 and y around 6-8), so passing 'y' first gives the same axis
# order as the Bildung and Sprachschulen points.
df_botschaften['geometry'] = [
    Point(y_value, x_value)
    for x_value, y_value in zip(df_botschaften['x'], df_botschaften['y'])
]
gdf_botschaften = gpd.GeoDataFrame(df_botschaften, geometry='geometry')

# File name of the leaflet marker image (png), derived from the lower-cased
# value of the 'Länder' column.
gdf_botschaften['marker'] = [
    f"{country.lower()}_flag.png" for country in gdf_botschaften['Länder']
]

gdf_botschaften.head()

Unnamed: 0,ID,Länder,Institution,laut EDA,Adresse,Unnamed: 5,x,y,geometry,marker
0,#345,Argentinien,Botschaft,Sección consular de la Embajada de la Republic...,"Jungfraustrasse 1, 3005 Berne",http://www.esuiz.mrecic.gob.ar,46.943993,7.454704,POINT (7.454704 46.943993),argentinien_flag.png
1,#346,Bolivia,Konsulat,Consulado général de la Republica de Bolivia,"Rue de Lausanne 72, 1202 Genève",http://www.Konsulatboliviasuiza.com,46.215722,6.147492,POINT (6.147492 46.215722),bolivia_flag.png
2,#347,Chile,Botschaft,Sección consular de la Embajada de Chile,"Eigerplatz 5, 3007 Berne",http://chileabroad.gov.cl/berna,46.940963,7.431628,POINT (7.431628 46.940963),chile_flag.png
3,#348,Chile,Konsulat,Consulado de la Republica de Chile,"Fuhrstrasse 12, Unterer Leihof, 8820 Wädenswil",,47.225837,8.669301,POINT (8.669301000000001 47.225837),chile_flag.png
4,#349,Colombia,Konsulat,Sección consular de la Embajada de Colombia,"Zieglerstrasse 29, 3007 Berne",http://berna.Konsulat.gov.co,46.945014,7.42996,POINT (7.42996 46.945014),colombia_flag.png


Loading Swiss canton boundaries, fixing canton names, and adding centroid point to each canton

In [39]:
# Read the canton shapefile and keep only the name and the polygon geometry.
gdf_cantons = gpd.read_file(f'{path_boundaries}{fname_cantons}')
gdf_cantons = gdf_cantons[['NAME','geometry']]
gdf_cantons = gdf_cantons.rename(columns={'NAME': 'canton'})

# Preserve the original outlines in a separate column; the point-in-canton
# lookup later in the notebook needs them after 'geometry' is overwritten.
gdf_cantons['boundary'] = gdf_cantons['geometry']

# adding centroid
# Use the vectorised GeoSeries.centroid instead of rebuilding every geometry
# with Polygon(...) row by row: it gives the same points for plain polygons and
# also works for multi-part geometries, which the Polygon constructor rejects.
# (geopandas may warn that centroids in a geographic CRS are approximate.)
gdf_cantons['geometry'] = gdf_cantons.geometry.centroid

# remapping names
# Canton names that arrive with a replacement character in place of the
# accented letter, mapped to their correct spelling.
# NOTE(review): this looks like the shapefile attributes being decoded with the
# wrong codec -- passing an explicit encoding to gpd.read_file might make this
# table unnecessary; verify against the source data.
_CANTON_NAME_FIXES = {
    'Graub�nden': 'Graubünden',
    'Z�rich': 'Zürich',
    'Gen�ve': 'Genève',
    'Neuch�tel': 'Neuchâtel',
}


def remap_names(row):
    """Return the corrected canton name for a row.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'canton'`` (e.g. a DataFrame row).

    Returns
    -------
    str
        The repaired spelling when the name is one of the known garbled
        variants, otherwise the name unchanged.
    """
    current_name = row['canton']
    return _CANTON_NAME_FIXES.get(current_name, current_name)
# Run every canton row through remap_names and store the repaired names,
# then preview the result.
gdf_cantons['canton'] = [remap_names(canton_row) for _, canton_row in gdf_cantons.iterrows()]

gdf_cantons.head()

Unnamed: 0,canton,geometry,boundary
0,Graubünden,POINT (9.62862381268349 46.65606579160512),POLYGON Z ((8.877053154798531 46.8129134746790...
1,Bern,POINT (7.624744294648847 46.82208787558518),POLYGON Z ((7.153521643312189 46.9862818266617...
2,Valais,POINT (7.605940034344669 46.20935513994819),POLYGON Z ((8.47762548140741 46.52761948354787...
3,Vaud,POINT (6.646681769033475 46.55944971911612),POLYGON Z ((6.779825385513289 46.8529610472984...
4,Ticino,POINT (8.808554728958551 46.29606041574468),POLYGON Z ((8.47762548140741 46.52761948354787...


# 3. Preprocessing

Adding canton to each point entry for all data frames

In [40]:
# Parallel lists (same row order) of canton names and their boundary polygons.
list_names = gdf_cantons['canton'].tolist()
list_boundaries = gdf_cantons['boundary'].tolist()


def in_canton(row):
    """Name the canton that contains the row's point.

    The cantons are scanned in ``list_names`` order and the first one whose
    boundary the point lies within is returned; ``'not found'`` is returned
    when no boundary matches.
    """
    point = Point(row['geometry'])
    containing = (
        name
        for name, boundary in zip(list_names, list_boundaries)
        if point.within(boundary)
    )
    return next(containing, 'not found')


# Same lookup for each of the three point layers.
for gdf_layer in (gdf_bildung, gdf_sprachschulen, gdf_botschaften):
    gdf_layer['canton'] = gdf_layer.apply(in_canton, axis=1)

Getting total number of students (Bildung) per type for each canton

In [41]:
# Total students per (school type, canton), as {(type_abbr, canton): total}.
# The 'Total' column is selected BEFORE aggregating: calling .sum() on the whole
# grouped frame would also try to add up the text and geometry columns, which is
# wasted work at best and an error on recent pandas versions.
dict_total = gdf_bildung.groupby(['type_abbr','canton'])['Total'].sum().to_dict()


def n_students(row):
    """Return the student totals of one canton, one entry per school type.

    The list follows the order of ``dict_type_abbreviations.values()``; a type
    with no entry for the row's canton contributes 0.
    """
    totals = [dict_total.get((type_abbr,row['canton']),0) for type_abbr in dict_type_abbreviations.values()]
    return totals


# One output column per school type, in the same order n_students uses.
total_names_bildung = [f'total_{type_abbr}' for type_abbr in dict_type_abbreviations.values()]
# result_type='expand' turns each returned list into one column per element.
gdf_cantons[total_names_bildung] = gdf_cantons.apply(n_students, axis=1, result_type='expand')
gdf_cantons.head()

Unnamed: 0,canton,geometry,boundary,total_ALCE,total_DELE,total_IEM,total_U,total_CUI,total_EEP
0,Graubünden,POINT (9.62862381268349 46.65606579160512),POLYGON Z ((8.877053154798531 46.8129134746790...,0,52,0,0,0,0
1,Bern,POINT (7.624744294648847 46.82208787558518),POLYGON Z ((7.153521643312189 46.9862818266617...,394,70,196,120,0,0
2,Valais,POINT (7.605940034344669 46.20935513994819),POLYGON Z ((8.47762548140741 46.52761948354787...,84,21,0,0,0,0
3,Vaud,POINT (6.646681769033475 46.55944971911612),POLYGON Z ((6.779825385513289 46.8529610472984...,612,43,902,302,401,0
4,Ticino,POINT (8.808554728958551 46.29606041574468),POLYGON Z ((8.47762548140741 46.52761948354787...,67,0,468,0,0,0


Getting total number of Sprachschulen for each canton

In [42]:
# Number of language schools per canton, as {canton: count}.
dict_schulen = gdf_sprachschulen.groupby('canton').size().to_dict()
# NOTE(review): the printed dict has a 'not found' bucket. The preview of the
# Sprachschulen sheet shows one row (Buchs) with y = 7.164124 while its
# neighbours are around 47 -- presumably a typo in the master list; verify.
print(dict_schulen)


def n_schulen(row):
    """Return how many language schools fall in the row's canton (0 if none)."""
    return dict_schulen.get(row['canton'], 0)


gdf_cantons['total_Schulen'] = [n_schulen(canton_row) for _, canton_row in gdf_cantons.iterrows()]
gdf_cantons.head()

{'Aargau': 9, 'Basel-Stadt': 2, 'Bern': 13, 'Fribourg': 3, 'Genève': 10, 'Glarus': 1, 'Graubünden': 2, 'Luzern': 2, 'Neuchâtel': 5, 'Schaffhausen': 1, 'Solothurn': 5, 'St. Gallen': 6, 'Thurgau': 3, 'Ticino': 11, 'Valais': 13, 'Vaud': 8, 'Zug': 3, 'Zürich': 17, 'not found': 1}


Unnamed: 0,canton,geometry,boundary,total_ALCE,total_DELE,total_IEM,total_U,total_CUI,total_EEP,total_Schulen
0,Graubünden,POINT (9.62862381268349 46.65606579160512),POLYGON Z ((8.877053154798531 46.8129134746790...,0,52,0,0,0,0,2
1,Bern,POINT (7.624744294648847 46.82208787558518),POLYGON Z ((7.153521643312189 46.9862818266617...,394,70,196,120,0,0,13
2,Valais,POINT (7.605940034344669 46.20935513994819),POLYGON Z ((8.47762548140741 46.52761948354787...,84,21,0,0,0,0,13
3,Vaud,POINT (6.646681769033475 46.55944971911612),POLYGON Z ((6.779825385513289 46.8529610472984...,612,43,902,302,401,0,8
4,Ticino,POINT (8.808554728958551 46.29606041574468),POLYGON Z ((8.47762548140741 46.52761948354787...,67,0,468,0,0,0,11


Getting total number of Botschaften and Konsulate for each canton

In [43]:
# Count of diplomatic missions per (institution type, canton),
# as {(Institution, canton): count}.
dict_institutions = gdf_botschaften.groupby(['Institution','canton']).size().to_dict()
print(dict_institutions)

# Institution types that get their own total column, in column order.
types_institutions = ['Botschaft','Konsulat']


def n_institutions(row):
    """Return the per-type institution counts for the row's canton.

    The list follows ``types_institutions`` order; a type with no entry for
    the canton contributes 0.
    """
    canton = row['canton']
    counts = []
    for institution_type in types_institutions:
        counts.append(dict_institutions.get((institution_type, canton), 0))
    return counts


column_names = [f'total_{type_}' for type_ in types_institutions]
# result_type='expand' spreads each returned list over the new columns.
gdf_cantons[column_names] = gdf_cantons.apply(n_institutions, axis=1, result_type='expand')
gdf_cantons.head()

{('Botschaft', 'Bern'): 12, ('Botschaft', 'Genève'): 4, ('Konsulat', 'Basel-Stadt'): 1, ('Konsulat', 'Bern'): 5, ('Konsulat', 'Genève'): 5, ('Konsulat', 'Ticino'): 2, ('Konsulat', 'Vaud'): 1, ('Konsulat', 'Zürich'): 7, ('Konsulat', 'not found'): 1}


Unnamed: 0,canton,geometry,boundary,total_ALCE,total_DELE,total_IEM,total_U,total_CUI,total_EEP,total_Schulen,total_Botschaft,total_Konsulat
0,Graubünden,POINT (9.62862381268349 46.65606579160512),POLYGON Z ((8.877053154798531 46.8129134746790...,0,52,0,0,0,0,2,0,0
1,Bern,POINT (7.624744294648847 46.82208787558518),POLYGON Z ((7.153521643312189 46.9862818266617...,394,70,196,120,0,0,13,12,5
2,Valais,POINT (7.605940034344669 46.20935513994819),POLYGON Z ((8.47762548140741 46.52761948354787...,84,21,0,0,0,0,13,0,0
3,Vaud,POINT (6.646681769033475 46.55944971911612),POLYGON Z ((6.779825385513289 46.8529610472984...,612,43,902,302,401,0,8,0,1
4,Ticino,POINT (8.808554728958551 46.29606041574468),POLYGON Z ((8.47762548140741 46.52761948354787...,67,0,468,0,0,0,11,0,2


# 4. Exporting

All files are exported in GeoJSON format

In [44]:

# Canton layer: keep the name, the centroid geometry and the per-type student
# totals. The 'boundary' column is left out, so the exported frame carries a
# single geometry column.
# NOTE(review): 'total_Schulen', 'total_Botschaft' and 'total_Konsulat' are
# computed earlier in the notebook but are not part of this selection, so they
# never reach cantons.geojson -- confirm whether that is intended.
keys = ['canton', 'geometry'] + list(total_names_bildung)
gdf_cantons = gdf_cantons[keys]
gdf_cantons.to_file(f'{path_output}cantons.geojson', driver='GeoJSON')

# Point layers (Bildung, Sprachschulen, Botschaften), written in that order,
# each to its own file.
for layer_name, gdf_layer in (
    ('bildung', gdf_bildung),
    ('sprachschulen', gdf_sprachschulen),
    ('botschaften', gdf_botschaften),
):
    gdf_layer.to_file(f'{path_output}{layer_name}.geojson', driver='GeoJSON')