# Data Wrangling - Assignment 1

## 0. Setup

In [None]:
# Install the packages listed in requirements.txt; %pip (rather than !pip)
# targets the environment of the running kernel.
%pip install -r requirements.txt

In [202]:
from pathlib import Path
from shutil import rmtree

import numpy as np
import pandas as pd
import requests

from bs4 import BeautifulSoup

## 1. Fetching Data and Preprocessing

If `FETCH_DATA` is set to `True`, the raw data from previous executions is deleted and all data is fetched again from the data sources listed below. If it is set to `False`, the files already stored in `raw_data/` are reused.


| Data                      | Source                         | URL                                                                                  | Datatype          |
|:---                       |:---                            |:---                                                                                  | :---              |
| Basic Data on Communes    | Bundesamt für Statistik        | https://dam-api.bfs.admin.ch/hub/api/dam/assets/15864450/master                      | XLSX File         |
| Coat of Arms of Communes  | Staatsarchiv Kanton Luzern     | https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/*\[commune\]*  | JPG Files         |

In [212]:
# Set to False to reuse the files already present in RAW_DATA_PATH instead of
# wiping that directory and downloading everything again.
FETCH_DATA = True

# All downloads are stored in a "raw_data" folder below the current working directory.
RAW_DATA_PATH = Path.cwd().joinpath("raw_data")
COMMUNES_PATH = RAW_DATA_PATH.joinpath("communes.xlsx")
COAT_OF_ARMS_PATH = RAW_DATA_PATH.joinpath("coat_of_arms")

In [213]:
if FETCH_DATA:
    # Discard whatever a previous run left behind so every fetch starts clean.
    if RAW_DATA_PATH.exists():
        rmtree(RAW_DATA_PATH)

    # Recreate the empty directory layout; the parent directory comes first
    # because mkdir() does not create missing parents by default.
    for directory in (RAW_DATA_PATH, COAT_OF_ARMS_PATH):
        directory.mkdir()

### 1.1. Basic Data: XLSX File for Data on all Communes

#### 1.1.1. Fetch XLSX File

In [214]:
if FETCH_DATA:
    bfs_url = "https://dam-api.bfs.admin.ch/hub/api/dam/assets/15864450/master"

    # requests has no default timeout: without one, a stalled connection would
    # block this cell (and therefore "Run All") indefinitely.
    response = requests.get(bfs_url, timeout=60)
    # Fail loudly on an HTTP error instead of saving an error page as the workbook.
    response.raise_for_status()

    # Store the raw bytes unchanged; the workbook is parsed in the next step.
    COMMUNES_PATH.write_bytes(response.content)

#### 1.1.2. Preprocess XLSX File

In [215]:
# Sheet row 5 becomes the header: the rows above it (0-4) and the two rows
# directly below it (6, 7) are skipped, and so are the last 16 rows.
# NOTE(review): presumably title lines, sub-headers and footnotes of the BFS
# workbook - confirm against the file if the source asset ever changes.
communes_raw = pd.read_excel(
    COMMUNES_PATH,
    skiprows=[0, 1, 2, 3, 4, 6, 7],
    skipfooter=16,
    index_col=0,
)

# Cut the last three characters off the labels at column positions 14 and 32.
# NOTE(review): presumably footnote markers appended to those headers - confirm.
trimmed_labels = {
    communes_raw.columns[position]: communes_raw.columns[position][:-3]
    for position in (14, 32)
}

# Build new frames instead of mutating in place, so every step has a clear
# input and output.
communes = communes_raw.rename(columns=trimmed_labels)
# Keep only rows that have a commune code. A boolean mask is used instead of
# drop(labels=np.nan) because drop raises a KeyError when no such row exists.
communes = communes.loc[communes.index.notna()]
communes.index = communes.index.astype("int")

# Label-based slice, both ends inclusive.
# NOTE(review): 1001-1151 is presumably the code range of the communes of the
# canton of Lucerne - confirm.
# .copy() makes communes_lu an independent frame: assigning into a bare slice
# triggers pandas' SettingWithCopyWarning and may or may not write through to
# `communes`.
communes_lu = communes.loc[1001:1151].copy()

# Strip the " (LU)" suffix that follows a commune name.
communes_lu["Gemeindename"] = communes_lu["Gemeindename"].str.replace(
    r'(?P<name>\w+) \(LU\)',
    lambda m: m.group('name'),
    regex=True,
)

# Commune names sorted alphabetically, still indexed by commune code.
communes_lu_names = communes_lu["Gemeindename"].sort_values()
communes_lu

Unnamed: 0_level_0,Gemeindename,Einwohner,Veränderung in %,Bevölkerungs-dichte pro km²,Ausländer in %,0-19 Jahre,20-64 Jahre,65 Jahre und mehr,Rohe Heiratssziffer,Rohe Scheidungsziffer,...,FDP,CVP,SP,SVP,EVP/CSP,GLP,BDP,PdA/Sol.,GPS,Kleine Rechtsparteien
Gemeindecode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,Doppleschwand,789,9.888579,113.525180,4.689480,26.742712,58.681876,14.575412,5.092298,1.273074,...,9.222352,49.198906,1.133255,35.404455,0.078156,3.126221,*,*,1.524033,0.156311
1002,Entlebuch,3280,-0.545785,57.644991,7.012195,21.798780,59.664634,18.536585,4.875076,1.218769,...,17.489127,34.250585,3.688525,38.006022,0.384744,2.316828,*,*,3.604885,0.066912
1004,Flühli,1929,1.847941,17.833041,17.677553,21.876620,60.290306,17.833074,1.028278,0.000000,...,13.944498,39.251691,2.043352,40.066271,0.207097,1.808643,*,*,2.526577,0.110451
1005,Hasle,1736,-0.057571,43.066237,3.456221,22.695853,59.907834,17.396313,2.298851,1.149425,...,9.152593,39.731604,5.352835,41.752111,0.015078,1.537998,*,*,2.246683,0.180941
1007,Romoos,659,-5.451937,17.625033,2.579666,22.003035,59.787557,18.209408,10.566038,3.018868,...,9.183673,50.809289,2.674173,32.653061,0.351865,0.457424,*,*,3.096411,0.774103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,Ufhusen,888,4.716981,72.727273,5.630631,21.959459,60.585586,17.454955,3.351955,2.234637,...,8.81289,46.72805,2.59783,34.824071,1.611312,2.564946,*,*,2.367642,0
1146,Wauwil,2261,30.542725,763.851351,23.175586,20.477665,64.838567,14.683768,7.542147,3.105590,...,17.625199,33.003008,8.653336,27.711909,0.4424,4.67174,*,*,7.485401,0.035392
1147,Wikon,1520,11.355311,183.574879,18.355263,22.828947,59.210526,17.960526,4.591669,1.311906,...,11.936416,22.456647,9.508671,31.50289,2.687861,11.416185,*,*,9.508671,0.433526
1150,Zell,2097,6.717557,150.754853,13.304721,22.317597,59.704340,17.978064,5.320435,1.934704,...,16.972401,36.688151,6.792266,29.862833,0.627995,2.478929,*,*,6.445216,0.033052


### 1.2. Additional Information: Coats of Arms (Gemeindewappen)

In [217]:
def _archive_page_name(commune: str) -> str:
    """Return the page name under which a commune is requested from the archive site.

    Hyphenated and multi-word names are reduced to their first part, and the
    commune 'Willisau' is looked up as 'Willisau-Stadt'. Every other name is
    returned unchanged.
    """
    if '-' in commune:
        return commune.split('-')[0]
    if ' ' in commune:
        return commune.split(' ')[0]
    if commune == 'Willisau':
        return 'Willisau-Stadt'
    return commune


if FETCH_DATA:
    communes_lu_names_dict = communes_lu_names.to_dict()
    sa_lu_url = 'https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/'
    # requests has no default timeout; without one a single stalled connection
    # would block the whole loop indefinitely.
    timeout_s = 30

    # One session for all requests so connections to the same host are reused.
    with requests.Session() as session:
        # fetch images of coat of arms for all communes
        for cid, commune in communes_lu_names_dict.items():
            current_url = sa_lu_url + _archive_page_name(commune)
            print(f'Fetching {current_url}')
            page = session.get(current_url, timeout=timeout_s)
            page.raise_for_status()

            # The coat of arms is the <img> element with this fixed id.
            soup = BeautifulSoup(page.text, 'html.parser')
            img = soup.find(id='maincontent_1_imgImage')
            if img is None or not img.get('src'):
                # Name the offending page instead of failing with an opaque
                # AttributeError/TypeError on None.
                raise ValueError(f'No coat of arms image found on {current_url}')
            img_url = 'https://staatsarchiv.lu.ch' + img.get('src')

            img_response = session.get(img_url, timeout=timeout_s)
            img_response.raise_for_status()

            # Images are stored under the commune code, not the commune name.
            current_img_path = COAT_OF_ARMS_PATH / f'{cid}.jpg'
            current_img_path.write_bytes(img_response.content)

Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Adligenswil
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Aesch
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Alberswil
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Altbüron
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Altishofen
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Ballwil
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Beromünster
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Buchrain
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Buttisholz
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Büron
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Dagmersellen
Fetching https://staatsa