# Data Wrangling - Assignment 1

## 0. Setup

In [None]:
# Install the packages listed in requirements.txt via the IPython %pip magic.
%pip install -r requirements.txt

In [19]:
from pathlib import Path
from shutil import rmtree

import numpy as np
import pandas as pd
import requests


## 1. Fetching Data

If `FETCH_DATA` is set to True, the raw data from previous executions will be deleted. All data is then fetched again from these data sources:


| Data                      | Source                    | URL                                                               | Datatype          |
|:---                       |:---                       |:---                                                               | :---              |
| Basic Data on Communes    | Bundesamt für Statistik   | https://dam-api.bfs.admin.ch/hub/api/dam/assets/15864450/master   | XLSX File         |

In [32]:
# Switch for the download cells below: they only run when this is True.
FETCH_DATA = True

# Locations of the raw downloads, anchored at the current working directory.
RAW_DATA_PATH = Path.cwd().joinpath("raw_data")
COMMUNES_PATH = RAW_DATA_PATH.joinpath("communes.xlsx")

In [33]:
# When a fresh fetch is requested, wipe any raw data left over from a previous
# run and start again from an empty directory.
if FETCH_DATA and RAW_DATA_PATH.exists():
    rmtree(RAW_DATA_PATH)

if FETCH_DATA:
    RAW_DATA_PATH.mkdir()

### 1.1. Basic Data: XLSX File for Data on all Communes

In [34]:
# Download the BFS workbook and store it unchanged at COMMUNES_PATH.
if FETCH_DATA:
    bfs_url = "https://dam-api.bfs.admin.ch/hub/api/dam/assets/15864450/master"

    # requests applies no timeout by default, so a stalled server would block
    # the notebook forever; fail after 60 seconds instead.
    response = requests.get(bfs_url, timeout=60)
    # Abort on HTTP error codes rather than saving an error page as the workbook.
    response.raise_for_status()

    COMMUNES_PATH.write_bytes(response.content)

### 1.2. Additional Information: Gemeindewappen 

## 2. Preprocessing


### 2.1. Basic Data

In [146]:
# Load the communes workbook. Rows 0-4, 6 and 7 are skipped, which leaves row 5
# as the header row; the last 16 rows are ignored and the first column becomes
# the index.
# NOTE(review): presumably the skipped rows are title/unit rows and the footer
# holds footnotes - confirm against the downloaded workbook.
communes = pd.read_excel(
    COMMUNES_PATH,
    skiprows=[0, 1, 2, 3, 4, 6, 7],
    skipfooter=16,
    index_col=0,
)

# Cut the last three characters off the labels of columns 14 and 32.
# NOTE(review): presumably these are footnote markers appended to the header
# text - verify against the workbook.
communes = communes.rename(
    columns={
        communes.columns[14]: communes.columns[14][:-3],
        communes.columns[32]: communes.columns[32][:-3],
    }
)

# Discard rows without an index label. A boolean filter is used instead of
# drop(labels=np.nan) so that a sheet without such rows does not raise KeyError.
communes = communes.loc[communes.index.notna()]
communes.index = communes.index.astype("int")

# Label-based slice of the index range 1001..1151 (output below suggests these
# are the communes of canton Lucerne). The explicit copy makes the frame
# independent of `communes`, so the assignment below neither triggers
# SettingWithCopyWarning nor depends on view/copy semantics.
communes_lu = communes.loc[1001:1151].copy()

# Remove the " (LU)" marker that follows a word in the commune name,
# e.g. "Aesch (LU)" -> "Aesch".
communes_lu["Gemeindename"] = communes_lu["Gemeindename"].str.replace(
    r'(?P<name>\w+) \(LU\)',
    lambda m: m.group('name'),
    regex=True,
)

# NOTE: the variable name is misspelled ("cummunes"); it is kept as-is because
# later cells may refer to it.
cummunes_lu_names = communes_lu["Gemeindename"]
cummunes_lu_names.to_dict()

{1001: 'Doppleschwand',
 1002: 'Entlebuch',
 1004: 'Flühli',
 1005: 'Hasle',
 1007: 'Romoos',
 1008: 'Schüpfheim',
 1009: 'Werthenstein',
 1010: 'Escholzmatt-Marbach',
 1021: 'Aesch',
 1023: 'Ballwil',
 1024: 'Emmen',
 1025: 'Ermensee',
 1026: 'Eschenbach',
 1030: 'Hitzkirch',
 1031: 'Hochdorf',
 1032: 'Hohenrain',
 1033: 'Inwil',
 1037: 'Rain',
 1039: 'Römerswil',
 1040: 'Rothenburg',
 1041: 'Schongau',
 1051: 'Adligenswil',
 1052: 'Buchrain',
 1053: 'Dierikon',
 1054: 'Ebikon',
 1055: 'Gisikon',
 1056: 'Greppen',
 1057: 'Honau',
 1058: 'Horw',
 1059: 'Kriens',
 1061: 'Luzern',
 1062: 'Malters',
 1063: 'Meggen',
 1064: 'Meierskappel',
 1065: 'Root',
 1066: 'Schwarzenberg',
 1067: 'Udligenswil',
 1068: 'Vitznau',
 1069: 'Weggis',
 1081: 'Beromünster',
 1082: 'Büron',
 1083: 'Buttisholz',
 1084: 'Eich',
 1085: 'Geuensee',
 1086: 'Grosswangen',
 1088: 'Hildisrieden',
 1089: 'Knutwil',
 1091: 'Mauensee',
 1093: 'Neuenkirch',
 1094: 'Nottwil',
 1095: 'Oberkirch',
 1097: 'Rickenbach',
 1098