# Data Wrangling - Assignment 1

## 0. Setup

In [None]:
%pip install -r requirements.txt

In [40]:
from pathlib import Path
from shutil import rmtree
from typing import Dict, List, Tuple
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests

from bs4 import BeautifulSoup

## 1. Fetching Data and Preprocessing

If `FETCH_DATA` is set to True, the raw data from previous executions will be deleted. All data is then fetched again from these data sources:


| Data                      | Source                         | URL                                                                                  | Datatype          |
|:---                       |:---                            |:---                                                                                  | :---              |
| Basic Data on Communes    | Bundesamt für Statistik        | https://dam-api.bfs.admin.ch/hub/api/dam/assets/15864450/master                      | XLSX File         |
| Coat of Arms of Communes  | Staatsarchiv Kanton Luzern     | https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/*\[commune\]*  | JPG Files         |

In [30]:
# When True, the raw data directory is wiped and every source is downloaded again.
FETCH_DATA = True

# Everything that gets downloaded lives below the current working directory.
RAW_DATA_PATH = Path.cwd() / "raw_data"
COMMUNES_PATH = RAW_DATA_PATH.joinpath("communes.xlsx")
COAT_OF_ARMS_PATH = RAW_DATA_PATH.joinpath("coat_of_arms")

In [41]:
def delete_raw_data(raw_data_dir: Path, subdirs: List[Path]) -> None:
    '''
    Resets the raw data directory to an empty directory structure.

    An existing raw data directory is removed together with everything in
    it. Afterwards the directory and the requested subdirectories are
    created again (including any missing parent directories).

    Parameters
    ----------
    raw_data_dir : Path
        The path of the directory that contains the raw data.
    subdirs : list[Path]
        A list of subdirs as an absolute path to create inside the raw data directory.

    Returns
    -------
    None
    '''
    if raw_data_dir.exists():
        rmtree(raw_data_dir)

    # parents=True: do not fail when the directory above does not exist yet
    raw_data_dir.mkdir(parents=True)

    for subdir in subdirs:
        # exist_ok=True: nested or repeated entries in `subdirs` are harmless
        subdir.mkdir(parents=True, exist_ok=True)

In [42]:
# Start from a clean raw data directory that already contains the (empty)
# coat of arms subdirectory before anything is downloaded again.
if FETCH_DATA:
    delete_raw_data(RAW_DATA_PATH, [COAT_OF_ARMS_PATH])

### 1.1. Basic Data: XLSX File for Data on all Communes

#### 1.1.1. Fetch XLSX File

In [43]:
def download_binary_file(url: str, destination_dir: Path, timeout: float = 30.0) -> None:
    '''
    Downloads a file from a given URL via a HTTP GET request.

    Parameters
    ----------
    url : str
        The URL of the file.
    destination_dir : Path
        The path of the file (not of a directory, despite the parameter
        name) the downloaded content is written to. An existing file is
        overwritten.
    timeout : float
        Number of seconds to wait for the server before giving up. The
        default is 30 seconds.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within `timeout` seconds.
    '''
    # Without a timeout requests waits indefinitely for an unresponsive server.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    with open(destination_dir, "wb") as f:
        f.write(response.content)

In [44]:
# Download the BFS workbook. Note that `destination_dir` receives the path of
# the target file itself (COMMUNES_PATH), not of a directory.
if FETCH_DATA:
    bfs_url = 'https://dam-api.bfs.admin.ch/hub/api/dam/assets/15864450/master'
    download_binary_file(bfs_url, destination_dir=COMMUNES_PATH)

#### 1.1.2. Preprocess XLSX File

In [50]:
def preprocess_communes_df(raw_communes: pd.DataFrame, min_bfs_nr: int, max_bfs_nr: int) -> Tuple[pd.DataFrame, Dict[int, str]]:
    '''
    Applies the preprocessing steps to the raw communes DataFrame.

    Note that `raw_communes` itself is modified in place: the labels of
    columns 14 and 32 are shortened, rows without an index label are
    dropped and the index is cast to integers. The returned DataFrame is
    an independent copy, so changing it never affects `raw_communes`.

    Parameters
    ----------
    raw_communes : DataFrame
        The raw DataFrame after loading from an excel file. It must be
        indexed by BFS number, have at least 33 columns and contain a
        column named 'Gemeindename'.
    min_bfs_nr : int
        The BFS number
        (`see Wikipedia <https://en.wikipedia.org/wiki/Community_Identification_Number#Switzerland>`_)
        of the first commune to include in the preprocessed DataFrame.
    max_bfs_nr : int
        The BFS number of the last commune to include in the preprocessed
        DataFrame (The commune with this number will be included in the
        preprocessed DataFrame).

    Returns
    -------
    communes : pd.DataFrame
        The preprocessed rows of all communes in the range of BFS numbers
        passed to this function.
    communes_names : dict
        A dict that maps the BFS number of each commune in that range to
        its name. The entries are ordered alphabetically by name.
    '''
    # The labels of columns 14 and 32 end in three characters that are cut
    # off here (presumably a footnote marker -- confirm against the sheet).
    raw_communes.rename(
        columns={
            raw_communes.columns[14]: raw_communes.columns[14][:-3],
            raw_communes.columns[32]: raw_communes.columns[32][:-3],
        },
        inplace=True,
    )

    # Rows without a BFS number cannot be cast to int. errors='ignore' keeps
    # the call from raising a KeyError when there is no such row.
    raw_communes.drop(labels=np.nan, axis=0, inplace=True, errors='ignore')
    raw_communes.index = raw_communes.index.astype('int')

    # Select communes that have BFS numbers in the given range. The explicit
    # copy avoids assigning into a slice of `raw_communes` below.
    communes = raw_communes.loc[min_bfs_nr:max_bfs_nr].copy()

    # Remove the LU suffix present for some communes
    communes['Gemeindename'] = communes['Gemeindename'].str.replace(
        r'(?P<name>\w+) \(LU\)',
        lambda m: m.group('name'),
        regex=True,
    )

    communes_names = communes['Gemeindename'].sort_values().to_dict()
    return communes, communes_names

In [51]:
# Load the BFS workbook. Rows 0-4, 6 and 7 are skipped, so row 5 is the first
# row that is read and supplies the column labels. The last 16 rows are
# ignored as footer and the first column becomes the index.
communes = pd.read_excel(
    COMMUNES_PATH,
    skiprows=[0, 1, 2, 3, 4, 6, 7],
    skipfooter=16,
    index_col=0,
)

# Keep only the communes with BFS numbers 1001 to 1151 (both inclusive);
# presumably this is the range of the canton of Luzern -- confirm against
# the BFS register.
communes_lu, communes_lu_names = preprocess_communes_df(
    raw_communes=communes,
    min_bfs_nr=1001,
    max_bfs_nr=1151,
)

### 1.2. Additional Information: Gemeindewappen 

In [53]:
def _coat_of_arms_page_name(commune: str) -> str:
    '''Returns the page name that is appended to the source URL for a commune.'''
    # handle concatenated and multi-word commune names
    if '-' in commune:
        return commune.split('-')[0]
    if ' ' in commune:
        return commune.split(' ')[0]
    # The coat of arms of 'Willisau' is stored under 'Willisau-Stadt'
    # in https://staatsarchiv.lu.ch
    if commune == 'Willisau':
        return 'Willisau-Stadt'
    return commune


def download_coat_of_arms(source_url: str, destination_dir: Path, commune_names: Dict[int, str], logging=False, timeout: float = 30.0) -> None:
    '''
    Downloads images of all coats of arms from Staatsarchiv Luzern.

    For every commune the page `source_url + <page name>` is fetched, the
    image element with the id 'maincontent_1_imgImage' is looked up and the
    image it points to is stored as `<BFS number>.jpg`.

    Parameters
    ----------
    source_url : str
        The URL to download the images from.
    destination_dir : Path
        The Path of the directory where the coats of arms should be stored.
    commune_names : dict
        A dict that matches all BFS numbers of the communes to their names.
    logging : bool
        Whether to print logging outputs or not. The Default is False,
        meaning no logging statements will be printed.
    timeout : float
        Number of seconds to wait for the server on each request before
        giving up. The default is 30 seconds.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If a page or an image request is answered with an error status.
    LookupError
        If a page does not contain the expected image element.
    '''
    # One session for all requests so the connection to the server is reused.
    with requests.Session() as session:
        # fetch images of coat of arms for all communes
        for cid, commune in commune_names.items():
            current_url = source_url + _coat_of_arms_page_name(commune)
            if logging:
                print(f'Fetching {current_url}')
            page = session.get(current_url, timeout=timeout)
            page.raise_for_status()

            soup = BeautifulSoup(page.text, 'html.parser')
            img = soup.find(id='maincontent_1_imgImage')
            src = img.get('src') if img is not None else None
            if not src:
                raise LookupError(
                    f'No coat of arms image found on {current_url}'
                )

            # Resolve the image location against the page it was found on
            # instead of assuming a fixed host.
            img_url = urljoin(page.url, src)
            img_response = session.get(img_url, timeout=timeout)
            img_response.raise_for_status()

            current_img_path = destination_dir / f'{cid}.jpg'
            with open(current_img_path, "wb") as f:
                f.write(img_response.content)

In [54]:
# Download one image per commune into COAT_OF_ARMS_PATH; with logging=True the
# URL of every requested page is printed.
if FETCH_DATA:
    sa_lu_url = 'https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/'
    download_coat_of_arms(
        sa_lu_url, COAT_OF_ARMS_PATH,
        communes_lu_names, logging=True
    )

Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Adligenswil
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Aesch
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Alberswil
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Altbüron
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Altishofen
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Ballwil
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Beromünster
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Buchrain
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Buttisholz
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Büron
Fetching https://staatsarchiv.lu.ch/kantonsgeschichte/gemeinden/Gemeindewappen/Dagmersellen
Fetching https://staatsa