In [1]:
import glob
import os
import pandas as pd

In [2]:
# Gather the paths of the raw powiat-level and voivodship-level Excel workbooks
# found in each year-range folder of the static annual data.
# (The original note also mentions a comparison against PL.txt; that step is
# not performed in this cell.)
root_directory = '../../../src/data/data_raw/static_annual_data/'
year_dirs = ['2010-2021', '2015-2021', '2017-2021']
powiat_filenames, voivod_filenames = [], []
for y in year_dirs:
    year_path = os.path.join(root_directory, y)
    powiat_mask = os.path.join(year_path, '*powiat*.xlsx')
    voivod_mask = os.path.join(year_path, '*voivodship*.xlsx')
    # Matches are appended folder by folder, in the order of year_dirs.
    powiat_filenames.extend(glob.glob(powiat_mask))
    voivod_filenames.extend(glob.glob(voivod_mask))

In [3]:
def get_header_size(filename):
    """Return the list of header row indices to use when reading ``filename``.

    The depth is chosen from substrings of the file name:

    * contains 'crop' -> five header rows ``[0, 1, 2, 3, 4]``
    * contains 'forest_fires', 'vehicles_by_type_and_fuel' or
      'air_pollution_reduction' -> four header rows ``[0, 1, 2, 3]``
    * anything else -> three header rows ``[0, 1, 2]``

    The 'crop' check takes precedence over the four-row markers.
    """
    four_row_markers = (
        'forest_fires',
        'vehicles_by_type_and_fuel',
        'air_pollution_reduction',
    )
    if 'crop' in filename:
        n_rows = 5
    elif any(marker in filename for marker in four_row_markers):
        n_rows = 4
    else:
        n_rows = 3
    return list(range(n_rows))

In [4]:
def get_1st_col_name(filename):
    """Build the multi-level column key of the first ('Code') column.

    The returned tuple has one entry per header row reported by
    ``get_header_size(filename)``: the literal 'Code' followed by
    'Unnamed: 0_level_<k>' placeholders for k = 1 .. depth - 1.
    """
    depth = len(get_header_size(filename))
    placeholders = [f"Unnamed: 0_level_{level}" for level in range(1, depth)]
    return ('Code', *placeholders)

In [5]:
# Year labels (as strings) kept when the tables are truncated: 2017 through 2021.
years = [str(year) for year in range(2017, 2022)]

# Process voivodeship files

In [6]:
def process_voivod_df(df, keep_years=None):
    """Trim a voivodeship table to the wanted years and lower-case its names.

    The frame is modified in place and nothing is returned.

    Columns are expected to be tuples (multi-level headers):

    * columns whose first level is 'Code' are left untouched;
    * columns whose first level is 'Name' have their text values lower-cased;
    * every other column is dropped unless its second-to-last level is one
      of ``keep_years``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean; mutated in place.
    keep_years : collection of str, optional
        Year labels to keep. Defaults to the notebook-level ``years`` list.
    """
    if keep_years is None:
        keep_years = years

    def lower_if_text(value):
        # Missing or non-text cells are passed through unchanged instead of
        # raising AttributeError on ``.lower()``.
        return value.lower() if isinstance(value, str) else value

    stale_columns = []
    for t in df.columns:
        if t[0] == 'Name':
            df[t] = df[t].apply(lower_if_text)
        elif t[0] != 'Code' and t[-2] not in keep_years:
            stale_columns.append(t)

    # Drop everything in one call rather than once per column.
    if stale_columns:
        df.drop(columns=stale_columns, inplace=True)

In [7]:
# Clean every voivodeship workbook and save it under the processed-data folder.
for fname in voivod_filenames:
    # basename() handles whichever path separator glob returned (the previous
    # rfind('/') only worked with POSIX-style paths). The last 15 characters of
    # the file name are then cut off to form the output name.
    filename = os.path.basename(fname)[:-15]
    df = pd.read_excel(
        fname,
        sheet_name='TABLE',
        header=get_header_size(fname),
        dtype={get_1st_col_name(fname): str},  # read the 'Code' column as text
    )
    process_voivod_df(df)  # cleans df in place
    df.to_excel(f'../../data/data_processed/static_annual_data/{filename}.xlsx')

# Process powiat files

In [8]:
def process_powiat_df(df, keep_years=None):
    """Clean a powiat table: normalise names, keep wanted years, drop old units.

    Columns are expected to be tuples (multi-level headers):

    * columns whose first level is 'Code' are left untouched;
    * the column whose first level is 'Name' is normalised (see below);
    * every other column is dropped unless its second-to-last level is one
      of ``keep_years``.

    Name normalisation: values starting with 'P' are lower-cased; any other
    text value has its first 23 characters replaced by 'powiat' (presumably a
    fixed English prefix such as 'City with powiat status' -- TODO confirm).
    The result is then renamed through ``static_to_geojson`` where it matches.
    Missing or non-text names are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. The name fixes and column drops are applied to this
        frame in place.
    keep_years : collection of str, optional
        Year labels to keep. Defaults to the notebook-level ``years`` list.

    Returns
    -------
    pandas.DataFrame
        A new frame without the rows named 'powiat Wałbrzych to 2002' or
        'powiat warszawski'; the original index labels are preserved.

    Raises
    ------
    KeyError
        If no column has 'Name' as its first header level (raised before
        ``df`` is modified).
    """
    if keep_years is None:
        keep_years = years

    static_to_geojson = {'powiat Wałbrzych since 2013': 'powiat Wałbrzych',
                         'powiat Capital City Warszawa': 'powiat Warszawa',
                         'powiat karkonoski': 'powiat jeleniogórski'}
    removed_units = ['powiat Wałbrzych to 2002', 'powiat warszawski']

    def fix_powiat_spelling(powiat_name):
        # Leave missing / non-text cells alone instead of crashing on them.
        if not isinstance(powiat_name, str):
            return powiat_name
        if powiat_name.startswith('P'):
            fixed = powiat_name.lower()
        else:
            fixed = 'powiat' + powiat_name[23:]
        # Rename per value, so the result does not depend on the frame's
        # index labels matching row positions.
        return static_to_geojson.get(fixed, fixed)

    tname = None
    stale_columns = []
    for t in df.columns:
        if t[0] == 'Name':
            tname = t
            df[t] = df[t].apply(fix_powiat_spelling)
        elif t[0] != 'Code' and t[-2] not in keep_years:
            stale_columns.append(t)

    if tname is None:
        raise KeyError("process_powiat_df: no column whose first header level is 'Name'")

    # Drop everything in one call rather than once per column.
    if stale_columns:
        df.drop(columns=stale_columns, inplace=True)

    return df[~df[tname].isin(removed_units)]

In [9]:
# Clean every powiat workbook and save it under the processed-data folder.
for fname in powiat_filenames:
    # basename() handles whichever path separator glob returned (the previous
    # rfind('/') only worked with POSIX-style paths). The last 15 characters of
    # the file name are then cut off to form the output name.
    filename = os.path.basename(fname)[:-15]
    df = pd.read_excel(
        fname,
        sheet_name='TABLE',
        header=get_header_size(fname),
        dtype={get_1st_col_name(fname): str},  # read the 'Code' column as text
    )
    df2 = process_powiat_df(df)  # the returned frame is the one written out
    df2.to_excel(f'../../data/data_processed/static_annual_data/{filename}.xlsx')
