# Processing static annual data

Truncation to the period 2017-2021 and distribution of voivodeship data to powiat level

In [1]:
import glob
import os
import pandas as pd

Using the pre-calculated ratios from `powiat_to_voivodship_ratio_by_area.csv`

In [2]:
# Table of per-powiat shares of the parent voivodeship (by area and by population).
RATIOS_CSV = '../../data/data_processed/static_annual_data/powiat_to_voivodship_ratio_by_area.csv'

ratios = pd.read_csv(RATIOS_CSV)
ratios.head()

Unnamed: 0.1,Unnamed: 0,county_code,voivod_code,county,voivodship,county_area,voivod_area,county_ratio_by_area,2017_county_pop,2018_county_pop,...,2020_voivod_pop,2021_voivod_pop,county_pop_mean,voivod_pop_mean,2017_county_ratio_by_pop,2018_county_ratio_by_pop,2019_county_ratio_by_pop,2020_county_ratio_by_pop,2021_county_ratio_by_pop,county_ratio_by_pop
0,0,201000,200000,powiat bolesławiecki,dolnośląskie,1304.0,19947.0,0.065373,89976,89976,...,2910229,2898212,89454,2902021,0.031011,0.031006,0.031044,0.030469,0.030595,0.030825
1,1,202000,200000,powiat dzierżoniowski,dolnośląskie,479.0,19947.0,0.024014,102027,101548,...,2910229,2898212,99919,2902021,0.035164,0.034993,0.034872,0.033741,0.033385,0.034431
2,2,203000,200000,powiat głogowski,dolnośląskie,443.0,19947.0,0.022209,89929,89486,...,2910229,2898212,88334,2902021,0.030995,0.030837,0.030722,0.029835,0.029806,0.030439
3,3,204000,200000,powiat górowski,dolnośląskie,738.0,19947.0,0.036998,35424,35424,...,2910229,2898212,34390,2902021,0.012209,0.012207,0.011968,0.011411,0.011459,0.01185
4,4,205000,200000,powiat jaworski,dolnośląskie,582.0,19947.0,0.029177,51216,50634,...,2910229,2898212,49819,2902021,0.017652,0.017448,0.017269,0.016799,0.016668,0.017167


In [3]:
# First, restore the leading zeros that are missing from the codes (a code should have 7 digits)
def amend_code(code: int) -> str:
    """Return ``code`` as a string left-padded with zeros to 7 characters.

    Parameters
    ----------
    code : int
        Territorial code as read from the ratios table (the leading zero is
        missing there because the column is parsed as an integer).

    Returns
    -------
    str
        The code padded to 7 characters. Values that already have 7 or more
        characters are returned unchanged.
    """
    # zfill pads to the full width, so codes missing more than one leading
    # zero also end up with 7 characters, and longer values are not altered.
    return str(code).zfill(7)

# Normalise both code columns of the ratios table to zero-padded strings.
for code_column in ('voivod_code', 'county_code'):
    ratios[code_column] = ratios[code_column].map(amend_code)

In [4]:
#Create a dictionary with powiat and voivodeship name and codes, key by powiat code
pdict = {}
for county_code, county, voivod_code, voivodship in zip(
        ratios.county_code, ratios.county, ratios.voivod_code, ratios.voivodship):
    # setdefault keeps the first row seen for a given powiat code
    pdict.setdefault(county_code, (county, voivod_code, voivodship))

# Read raw data

In [5]:
# Collect the paths of all powiat-level and voivodeship-level Excel files in the raw static data
root_directory = '../../../src/data/data_raw/static_annual_data/'
year_dirs = ['2010-2021', '2015-2021', '2017-2021']
powiat_filenames = []
voivod_filenames = []
for year_dir in year_dirs:
    # one sub-directory per covered period; files are told apart by their name
    powiat_filenames.extend(glob.glob(os.path.join(root_directory, year_dir, '*powiat*.xlsx')))
    voivod_filenames.extend(glob.glob(os.path.join(root_directory, year_dir, '*voivodship*.xlsx')))

In [6]:
#crop_production, forest_fires, vehicles and air_pollution_reduction have different header sizes
def get_header_size(filename):
    """Return the header row indices to pass to ``pd.read_excel`` for ``filename``.

    Files whose name contains 'crop' use five header rows, files containing
    'forest_fires', 'vehicles_by_type_and_fuel' or 'air_pollution_reduction'
    use four, and every other file uses three.
    """
    four_row_markers = ('forest_fires', 'vehicles_by_type_and_fuel', 'air_pollution_reduction')
    if 'crop' in filename:
        n_header_rows = 5
    elif any(marker in filename for marker in four_row_markers):
        n_header_rows = 4
    else:
        n_header_rows = 3
    return list(range(n_header_rows))

In [7]:
#get the name of first column
def get_1st_col_name(filename):
    """Return the multi-level column key of the first ('Code') column of ``filename``.

    The key has one entry per header row reported by ``get_header_size``:
    'Code' followed by the 'Unnamed: 0_level_<k>' placeholders for levels 1
    and up.
    """
    n_levels = len(get_header_size(filename))
    placeholders = [f"Unnamed: 0_level_{level}" for level in range(1, n_levels)]
    return ('Code', *placeholders)

In [8]:
# Years kept in the processed data, as strings (compared against column labels below).
years = [str(year) for year in range(2017, 2022)]

# Process powiat files

For each powiat we add the corresponding voivodeship (identified by the first two digits of the powiat code), using the powiat-to-voivodeship lookup built above. Since the data is already at powiat level, no further redistribution is needed.

In [9]:
def process_powiat_df(df,powiatdict=pdict):
    """Normalise powiat names and keep only the data columns for the years in ``years``.

    Parameters
    ----------
    df : pandas.DataFrame
        Powiat table with tuple (multi-level) column labels, containing a
        column whose top-level label is 'County Name'. The frame is modified
        in place: names are rewritten and out-of-range columns are dropped.
    powiatdict : dict
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the rows named 'powiat Wałbrzych to 2002' and
        'powiat warszawski'.

    Raises
    ------
    KeyError
        If ``df`` has no 'County Name' column.
    """
    static_to_geojson = {'powiat Wałbrzych since 2013':'powiat Wałbrzych',
                        'powiat Capital City Warszawa':'powiat Warszawa',
                        'powiat karkonoski':'powiat jeleniogórski'}
    # Rows with these (already normalised) names are removed from the result.
    excluded_powiats = ['powiat Wałbrzych to 2002', 'powiat warszawski']
    id_columns = ['County Code','County Name','Voivodeship Code','Voivodeship']

    def fix_powiat_spelling(powiat_name):
        if powiat_name.startswith('P'):
            return powiat_name.lower()
        else:
            # Replace the first 23 characters with 'powiat'
            # (presumably the prefix 'City with powiat status' - TODO confirm).
            return 'powiat' + powiat_name[23:]

    def normalise_name(powiat_name):
        fixed = fix_powiat_spelling(powiat_name)
        return static_to_geojson.get(fixed, fixed)

    name_columns = [t for t in df.columns if t[0] == 'County Name']
    if not name_columns:
        raise KeyError("process_powiat_df expects a 'County Name' column")
    for t in name_columns:
        # Per-value mapping: does not depend on the row index being 0..n-1.
        df[t] = df[t].apply(normalise_name)
    tname = name_columns[-1]

    # Truncate to the years listed in `years` (the year is the second-to-last label level).
    outdated = [t for t in df.columns if t[0] not in id_columns and t[-2] not in years]
    if outdated:
        df.drop(columns=outdated, inplace=True)

    return df[~df[tname].isin(excluded_powiats)]

In [10]:
def add_voivodeship_data(df,powiatdict=pdict):
    """Add voivodeship code and name columns to a powiat table.

    Two columns ('Voivodeship Code', 'Voivodeship') are appended to ``df``
    (in place), filled from ``powiatdict`` for every powiat code found there.
    Rows whose code is not in ``powiatdict`` keep the powiat's own code and
    name in the new columns. The returned frame has the new columns moved to
    positions 2 and 3 and the labels 'Code'/'Name' renamed to
    'County Code'/'County Name'.
    """
    code_key, name_key = df.columns[0], df.columns[1]
    # Same lower-level labels as the first two columns, new top-level label.
    voivod_code_key = ('Voivodeship Code',) + tuple(code_key)[1:]
    voivod_name_key = ('Voivodeship',) + tuple(name_key)[1:]

    # Seed the new columns with the powiat values; matched rows are overwritten below.
    df[voivod_code_key] = df['Code']
    df[voivod_name_key] = df['Name']

    for i in range(len(df)):
        entry = powiatdict.get(df.loc[i, code_key])
        if entry is None:
            continue
        df.loc[i, voivod_code_key] = entry[1]
        df.loc[i, voivod_name_key] = entry[2]

    # Move the two freshly appended columns right after the code and name columns.
    all_columns = list(df.columns)
    reordered = pd.MultiIndex.from_tuples(all_columns[:2] + all_columns[-2:] + all_columns[2:-2])
    return (
        df.reindex(columns=reordered)
          .rename(columns={'Code': "County Code", 'Name': "County Name"})
    )

In [11]:
for fname in powiat_filenames:
    # basename copes with either path separator (glob returns OS-specific paths);
    # the last 15 characters are cut off - presumably the '_YYYY-YYYY.xlsx' suffix.
    filename = os.path.basename(fname)[:-15]
    # Read the code column as text so leading zeros are preserved.
    df = pd.read_excel(fname, sheet_name='TABLE', header=get_header_size(fname), dtype={get_1st_col_name(fname): str})
    df = add_voivodeship_data(df)
    df = process_powiat_df(df)
    df.to_excel(f'../../data/data_processed/static_annual_data/{filename}.xlsx')


# Process voivodeship files

For each voivodeship we distribute the data to powiat level both by area (common for all years) and by population (varying per year).

In [12]:
#change spelling to lowercase and truncate to 2017-2021
def process_voivod_df(df):
    """Trim a voivodeship table in place.

    Values of the 'Name' column are lower-cased, and every column other than
    'Code'/'Name' whose second-to-last label is not in ``years`` is dropped.
    Nothing is returned; ``df`` itself is modified.
    """
    id_labels = ('Code', 'Name')
    stale_columns = []
    for column in df.columns:
        top_label = column[0]
        if top_label in id_labels:
            if top_label == 'Name':  # fix spelling of voivodeship
                df[column] = df[column].apply(lambda name: name.lower())
        elif column[-2] not in years:
            stale_columns.append(column)
    if stale_columns:
        df.drop(columns=stale_columns, inplace=True)

In [13]:
#create new MultiIndex to include powiat code, name, and columns for distributed values by area and by population
def create_new_multiindex(df):
    """Build the column MultiIndex of the powiat-level (distributed) table.

    ``df`` is first truncated in place by ``process_voivod_df``. The result
    starts with 'County Code', 'County Name', 'Voivodeship Code' and
    'Voivodeship', followed by two entries per remaining data column whose
    year label is suffixed with '_by_area' and '_by_pop' respectively.
    """
    process_voivod_df(df)
    renamed = df.rename(columns={'Code': "Voivodeship Code"}).rename(columns={'Name': "Voivodeship"})
    columns = list(renamed.columns)
    voivod_code_col, voivod_name_col = columns[0], columns[1]

    # County columns reuse the lower-level labels of the voivodeship columns.
    county_code_col = ("County Code",) + tuple(voivod_code_col)[1:]
    county_name_col = ("County Name",) + tuple(voivod_name_col)[1:]
    new_columns = [county_code_col, county_name_col, voivod_code_col, voivod_name_col]

    for column in columns[2:]:
        year = column[-2]
        if year not in years:
            continue
        head, tail = tuple(column[:-2]), tuple(column[-1:])
        new_columns.append(head + (year + "_by_area",) + tail)
        new_columns.append(head + (year + "_by_pop",) + tail)
    return pd.MultiIndex.from_tuples(new_columns)

In [14]:
#create new mapping - voivodeship to powiats
vdict = {}
for powiat_code, (powiat_name, voivod_code, voivod_name) in pdict.items():
    # each voivodeship code maps to a list of (voivodeship name, powiat code, powiat name)
    vdict.setdefault(voivod_code, []).append((voivod_name, powiat_code, powiat_name))

In [15]:
def get_powiat_ratios(powiat_code):
    """Return the ratios of one powiat as a list.

    Layout: [ratio_by_area, 2017_ratio_by_pop, 2018_ratio_by_pop,
    2019_ratio_by_pop, 2020_ratio_by_pop, 2021_ratio_by_pop], taken from the
    first row of ``ratios`` whose ``county_code`` equals ``powiat_code``.
    """
    ratio_columns = [
        'county_ratio_by_area',
        '2017_county_ratio_by_pop',
        '2018_county_ratio_by_pop',
        '2019_county_ratio_by_pop',
        '2020_county_ratio_by_pop',
        '2021_county_ratio_by_pop',
    ]
    matching_rows = ratios.loc[ratios.county_code == powiat_code, ratio_columns]
    return list(matching_rows.iloc[0])

In [16]:
import numpy as np

def distribute_to_powiat_level(df, powiatdict=pdict,voivoddict=vdict):
    """Distribute voivodeship-level values to the powiats of each voivodeship.

    Parameters
    ----------
    df : pandas.DataFrame
        Voivodeship table; its first column holds the voivodeship code and
        the data columns start at position 2. It is truncated in place by
        ``create_new_multiindex`` before distribution.
    powiatdict : dict
        Not used by this function; kept so existing calls keep working.
    voivoddict : dict
        Maps a voivodeship code to a list of
        (voivodeship name, powiat code, powiat name) tuples.

    Returns
    -------
    pandas.DataFrame
        One row per powiat of every voivodeship found in ``voivoddict``, with
        the columns from ``create_new_multiindex``: for each data column the
        value scaled by the powiat's area ratio and by its population ratio
        for that year. Integer inputs are rounded after scaling; all
        distributed values are stored as floats. Rows of ``df`` whose code is
        not in ``voivoddict`` are skipped.
    """
    mind = create_new_multiindex(df)  # also truncates df to the kept years
    code_col = df.columns[0]
    data_cols = list(df.columns[2:])

    rows = []
    for v in df.index:
        voivod_code = df.loc[v, code_col]
        powiats = voivoddict.get(voivod_code)
        if powiats is None:
            continue
        for voivod_name, powiat_code, powiat_name in powiats:
            # rt = [ratio_by_area, ratio_by_pop for 2017, ..., ratio_by_pop for 2021]
            rt = get_powiat_ratios(powiat_code)
            row = [powiat_code, powiat_name, voivod_code, voivod_name]
            for datacol in data_cols:
                # the year label selects the population ratio: 2017 -> rt[1], ..., 2021 -> rt[5]
                pop_ratio = rt[int(datacol[-2]) - 2016]
                value = df.loc[v, datacol]
                by_area, by_pop = rt[0]*value, pop_ratio*value
                if isinstance(value, (int, np.integer)) and not isinstance(value, bool):
                    # counts stay whole numbers after distribution
                    by_area, by_pop = round(by_area), round(by_pop)
                row.extend((float(by_area), float(by_pop)))
            rows.append(row)

    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(rows, columns=mind)

In [17]:
for fname in voivod_filenames:
    # basename copes with either path separator (glob returns OS-specific paths);
    # the last 15 characters are cut off - presumably the '_YYYY-YYYY.xlsx' suffix.
    filename = os.path.basename(fname)[:-15]
    # Read the code column as text so leading zeros are preserved.
    df = pd.read_excel(fname, sheet_name='TABLE', header=get_header_size(fname), dtype={get_1st_col_name(fname): str})
    df_dist = distribute_to_powiat_level(df)
    df_dist.to_excel(f'../../data/data_processed/static_annual_data/{filename}_dist.xlsx')