### Calculating the county/powiat to voivodship ratios based on area and population

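Resulting file (despite the `by_area` suffix in its name, it contains both the area-based and the population-based ratios): https://github.com/OmdenaAI/warsaw-poland-chapter-air-pollution/blob/main/src/data/data_processed/static_annual_data/powiat_to_voivodship_ratio_by_area.csv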

In [35]:
import pandas as pd

In [36]:
# Year labels (as strings) used to select the per-year columns: 2017 through 2021
years = [str(y) for y in range(2017, 2022)]

Load & process area_by_powiat.xlsx

In [37]:
df1 = pd.read_excel('area_by_powiat.xlsx')

# Drop unnecessary data (rows labelled 0-2 and the 'Unnamed: 0' column) & rename columns
df1 = df1.drop([0, 1, 2]).drop(columns=['Unnamed: 0'])
df1.columns = ['code', 'county', '2017', '2018', '2019', '2020', '2021']

# Store the county code as an integer once, as is done for df3['code'],
# so that the later merges work with a single key dtype
df1['code'] = df1['code'].astype(int)

# Compute voivodship code for each county: zero out the last five digits of
# the county code (e.g. 201000 -> 200000). Integer floor division is used
# instead of a float division followed by truncation.
df1["voivod_code"] = df1['code'] // 100000 * 100000

# Use mean area for county instead of area for each individual year
df1["county_area"] = df1[years].mean(axis=1)

# Optional check: assess how much the area differs from year to year
# df1["area_variance"] = df1[years].var(axis=1)
# df1 = df1.sort_values(by='area_variance',ascending=False)

# df1.head()

Load & process area_by_voivodship.xlsx

In [38]:
# Read the voivodship area sheet and discard the rows labelled 0-2 together
# with the 'Unnamed: 0' column
df2 = pd.read_excel('area_by_voivodship.xlsx').drop(
    index=[0, 1, 2],
    columns=['Unnamed: 0'],
)

# Name the remaining columns: identifier, name, then one area column per year
df2.columns = ['code', 'voivodship', '2017', '2018', '2019', '2020', '2021']

# Collapse the yearly areas into a single mean area per voivodship
df2["voivod_area"] = df2.loc[:, years].mean(axis="columns")

# Optional check: how much does the area vary between years?
# df2["area_variance"] = df2[years].var(axis=1)
# df2 = df2.sort_values(by='area_variance',ascending=False)

# df2.head()

Merge counties & voivodships

In [39]:
# Attach to every county the voivodship row whose code equals the county's voivod_code
df = df1[['code', 'voivod_code', 'county', 'county_area']].merge(
    df2[['code', 'voivodship', 'voivod_area']],
    how='inner',
    left_on='voivod_code',
    right_on='code',
)

# Both inputs have a 'code' column, so the merge suffixes them: code_x is the
# county code and code_y the voivodship code. The helper key from df1 is
# dropped and the voivodship's own code takes over the 'voivod_code' name.
df = df.drop(columns=['voivod_code']).rename(
    columns={'code_x': 'county_code', 'code_y': 'voivod_code'}
)

# Put the codes first, then the names, then the areas
df = df.reindex(
    columns=['county_code', 'voivod_code', 'county', 'voivodship', 'county_area', 'voivod_area']
)

df.head()

Unnamed: 0,county_code,voivod_code,county,voivodship,county_area,voivod_area
0,201000,200000,powiat bolesławiecki,dolnośląskie,1304.0,19947.0
1,202000,200000,powiat dzierżoniowski,dolnośląskie,479.0,19947.0
2,203000,200000,powiat głogowski,dolnośląskie,443.0,19947.0
3,204000,200000,powiat górowski,dolnośląskie,738.0,19947.0
4,205000,200000,powiat jaworski,dolnośląskie,582.0,19947.0


In [40]:
# Auxiliary check step: Compare the sum of powiats' areas vs the area of voivodships
# pd.merge(df.groupby(["voivodship"])["county_area"].sum().reset_index(name="sum_of_pow_areas"), \
#         df[['voivodship', 'voivod_area']].drop_duplicates(subset=["voivodship"]) )

In [41]:
# Share of the voivodship's area taken up by each county, rounded to 6 decimal places
df['county_ratio_by_area'] = df['county_area'].div(df['voivod_area']).round(6)
# df.head()
# df.shape

Load & process population_density_by_powiat.xlsx

In [42]:
# Read the population density sheet and discard the rows labelled 0-2 and the
# columns that are not used below
df3 = pd.read_excel('population_density_by_powiat.xlsx').drop(
    index=[0, 1, 2],
    columns=[
        'Unnamed: 0',
        'population density of the built-up and urbanized area',
        'Unnamed: 9',
        'Unnamed: 10',
        'Unnamed: 11',
        'Unnamed: 12',
    ],
)

# Name the remaining columns: identifier, name, then one density column per year
df3.columns = ['code', 'county', '2017', '2018', '2019', '2020', '2021']

# Keep the county code as an integer
df3 = df3.astype({'code': int})

# df3.head()

Merge area & population data

In [43]:
# Add the yearly density columns to the area table, matching rows on the county code
df = df.merge(df3, how='inner', left_on='county_code', right_on='code')

# 'county' exists in both inputs, so the merge yields county_x / county_y:
# keep the left one under its plain name and discard the duplicated key and name
df = df.drop(columns=['code', 'county_y']).rename(columns={'county_x': 'county'})

df.head()
# df.shape

Unnamed: 0,county_code,voivod_code,county,voivodship,county_area,voivod_area,county_ratio_by_area,2017,2018,2019,2020,2021
0,201000,200000,powiat bolesławiecki,dolnośląskie,1304.0,19947.0,0.065373,69,69,69,68,68
1,202000,200000,powiat dzierżoniowski,dolnośląskie,479.0,19947.0,0.024014,213,212,211,205,202
2,203000,200000,powiat głogowski,dolnośląskie,443.0,19947.0,0.022209,203,202,201,196,195
3,204000,200000,powiat górowski,dolnośląskie,738.0,19947.0,0.036998,48,48,47,45,45
4,205000,200000,powiat jaworski,dolnośląskie,582.0,19947.0,0.029177,88,87,86,84,83


In [44]:
# Names of the per-year derived columns, in the same order as `years`
county_pop_cols = [f"{y}_county_pop" for y in years]
voivod_pop_cols = [f"{y}_voivod_pop" for y in years]

# Total population by county (population_density * area), truncated to an integer
for y, county_col in zip(years, county_pop_cols):
    df[county_col] = (df[y] * df['county_area']).astype(int)

# The raw density columns are no longer needed
df = df.drop(columns=years)

# Total population by voivodship: sum of its counties, repeated on every county row
for county_col, voivod_col in zip(county_pop_cols, voivod_pop_cols):
    df[voivod_col] = df.groupby(["voivodship"])[county_col].transform("sum")

# Population averaged over all years, truncated to an integer
df["county_pop_mean"] = df[county_pop_cols].mean(axis=1).astype(int)
df["voivod_pop_mean"] = df[voivod_pop_cols].mean(axis=1).astype(int)

# County to voivodship ratios by population: one per year, then one from the means
for y, county_col, voivod_col in zip(years, county_pop_cols, voivod_pop_cols):
    df[f"{y}_county_ratio_by_pop"] = (df[county_col] / df[voivod_col]).round(6)
df['county_ratio_by_pop'] = (df['county_pop_mean'] / df['voivod_pop_mean']).round(6)

# df.head()

In [45]:
# Write the final table to a CSV file (relative path; the row index is written as well)
df.to_csv(path_or_buf="powiat_to_voivodship_ratio_by_area.csv")