# Find non-matching powiats

In [1]:
import json
from typing import Tuple

import pandas as pd
#Documentation: https://shapely.readthedocs.io/en/stable/
from shapely.geometry import shape, GeometryCollection, Point

In [2]:
# Load the powiat-level GeoJSON and keep only the name of every feature.
with open('../../data/data_auxiliary/powiaty-max.geojson', "r", encoding="utf-8") as f:
    powiaty = json.load(f)

# Each feature stores its powiat name under properties -> 'nazwa'.
powiats_geojson = list(
    map(lambda feature: feature['properties']['nazwa'], powiaty['features'])
)
# Only the names are needed from here on, so the parsed GeoJSON is replaced by an empty dict.
powiaty = {}

In [3]:
# Load the powiat-level emissions workbook. The 'TABLE' sheet has a three-row
# header (measure, year, unit), so the columns become a three-level MultiIndex.
df = pd.read_excel(
    '../../data/data_raw/static_annual_data/2010-2021/emission_of_pollutant_gases_by_powiat_2010-2021.xlsx',
    sheet_name='TABLE',
    header=[0, 1, 2],
)
df.tail()

Unnamed: 0_level_0,Code,Name,total,total,total,total,total,total,total,total,...,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide
Unnamed: 0_level_1,Unnamed: 0_level_1,Unnamed: 1_level_1,2010,2011,2012,2013,2014,2015,2016,2017,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
Unnamed: 0_level_2,Unnamed: 0_level_2,Unnamed: 1_level_2,[t/y],[t/y],[t/y],[t/y],[t/y],[t/y],[t/y],[t/y],...,[t/y],[t/y],[t/y],[t/y],[t/y],[t/y],[t/y],[t/y],[t/y],[t/y]
377,3217000,Powiat wałecki,41334,36344,36278,35118,57041,31034,32847,34377,...,0,0,0,0,0,0,0,0,0,0
378,3218000,Powiat łobeski,11910,10139,9840,10245,24806,25093,18617,18946,...,0,0,0,0,0,0,0,0,0,0
379,3261000,City with powiat status Koszalin,151905,118873,122548,119846,109147,112047,120075,124008,...,0,0,0,0,0,0,0,0,0,0
380,3262000,City with powiat status Szczecin,1138264,968535,1649599,1546676,1471439,1584226,1327756,1008914,...,0,0,0,0,0,0,0,0,0,5
381,3263000,City with powiat status Świnoujście,76254,66638,65936,63966,57870,60659,63748,68539,...,0,0,0,0,0,0,0,0,0,0


In [4]:
def fix_spelling(powiat_name):
    """Normalise a powiat name from the static dataset to the GeoJSON spelling.

    Parameters
    ----------
    powiat_name : str
        Name as it appears in the static dataset, e.g. ``'Powiat wałecki'`` or
        ``'City with powiat status Koszalin'``.

    Returns
    -------
    str
        * ``'City with powiat status X'`` becomes ``'powiat X'``.
        * Any other name starting with ``'P'`` is lower-cased
          (``'Powiat wałecki'`` becomes ``'powiat wałecki'``).
        * Anything else, including non-string cells such as NaN, is returned
          unchanged so that it shows up in the later mismatch report instead
          of being silently truncated.
    """
    city_prefix = 'City with powiat status '

    # Non-string cells (e.g. NaN from an empty row) have no .startswith.
    if not isinstance(powiat_name, str):
        return powiat_name
    if powiat_name.startswith(city_prefix):
        # Strip the prefix by its real length rather than a hard-coded offset.
        return 'powiat ' + powiat_name[len(city_prefix):]
    if powiat_name.startswith('P'):
        return powiat_name.lower()
    return powiat_name

# Normalise every entry of the name column, then preview the last rows.
df[('Name', 'Unnamed: 1_level_1', 'Unnamed: 1_level_2')] = (
    df[('Name', 'Unnamed: 1_level_1', 'Unnamed: 1_level_2')].map(fix_spelling)
)
df.tail()

Unnamed: 0_level_0,Code,Name,total,total,total,total,total,total,total,total,...,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide,dinitrogen monoxide
Unnamed: 0_level_1,Unnamed: 0_level_1,Unnamed: 1_level_1,2010,2011,2012,2013,2014,2015,2016,2017,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
Unnamed: 0_level_2,Unnamed: 0_level_2,Unnamed: 1_level_2,[t/y],[t/y],[t/y],[t/y],[t/y],[t/y],[t/y],[t/y],...,[t/y],[t/y],[t/y],[t/y],[t/y],[t/y],[t/y],[t/y],[t/y],[t/y]
377,3217000,powiat wałecki,41334,36344,36278,35118,57041,31034,32847,34377,...,0,0,0,0,0,0,0,0,0,0
378,3218000,powiat łobeski,11910,10139,9840,10245,24806,25093,18617,18946,...,0,0,0,0,0,0,0,0,0,0
379,3261000,powiat Koszalin,151905,118873,122548,119846,109147,112047,120075,124008,...,0,0,0,0,0,0,0,0,0,0
380,3262000,powiat Szczecin,1138264,968535,1649599,1546676,1471439,1584226,1327756,1008914,...,0,0,0,0,0,0,0,0,0,5
381,3263000,powiat Świnoujście,76254,66638,65936,63966,57870,60659,63748,68539,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Collect the normalised powiat names of the static dataset for comparison with the geojson names
powiats_static = df[('Name', 'Unnamed: 1_level_1', 'Unnamed: 1_level_2')].tolist()

In [6]:
# Compare both name collections as sets and report what is missing on either side.
s1 = set(powiats_static)
s2 = set(powiats_geojson)
print("Present in static dataset, but not in geojson: ", s1.difference(s2))
print("Present in geojson, but not in static dataset:", s2.difference(s1))

Present in static dataset, but not in geojson:  {'powiat Capital City Warszawa', 'powiat karkonoski', 'powiat warszawski', 'powiat Wałbrzych since 2013', 'powiat Wałbrzych to 2002'}
Present in geojson, but not in static dataset: {'powiat Wałbrzych', 'powiat jeleniogórski', 'powiat Warszawa'}


We drop `powiat Wałbrzych to 2002` and `powiat warszawski`, as they have not existed since 2002. We map `powiat Wałbrzych since 2013` to `powiat Wałbrzych`, `powiat Capital City Warszawa` to `powiat Warszawa`, and `powiat karkonoski` to `powiat jeleniogórski` (`karkonoski` is the new name since 2021 – sadly, the geojson doesn't reflect that change yet).

In [7]:
#Create mapping
# Static-dataset spelling -> spelling used in the geojson file.
static_to_geojson = {'powiat Wałbrzych since 2013':'powiat Wałbrzych',
                     'powiat Capital City Warszawa':'powiat Warszawa',
                     'powiat karkonoski':'powiat jeleniogórski'}

# Replace by value instead of looping with enumerate(): df.loc is label-based, so a
# positional counter only addresses the right rows while the index is exactly 0..n-1.
# Names that are not keys of the mapping are left untouched.
df[('Name', 'Unnamed: 1_level_1', 'Unnamed: 1_level_2')] = (
    df[('Name', 'Unnamed: 1_level_1', 'Unnamed: 1_level_2')].replace(static_to_geojson)
)

In [8]:
#Drop old powiats
# Keep every row whose name is not one of the two obsolete entries.
df = df[
    ~df[('Name', 'Unnamed: 1_level_1', 'Unnamed: 1_level_2')].isin(
        ['powiat Wałbrzych to 2002', 'powiat warszawski']
    )
]

In [9]:
## Check set difference again
# Rebuild the static-side name set after the renames and drops; s2 (geojson names) is unchanged.
powiats_static = df[('Name', 'Unnamed: 1_level_1', 'Unnamed: 1_level_2')].tolist()
s1 = set(powiats_static)
print("Present in static dataset, but not in geojson: ", s1.difference(s2))
print("Present in geojson, but not in static dataset:", s2.difference(s1))

Present in static dataset, but not in geojson:  set()
Present in geojson, but not in static dataset: set()


# Find non-matching voivodeships

In [10]:
# Load the voivodeship-level GeoJSON and keep only the name of every feature.
with open('../../data/data_auxiliary/wojewodztwa-max.geojson', "r", encoding="utf-8") as f:
    wojewodztwa = json.load(f)

# Each feature stores its voivodeship name under properties -> 'nazwa'.
voivods_geojson = list(
    map(lambda feature: feature['properties']['nazwa'], wojewodztwa['features'])
)
# Only the names are needed from here on, so the parsed GeoJSON is replaced by an empty dict.
wojewodztwa = {}

In [11]:
# Load the voivodeship-level workbook; this rebinds `df`, replacing the powiat table.
# The three header rows are read as a three-level column MultiIndex.
df = pd.read_excel(
    "../../data/data_raw/static_annual_data/2017-2021/plants_of_significant_nuisance_to_air_quality_by_voivodship_2017-2021.xlsx",
    sheet_name='TABLE',
    header=[0, 1, 2],
)
df.head()

Unnamed: 0_level_0,Code,Name,total,total,total,total,total,possessing systems to reduce the emission of particulates,possessing systems to reduce the emission of particulates,possessing systems to reduce the emission of particulates,possessing systems to reduce the emission of particulates,possessing systems to reduce the emission of particulates,possessing systems to reduce the emission of gases,possessing systems to reduce the emission of gases,possessing systems to reduce the emission of gases,possessing systems to reduce the emission of gases,possessing systems to reduce the emission of gases
Unnamed: 0_level_1,Unnamed: 0_level_1,Unnamed: 1_level_1,2017,2018,2019,2020,2021,2017,2018,2019,2020,2021,2017,2018,2019,2020,2021
Unnamed: 0_level_2,Unnamed: 0_level_2,Unnamed: 1_level_2,[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs]
0,200000,DOLNOŚLĄSKIE,137,135,133,133,130,79,75,68,69,66,26,23,25,23,25
1,400000,KUJAWSKO-POMORSKIE,105,108,106,103,101,75,74,71,70,69,17,18,20,18,19
2,600000,LUBELSKIE,96,97,97,95,90,61,59,55,57,54,5,6,8,8,6
3,800000,LUBUSKIE,70,70,70,68,67,34,34,32,31,29,3,3,5,6,5
4,1000000,ŁÓDZKIE,118,118,117,112,114,78,79,77,71,67,18,18,18,17,21


In [12]:
# Lower-case the voivodeship names (e.g. 'DOLNOŚLĄSKIE' -> 'dolnośląskie') so they can be
# compared with the geojson names. The vectorised .str accessor is used instead of
# apply(lambda ...): it leaves missing values as NaN rather than raising on them.
df[('Name', 'Unnamed: 1_level_1', 'Unnamed: 1_level_2')] = (
    df[('Name', 'Unnamed: 1_level_1', 'Unnamed: 1_level_2')].str.lower()
)
df.head()

Unnamed: 0_level_0,Code,Name,total,total,total,total,total,possessing systems to reduce the emission of particulates,possessing systems to reduce the emission of particulates,possessing systems to reduce the emission of particulates,possessing systems to reduce the emission of particulates,possessing systems to reduce the emission of particulates,possessing systems to reduce the emission of gases,possessing systems to reduce the emission of gases,possessing systems to reduce the emission of gases,possessing systems to reduce the emission of gases,possessing systems to reduce the emission of gases
Unnamed: 0_level_1,Unnamed: 0_level_1,Unnamed: 1_level_1,2017,2018,2019,2020,2021,2017,2018,2019,2020,2021,2017,2018,2019,2020,2021
Unnamed: 0_level_2,Unnamed: 0_level_2,Unnamed: 1_level_2,[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs],[pcs]
0,200000,dolnośląskie,137,135,133,133,130,79,75,68,69,66,26,23,25,23,25
1,400000,kujawsko-pomorskie,105,108,106,103,101,75,74,71,70,69,17,18,20,18,19
2,600000,lubelskie,96,97,97,95,90,61,59,55,57,54,5,6,8,8,6
3,800000,lubuskie,70,70,70,68,67,34,34,32,31,29,3,3,5,6,5
4,1000000,łódzkie,118,118,117,112,114,78,79,77,71,67,18,18,18,17,21


In [13]:
# Collect the lower-cased voivodeship names of the static dataset for comparison with the geojson names
voivods_static = df[('Name', 'Unnamed: 1_level_1', 'Unnamed: 1_level_2')].tolist()

In [14]:
# Compare both voivodeship name collections as sets and report what is missing on either side.
s1 = set(voivods_static)
s2 = set(voivods_geojson)
print("Present in static dataset, but not in geojson: ", s1.difference(s2))
print("Present in geojson, but not in static dataset:", s2.difference(s1))

Present in static dataset, but not in geojson:  set()
Present in geojson, but not in static dataset: set()
