# Climate, Environment & Pollution
## Preprocessing & Merge 

In [7]:
import os
import pandas as pd
pd.set_option('display.max_rows', 150)
#pd.options.mode.chained_assignment = None
from eurostat import *
import numpy as np

## Cooling and heating degree days by NUTS 2 regions - annual data
https://ec.europa.eu/eurostat/cache/metadata/en/nrg_chdd_esms.htm

In [3]:
heatdays = get_data_df('nrg_chddr2_a', flags=False)
heatdays = heatdays.iloc[:,:11].rename(columns={'geo\\time': ' NUTS 2'})

# Merge on all entries which are also in the target variable cities to extract only the interesting cities
target_cities = pd.read_csv("Cities_with_codes.csv")
heatdays_nuts = pd.merge(target_cities, heatdays, on=[' NUTS 2'])


print('any missing values: ' + str(heatdays_nuts.isna().any().any()))
heatdays_nuts.head(2)

any missing values: False


Unnamed: 0,City,City Code,NUTS 2,Country,unit,indic_nrg,2019,2018,2017,2016,2015,2014,2013,2012
0,Amsterdam,NL002C1,NL32,NL,NR,CDD,33.41,18.95,2.4,9.78,12.22,6.81,7.28,6.85
1,Amsterdam,NL002C1,NL32,NL,NR,HDD,2476.15,2524.38,2501.11,2642.52,2601.07,2257.99,3005.27,2818.95


In [4]:
heatdays_cdd = heatdays_nuts.iloc[np.where(heatdays_nuts.indic_nrg == 'CDD')]
heatdays_cdd.to_csv('../../data/heatdays_cdd.csv')
heatdays_hdd = heatdays_nuts.iloc[np.where(heatdays_nuts.indic_nrg == 'HDD')]
heatdays_cdd.to_csv('../../data/heatdays_hdd.csv')

In [6]:
print(heatdays_hdd.shape)
heatdays_hdd.head(2)

(61, 14)


Unnamed: 0,City,City Code,NUTS 2,Country,unit,indic_nrg,2019,2018,2017,2016,2015,2014,2013,2012
1,Amsterdam,NL002C1,NL32,NL,NR,HDD,2476.15,2524.38,2501.11,2642.52,2601.07,2257.99,3005.27,2818.95
3,Antwerp,BE002C1,BE2,BE,NR,HDD,2372.84,2392.04,2396.57,2520.37,2451.94,2117.72,2846.68,2596.03


## Number and capacity of recovery and diswastecap_engrecposal facilities by NUTS 2 regions
Energy recovery (RCV_E): Operation R1;
The dataset on waste treatment facilities is broken down into the measurement variable (number of facilities, capacity), the waste operation and NUTS2 regions.

https://ec.europa.eu/eurostat/cache/metadata/en/env_wasgt_esms.htm

In [22]:
wastecap = get_data_df('env_wasfac', flags=False)
wastecap = wastecap.rename(columns={'geo\\time': ' NUTS 2'})
wastecap_nuts = pd.merge(target_cities, wastecap, on=[' NUTS 2'])

## filter for Energy recovery (RCV_E) measure and CAP (capacity)
wastecap_nuts = wastecap_nuts.iloc[np.where(wastecap_nuts.wst_oper == 'RCV_E')]
wastecap_engrec = wastecap_nuts.iloc[np.where(wastecap_nuts.indic_env == 'CAP')]

print(wastecap_engrec.shape)
wastecap_engrec.head(3)

## TODO interpolate odd years

(76, 14)


Unnamed: 0,City,City Code,NUTS 2,Country,indic_env,wst_oper,2018,2016,2014,2012,2010,2008,2006,2004
3,Amsterdam,NL002C1,NL32,NL,CAP,RCV_E,2333000.0,2243000.0,2243000.0,2243000.0,975000.0,0.0,93.0,3840.0
18,Ankara,TR001C1,TR51,TR,CAP,RCV_E,,,,,,,0.0,
34,Antwerp,BE002C1,BE2,BE,CAP,RCV_E,3256785.0,3195547.0,3300195.0,3641982.0,1763270.0,1763270.0,1849801.0,5391947.0


In [23]:
# Check for missing values
print('Missing values for columns:')
def NaN_percent(df, column_name):
    row_count = df[column_name].shape[0]
    empty_values = row_count - df[column_name].count()
    return (100.0*empty_values)/row_count
for i in list(wastecap_engrec):
    print("%s: %.2f%%" % (i, NaN_percent(wastecap_engrec,i)))


Missing values for columns:
City: 0.00%
 City Code: 0.00%
 NUTS 2: 0.00%
 Country: 0.00%
indic_env: 0.00%
wst_oper: 0.00%
2018: 10.53%
2016: 11.84%
2014: 17.11%
2012: 22.37%
2010: 28.95%
2008: 40.79%
2006: 32.89%
2004: 68.42%


In [25]:
## TODO interpolate entire columns for odd years

#    make empty columns, then rearrange order, filter < 2012,  flip to ascending years
#    then interpolate

wastecap_engrec.interpolate(method='linear').head(8)

Unnamed: 0,City,City Code,NUTS 2,Country,indic_env,wst_oper,2018,2016,2014,2012,2010,2008,2006,2004
3,Amsterdam,NL002C1,NL32,NL,CAP,RCV_E,2333000.0,2243000.0,2243000.0,2243000.0,975000.0,0.0,93.0,3840.0
18,Ankara,TR001C1,TR51,TR,CAP,RCV_E,2794892.5,2719273.5,2771597.5,2942491.0,1369135.0,881635.0,0.0,2697894.0
34,Antwerp,BE002C1,BE2,BE,CAP,RCV_E,3256785.0,3195547.0,3300195.0,3641982.0,1763270.0,1763270.0,1849801.0,5391947.0
49,Athens,EL001C1,EL30,EL,CAP,RCV_E,34757.0,43276.0,43276.0,43277.0,43756.0,64493.0,1263855.5,2695974.0
64,Barcelona,ES002C1,ES51,ES,CAP,RCV_E,2163268.0,2148592.0,2148592.0,1343814.0,808764.0,51328.67,677910.0,0.0
79,Valencia,ES003C1,ES51,ES,CAP,RCV_E,2163268.0,2148592.0,2148592.0,1343814.0,808764.0,38164.33,677910.0,0.0
94,Belfast,UK012C1,UKN0,UK,CAP,RCV_E,376000.0,196000.0,36000.0,32375.0,18140.0,25000.0,0.0,5428.571
108,Bergen,NO002C1,NO05,NO,CAP,RCV_E,350000.0,350000.0,350000.0,355500.0,387500.0,346600.0,228600.0,10857.14


# More potential data
internet https://ec.europa.eu/eurostat/cache/metadata/en/isoc_i_esms.htm

soil cover - ef_mp_soil https://ec.europa.eu/eurostat/cache/metadata/en/ef_esms.htm #massive 2.5M rows

waste - env_rwas_gen https://ec.europa.eu/eurostat/cache/metadata/en/env_rwas_gen_esms.htm

air transport - tran_r_avpa_nm https://ec.europa.eu/eurostat/cache/metadata/en/reg_tran_esms.htm

(tgs00026 disposable income of private households)

econ - nama_10r_2emhrw https://ec.europa.eu/eurostat/cache/metadata/en/reg_eco10_esms.htm



In [1]:
#waste = get_data_df('env_rwas_gen', flags=False)
#waste.head()

In [2]:
#econ = get_data_df('nama_10r_2emhrw', flags=False)
#econ.head(5)

In [3]:
#transp = get_data_df('tran_r_avpa_nm', flags=False)
#transp.head(5)

In [4]:
## disposable income
#dincome = get_data_df('tgs00026', flags=False)
#dincome.head(5)

# Scraping Polution index from numbeo
https://www.numbeo.com/pollution/region_rankings.jsp?title=2019&region=150

adapted response bs scraper for pollution

In [50]:
!python Pollution_Scraper.py

In [49]:
!python Response_Variable.py

In [51]:
!ls ../../data

 energy_simlp_nrg_100a.tsv	  Pollution_2017	 Quality_of_life_2017
'energy_simlp_nrg_100a .tsv.gz'   Pollution_2018	 Quality_of_life_2018
 heatdays_cdd.csv		  Pollution_2019	 Quality_of_life_2019
 heatdays_hdd.csv		  Pollution_2020	 Quality_of_life_2020
 Pollution_2012			  Quality_of_life_2012	 t2020_30.tsv
 Pollution_2013			  Quality_of_life_2013	 t2020_31.tsv
 Pollution_2014			  Quality_of_life_2014	 t2020_rl100+ESTAT.tsv
 Pollution_2015			  Quality_of_life_2015	 t2020_rn210.tsv
 Pollution_2016			  Quality_of_life_2016	 urb_cenv.tsv


## Read QOL & Pollution files to DF

In [70]:
## for each year: read QOL + pollution csv, merge, add year, append to combined df 
df_all = pd.DataFrame()    
for year in range(2012,2021):
    ## read pollution for year
    filename = ('../../data/' + 'Pollution_' + str(year))
    df_yr = pd.read_csv(filename).iloc[:,2:4]
    
    ## read Quality of Life for year
    filename_qol = ('../../data/' + 'Quality_of_life_' + str(year))
    df_qol = pd.read_csv(filename_qol).iloc[:,2:4]
    
    # merging QOL & Pollution 
    df_yr = pd.merge(df_yr,df_qol, how = 'outer', on = 'City')
    
    ## define year and rename
    df_yr.loc[:,'Year'] = int(year)
    df_yr = df_yr.rename(columns = {'Pollution Index':'Pollution', 'Quality of Life Index':'QOL'})
    
    ## concat to combined df
    df_all = pd.concat([df_all, df_yr])

df_all.to_csv('../../data/QOL_pol_merged.csv', index = False)

In [71]:
df_test = pd.read_csv('../../data/QOL_pol_merged.csv')
df_test

Unnamed: 0,City,Pollution,QOL,Year
0,Bucharest,140.71,-17.36,2012
1,Athens,140.00,-35.53,2012
2,Milan,120.00,13.59,2012
3,Barcelona,118.00,,2012
4,Moscow,116.61,-54.72,2012
...,...,...,...,...
1695,Dhaka,,62.20,2020
1696,Manila,,57.73,2020
1697,Tehran,,54.73,2020
1698,Lagos,,53.15,2020
