# [FAO] emissions intensities

In [1]:
import pandas as pd
import numpy as np
import re as re

import _functions_sql as fs
import _functions_data_files as fdf

source_dir = 'fao_emissions_intensities'
source_file = 'Environment_Emissions_intensities_E_All_Data_(Normalized).csv'

## import from CSV & show general information

In [2]:
# import raw data into a pandas dataframe
df_raw = pd.read_csv(
      fdf.get_path(source_file, source_dir)
    , encoding='latin-1'
    , converters={}
)

In [3]:
# show first data rows
df_raw.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,'004,Afghanistan,1718,'F1718,Cereals excluding rice,71761,Emissions intensity,1961,1961,kg CO2eq/kg,0.113,E
1,2,'004,Afghanistan,1718,'F1718,Cereals excluding rice,71761,Emissions intensity,1962,1962,kg CO2eq/kg,0.1149,E
2,2,'004,Afghanistan,1718,'F1718,Cereals excluding rice,71761,Emissions intensity,1963,1963,kg CO2eq/kg,0.1205,E
3,2,'004,Afghanistan,1718,'F1718,Cereals excluding rice,71761,Emissions intensity,1964,1964,kg CO2eq/kg,0.1154,E
4,2,'004,Afghanistan,1718,'F1718,Cereals excluding rice,71761,Emissions intensity,1965,1965,kg CO2eq/kg,0.1144,E


In [4]:
# show table summary
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399410 entries, 0 to 399409
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Area Code        399410 non-null  int64  
 1   Area Code (M49)  399410 non-null  object 
 2   Area             399410 non-null  object 
 3   Item Code        399410 non-null  int64  
 4   Item Code (CPC)  399410 non-null  object 
 5   Item             399410 non-null  object 
 6   Element Code     399410 non-null  int64  
 7   Element          399410 non-null  object 
 8   Year Code        399410 non-null  int64  
 9   Year             399410 non-null  int64  
 10  Unit             399410 non-null  object 
 11  Value            399410 non-null  float64
 12  Flag             399410 non-null  object 
dtypes: float64(1), int64(5), object(7)
memory usage: 39.6+ MB


In [5]:
# show full duplicates
df_raw.duplicated().value_counts()

False    399410
Name: count, dtype: int64

In [6]:
# show null values
df_raw.isnull().value_counts()

Area Code  Area Code (M49)  Area   Item Code  Item Code (CPC)  Item   Element Code  Element  Year Code  Year   Unit   Value  Flag 
False      False            False  False      False            False  False         False    False      False  False  False  False    399410
Name: count, dtype: int64

## data cleaning

In [7]:
# unify column names:
# - remove leading/trailing spaces
# - convert to lower case
# - replace all non-alphanumerical characters with '_'
df_raw.columns = [
    re.sub('[^a-zA-Z0-9]', '_', col) for col in \
        df_raw.columns.str.strip().str.lower()
]

In [8]:
# retain original raw data for later
df_clean = df_raw.copy()

### drop 'area_code' & 'area_code_m49_' (redundant to 'Area')

:FIXME: :TODO: has yet to be proven!!!

In [9]:
# drop column
df_clean.drop(columns = ['area_code', 'area_code__m49_'], inplace=True)

### drop 'item_code_cpc_' (redundant to 'item_code')

:FIXME: :TODO: has yet to be proven!!!

In [10]:
# drop column
df_clean.drop(columns = ['item_code__cpc_'], inplace=True)

### drop 'element_code' (redundant to 'element')

:FIXME: :TODO: has yet to be proven!!!

'element_code' stands for a unique combination of element and unit, it can be dropped

In [11]:
# drop column
df_clean.drop(columns = ['element_code'], inplace=True)

### drop 'year_code' (redundant to 'year')

:FIXME: :TODO: has yet to be proven!!!

In [12]:
# drop column
df_clean.drop(columns = ['year_code'], inplace=True)

### drop 'flag' (no added value)

:FIXME: :TODO: has yet to be proven!!!

In [13]:
# drop column
df_clean.drop(columns = ['flag'], inplace=True)

## data wrangling

In [14]:
# retain original cleaned data for later
df_wrangled = df_clean.copy()

### split 'element' column into separate columns using 'value' and 'unit'

In [15]:
# check for duplicates regarding composite key with 'element'
df_wrangled[['area', 'year', 'item', 'element']].duplicated().value_counts()

False    399410
Name: count, dtype: int64

In [16]:
# verify all values are positive, otherwise aggregation via 'max' will not work
(df_wrangled['value'] >= 0).all()

True

In [17]:
# important: values have divergent units, so this has to be handled
# appropriately further on
df_wrangled['unit'].value_counts(dropna=False)

unit
kt             133607
t              133607
kg CO2eq/kg    132196
Name: count, dtype: int64

In [18]:
# split 'element' column (by the 3 expected values) into seperate columns, that
# contain 0/1 depending on the actual value of 'element'
df_dummies = pd.get_dummies(df_wrangled['element'])
# rename columns
df_dummies.rename(
        columns={
              'Emissions (CO2eq) (AR5)': 'emissions_co2eq'
            , 'Emissions intensity': 'emissions_intensity'
            , 'Production': 'production'
        }
        , inplace=True)

# add dummies after replacing '1' with actual value from 'value'
df_new_cols = df_dummies.mul(df_wrangled['value'], axis=0)
df_wrangled = pd.concat([df_wrangled, df_new_cols], axis=1)

# add dummies with suffix appended to column name and after replacing '1'
# with actual value from 'unit'
df_new_cols = df_dummies.add_suffix('_unit').mul(df_wrangled['unit'], axis=0)
df_wrangled = pd.concat([df_wrangled, df_new_cols], axis=1)

In [19]:
# calculate the precision loss
print(
      'precision loss for emissions_co2eq:'
    , df_wrangled[df_wrangled['element'] == 'Emissions (CO2eq) (AR5)'] \
        ['value'].sum()
      - df_wrangled['emissions_co2eq'].sum()
)
print(
      'precision loss for emissions_intensity:'
    , df_wrangled[df_wrangled['element'] == 'Emissions intensity'] \
        ['value'].sum()
      - df_wrangled['emissions_intensity'].sum()
)
print(
      'precision loss for production:'
    , df_wrangled[df_wrangled['element'] == 'Production'] \
        ['value'].sum()
      - df_wrangled['production'].sum()
)

precision loss for emissions_co2eq: 0.0
precision loss for emissions_intensity: 0.0
precision loss for production: 0.0001220703125


In [20]:
# drop now redundant columns
df_wrangled.drop(columns = ['element', 'unit', 'value'], inplace=True)
# group all duplicate columns keeping the actual values for each row
df_wrangled = df_wrangled.groupby([
      'area'
    , 'item_code'
    , 'item'
    , 'year'
]).agg({
      'emissions_co2eq': 'max'
    , 'emissions_intensity': 'max'
    , 'production': 'max'
    , 'emissions_co2eq_unit': 'max'
    , 'emissions_intensity_unit': 'max'
    , 'production_unit': 'max'
    }).reset_index()

### verify 'year' values

In [21]:
# show time span of rows
pd.Series(df_raw['year'].unique()).sort_values().to_numpy()

array([1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020, 2021])

### reorder columns

In [23]:
column_order = [
      'area', 'year', 'item', 'item_code'
    , 'emissions_co2eq', 'emissions_co2eq_unit'
    , 'emissions_intensity', 'emissions_intensity_unit'
    , 'production', 'production_unit'
]
if len(column_order) != df_wrangled.shape[1]:
    print('warning: dropping columns')
df_wrangled = df_wrangled[column_order]

## final checks & upload to database server

In [23]:
# show first data rows
df_wrangled.head()

Unnamed: 0,area,year,item,item_code,emissions_co2eq,emissions_co2eq_unit,emissions_intensity,emissions_intensity_unit,production,production_unit
0,Afghanistan,1961,Rice,27,867.5087,kt,2.7195,kg CO2eq/kg,319000.0,t
1,Afghanistan,1962,Rice,27,867.5087,kt,2.7195,kg CO2eq/kg,319000.0,t
2,Afghanistan,1963,Rice,27,867.5087,kt,2.7195,kg CO2eq/kg,319000.0,t
3,Afghanistan,1964,Rice,27,910.6113,kt,2.3963,kg CO2eq/kg,380000.0,t
4,Afghanistan,1965,Rice,27,910.6113,kt,2.3963,kg CO2eq/kg,380000.0,t


In [24]:
# show table summary
df_wrangled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133607 entries, 0 to 133606
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   area                      133607 non-null  object 
 1   year                      133607 non-null  int64  
 2   item                      133607 non-null  object 
 3   item_code                 133607 non-null  int64  
 4   emissions_co2eq           133607 non-null  float64
 5   emissions_co2eq_unit      133607 non-null  object 
 6   emissions_intensity       133607 non-null  float64
 7   emissions_intensity_unit  133607 non-null  object 
 8   production                133607 non-null  float64
 9   production_unit           133607 non-null  object 
dtypes: float64(3), int64(2), object(5)
memory usage: 10.2+ MB


In [25]:
# check for duplicates regarding composite key
df_wrangled[['area', 'year', 'item']].duplicated().value_counts()

False    133607
Name: count, dtype: int64

### write raw data

In [26]:
# write to database & grant access
table_name = 'fao_emissions_intensities_raw_sh'
fs.write_dataframe(df_raw, table_name)
fs.run_command('CALL grant_access(\'' + table_name + '\')')

+ table written: fao_emissions_intensities_raw_sh


### write wrangled data

In [27]:
# write to database & grant access
table_name = 'fao_emissions_intensities_wrangled_sh'
fs.write_dataframe(df_wrangled, table_name)
fs.run_command('CALL grant_access(\'' + table_name + '\')')

+ table written: fao_emissions_intensities_wrangled_sh
