# [FAO] emissions from livestock

In [1]:
import pandas as pd
import numpy as np
import re as re

import _functions_sql as fs
import _functions_data_files as fdf

source_dir = 'fao_emissions_livestock'
source_file = 'Emissions_livestock_E_All_Data_(Normalized).csv'

## import from CSV & show general information

In [2]:
# import raw data into a pandas dataframe
df_raw = pd.read_csv(
      fdf.get_path(source_file, source_dir)
    , encoding='latin-1'
    , converters={'Note': str} # handle DtypeWarning without 'low_memory=False' 
)

In [3]:
# show first data rows
df_raw.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Source Code,Source,Unit,Value,Flag,Note
0,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1961,1961,3050,FAO TIER 1,An,1300000.0,A,
1,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1962,1962,3050,FAO TIER 1,An,851850.0,A,
2,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1963,1963,3050,FAO TIER 1,An,1001112.0,A,
3,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1964,1964,3050,FAO TIER 1,An,1150000.0,E,
4,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1965,1965,3050,FAO TIER 1,An,1300000.0,A,


In [4]:
# show table summary
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6285217 entries, 0 to 6285216
Data columns (total 16 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Area Code        int64  
 1   Area Code (M49)  object 
 2   Area             object 
 3   Item Code        int64  
 4   Item Code (CPC)  object 
 5   Item             object 
 6   Element Code     int64  
 7   Element          object 
 8   Year Code        int64  
 9   Year             int64  
 10  Source Code      int64  
 11  Source           object 
 12  Unit             object 
 13  Value            float64
 14  Flag             object 
 15  Note             object 
dtypes: float64(1), int64(6), object(9)
memory usage: 767.2+ MB


In [5]:
# show full duplicates
df_raw.duplicated().value_counts()

False    6285217
Name: count, dtype: int64

In [6]:
# show null values
df_raw.isnull().value_counts()

Area Code  Area Code (M49)  Area   Item Code  Item Code (CPC)  Item   Element Code  Element  Year Code  Year   Source Code  Source  Unit   Value  Flag   Note 
False      False            False  False      False            False  False         False    False      False  False        False   False  False  False  False    6285217
Name: count, dtype: int64

## data cleaning

In [7]:
# unify column names:
# - remove leading/trailing spaces
# - convert to lower case
# - replace all non-alphanumerical characters with '_'
df_raw.columns = [
    re.sub('[^a-zA-Z0-9]', '_', col) for col in \
        df_raw.columns.str.strip().str.lower()
]

In [8]:
# retain original raw data for later
df_clean = df_raw.copy()

### drop 'area_code' & 'area_code_m49_' (redundant to 'Area')

:FIXME: :TODO: has yet to be proven!!!

In [9]:
# drop column
df_clean.drop(columns = ['area_code', 'area_code__m49_'], inplace=True)

### drop 'item_code_cpc_' (redundant to 'item_code')

:FIXME: :TODO: has yet to be proven!!!

In [10]:
# drop column
df_clean.drop(columns = ['item_code__cpc_'], inplace=True)

### drop 'element_code' (redundant to 'element')

:FIXME: :TODO: has yet to be proven!!!

'element_code' stands for a unique combination of element and unit, it can be dropped

In [11]:
# drop column
df_clean.drop(columns = ['element_code'], inplace=True)

### drop 'year_code' (redundant to 'year')

:FIXME: :TODO: has yet to be proven!!!

In [12]:
# drop column
df_clean.drop(columns = ['year_code'], inplace=True)

### drop 'source_code' (redundant to 'source')

:FIXME: :TODO: has yet to be proven!!!

In [13]:
# drop column
df_clean.drop(columns = ['source_code'], inplace=True)

### drop 'note' (no added value)

:FIXME: :TODO: has yet to be proven!!!

In [14]:
# replace empty strings with 'null'
df_clean['note'].replace('', np.nan, inplace=True)

In [15]:
# show unique values & counts
df_clean['note'].value_counts(dropna=False)

note
NaN                             6213175
NC/CRF/BUR                        60399
NC/CRF/BUR Unofficial figure       7521
Unofficial figure                  2482
UNFCCC Repository                  1640
Name: count, dtype: int64

In [16]:
# drop column
df_clean.drop(columns = ['note'], inplace=True)

### drop 'flag' (no added value)

:FIXME: :TODO: has yet to be proven!!!

In [17]:
# drop column
df_clean.drop(columns = ['flag'], inplace=True)

## data wrangling

In [18]:
# retain original cleaned data for later
df_wrangled = df_clean.copy()

### drop duplicate rows from divergent 'source'

when there is data from multiple sources, all but that from 'FAO TIER 1' is dropped

In [19]:
# show unique values & counts
df_wrangled['source'].value_counts(dropna=False)

source
FAO TIER 1    6199890
UNFCCC          85327
Name: count, dtype: int64

In [20]:
# count duplicate rows (from divergent 'source')
df_wrangled[df_wrangled.duplicated(
      subset=['area', 'item_code', 'item', 'element', 'year']
    , keep=False
)].shape[0]

164558

In [21]:
# sort, so that 'FAO TIER 1' is first for duplicate rows
df_wrangled.sort_values(
      ['area', 'item_code', 'item', 'element', 'year', 'source']
    , na_position='last'
    , ascending=True
    , inplace=True
)
# drop duplicate rows, keeping first
df_wrangled.drop_duplicates(
      subset=['area', 'item_code', 'item', 'element', 'year']
    , keep='first'
    , inplace=True
)

### drop all but totals values from 'element'

In [22]:
# show unique values & counts
df_wrangled['element'].value_counts(dropna=False)

element
Livestock total (Emissions CH4)                                  255374
Manure left on pasture (N content)                               255318
Manure management (manure treated, N content)                    255232
Manure management (Direct emissions N2O)                         255232
Manure management (Emissions N2O)                                254836
Livestock total (Emissions N2O)                                  254836
Manure management (Emissions CH4)                                254834
Manure left on pasture that volatilises (N content)              254829
Manure left on pasture that leaches (N content)                  254829
Manure left on pasture (Indirect emissions N2O)                  254829
Manure left on pasture (Emissions N2O)                           254829
Manure left on pasture (Direct emissions N2O)                    254829
Manure applied to soils that volatilises (N content)             254829
Manure management (Indirect emissions N2O)              

In [23]:
# drop rows that do not contain totals values
df_wrangled.drop(df_wrangled[~(df_wrangled['element'].isin([
      'Livestock total (Emissions CH4)'
    , 'Livestock total (Emissions N2O)'
]))].index, inplace=True)
print('remaining rows:', df_wrangled.shape[0])

remaining rows: 510210


### split 'element' column into separate columns using 'value' and 'unit'

In [24]:
# check for duplicates regarding composite key with 'element'
df_wrangled[['area', 'year', 'item', 'element']].duplicated().value_counts()

False    510210
Name: count, dtype: int64

In [25]:
# verify all values are positive, otherwise aggregation via 'max' will not work
(df_wrangled['value'] >= 0).all()

True

In [26]:
# verify all values have the same unit
df_wrangled['unit'].value_counts(dropna=False)

unit
kt    510210
Name: count, dtype: int64

In [27]:
# split 'element' column (by the 2 expected values) into seperate columns, that
# contain 0/1 depending on the actual value of 'element'
df_dummies = pd.get_dummies(df_wrangled['element'])
# rename columns
df_dummies.rename(
        columns={
              'Livestock total (Emissions CH4)': 'emissions_ch4'
            , 'Livestock total (Emissions N2O)': 'emissions_n2o'
        }
        , inplace=True)

# add dummies after replacing '1' with actual value from 'value'
df_new_cols = df_dummies.mul(df_wrangled['value'], axis=0)
df_wrangled = pd.concat([df_wrangled, df_new_cols], axis=1)

In [28]:
# calculate the precision loss
print(
      'precision loss for emissions_ch4:'
    , df_wrangled[df_wrangled['element'] == 'Livestock total (Emissions CH4)'] \
        ['value'].sum()
      - df_wrangled['emissions_ch4'].sum()
)
print(
      'precision loss for emissions_n2o:'
    , df_wrangled[df_wrangled['element'] == 'Livestock total (Emissions N2O)'] \
        ['value'].sum()
      - df_wrangled['emissions_n2o'].sum()
)

precision loss for emissions_ch4: -1.4901161193847656e-08
precision loss for emissions_n2o: 4.656612873077393e-10


In [29]:
# rename columns
df_wrangled.rename(columns={
    'unit': 'emissions_unit'
}, inplace=True)
# drop now redundant columns
df_wrangled.drop(columns = ['element', 'value'], inplace=True)
# group all duplicate columns keeping the actual values for each row
df_wrangled = df_wrangled.groupby([
      'area'
    , 'item_code'
    , 'item'
    , 'year'
    , 'source'
    , 'emissions_unit'
]).agg({
      'emissions_ch4': 'max'
    , 'emissions_n2o': 'max'
}).reset_index()

### verify 'year' values

note: '2030' and '2050' are official forecasts

In [31]:
# show time span of rows
pd.Series(df_raw['year'].unique()).sort_values().to_numpy()

array([1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020, 2021, 2030, 2050])

### reorder columns

In [142]:
column_order = [
      'area', 'year', 'item', 'item_code', 'source'
    , 'emissions_ch4', 'emissions_n2o', 'emissions_unit'
]
if len(column_order) != df_wrangled.shape[1]:
    print('warning: dropping columns')
df_wrangled = df_wrangled[column_order]

## final checks & upload to database server

In [143]:
# show first data rows
df_wrangled.head()

Unnamed: 0,area,year,item,item_code,source,emissions_ch4,emissions_n2o,emissions_unit
0,Afghanistan,1961,"Cattle, dairy",960,FAO TIER 1,44.1,0.7937,kt
1,Afghanistan,1962,"Cattle, dairy",960,FAO TIER 1,44.1,0.7937,kt
2,Afghanistan,1963,"Cattle, dairy",960,FAO TIER 1,49.14,0.8844,kt
3,Afghanistan,1964,"Cattle, dairy",960,FAO TIER 1,49.14,0.8844,kt
4,Afghanistan,1965,"Cattle, dairy",960,FAO TIER 1,54.81,0.9864,kt


In [144]:
# show table summary
df_wrangled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255374 entries, 0 to 255373
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   area            255374 non-null  object 
 1   year            255374 non-null  int64  
 2   item            255374 non-null  object 
 3   item_code       255374 non-null  int64  
 4   source          255374 non-null  object 
 5   emissions_ch4   255374 non-null  float64
 6   emissions_n2o   255374 non-null  float64
 7   emissions_unit  255374 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 15.6+ MB


In [145]:
# check for duplicates regarding composite key
df_wrangled[['area', 'year', 'item']].duplicated().value_counts()

False    255374
Name: count, dtype: int64

### write raw data

In [146]:
# write to database & grant access
table_name = 'fao_emissions_livestock_raw_sh'
fs.write_dataframe(df_raw, table_name)
fs.run_command('CALL grant_access(\'' + table_name + '\')')

+ table written: fao_emissions_livestock_raw_sh


### write wrangled data

In [147]:
# write to database & grant access
table_name = 'fao_emissions_livestock_wrangled_sh'
fs.write_dataframe(df_wrangled, table_name)
fs.run_command('CALL grant_access(\'' + table_name + '\')')

+ table written: fao_emissions_livestock_wrangled_sh
