# [FAO] production of crops and livestock products

In [1]:
import pandas as pd
import numpy as np
import re as re

import _functions_sql as fs
import _functions_data_files as fdf

# location of the raw FAO export; both values are handed to fdf.get_path()
# when the CSV is read into a dataframe
source_dir = 'fao_production'
source_file = 'Production_Crops_Livestock_E_All_Data_(Normalized).csv'

## import from CSV & show general information

In [2]:
# load the raw CSV into a pandas dataframe;
# the converter keeps 'Note' as plain strings, which handles the DtypeWarning
# without having to fall back to 'low_memory=False'
csv_path = fdf.get_path(source_file, source_dir)
df_raw = pd.read_csv(
    csv_path,
    encoding='latin-1',
    converters={'Note': str},
)

In [3]:
# preview the first five data rows
df_raw.head(n=5)

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1961,1961,ha,0.0,A,
1,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1962,1962,ha,0.0,A,
2,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1963,1963,ha,0.0,A,
3,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1964,1964,ha,0.0,A,
4,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1965,1965,ha,0.0,A,


In [4]:
# show table summary: row count, column names, dtypes and memory usage
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4127584 entries, 0 to 4127583
Data columns (total 14 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Area Code        int64  
 1   Area Code (M49)  object 
 2   Area             object 
 3   Item Code        int64  
 4   Item Code (CPC)  object 
 5   Item             object 
 6   Element Code     int64  
 7   Element          object 
 8   Year Code        int64  
 9   Year             int64  
 10  Unit             object 
 11  Value            float64
 12  Flag             object 
 13  Note             object 
dtypes: float64(1), int64(5), object(8)
memory usage: 440.9+ MB


In [5]:
# count fully duplicated rows:
# True = row repeats an earlier row in every column, False = first occurrence
df_raw.duplicated(keep='first').value_counts()

False    4127584
Name: count, dtype: int64

In [6]:
# count the distinct per-row patterns of missing values (True = null);
# note: 'Note' was read through a str converter, so empty notes are '' and not null
df_raw.isna().value_counts()

Area Code  Area Code (M49)  Area   Item Code  Item Code (CPC)  Item   Element Code  Element  Year Code  Year   Unit   Value  Flag   Note 
False      False            False  False      False            False  False         False    False      False  False  False  False  False    4127584
Name: count, dtype: int64

## data cleaning

In [7]:
# unify column names:
# - strip leading/trailing whitespace
# - lower-case everything
# - substitute '_' for every character that is not a letter or a digit
df_raw.columns = (
    df_raw.columns
    .str.strip()
    .str.lower()
    .str.replace('[^a-zA-Z0-9]', '_', regex=True)
)

In [8]:
# clean on an independent copy so that df_raw stays available for later
df_clean = df_raw.copy(deep=True)

### drop 'area_code' & 'area_code__m49_' (redundant to 'area')

:FIXME: :TODO: it has yet to be verified that both code columns are redundant to 'area'.

In [9]:
# drop both area code columns (the reduced frame is bound to the same name)
df_clean = df_clean.drop(columns=['area_code', 'area_code__m49_'])

### drop 'item_code__cpc_' (redundant to 'item_code')

:FIXME: :TODO: it has yet to be verified that 'item_code__cpc_' is redundant to 'item_code'.

In [10]:
# drop the CPC item code column (the reduced frame is bound to the same name)
df_clean = df_clean.drop(columns=['item_code__cpc_'])

### drop 'element_code' (redundant to 'element' + 'unit')

:FIXME: :TODO: the redundancy of 'element_code' has yet to be verified.

'element_code' stands for a unique combination of 'element' and 'unit'; as both of these columns are kept, it can be dropped.

In [11]:
# drop the element code column (the reduced frame is bound to the same name)
df_clean = df_clean.drop(columns=['element_code'])

### drop 'year_code' (redundant to 'year')

:FIXME: :TODO: it has yet to be verified that 'year_code' is redundant to 'year'.

In [12]:
# drop the year code column (the reduced frame is bound to the same name)
df_clean = df_clean.drop(columns=['year_code'])

### drop 'note' (no added value)

:FIXME: :TODO: it has yet to be verified that 'note' adds no value.

In [13]:
# replace empty strings with 'null' (NaN);
# the result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that in-place form is a
# chained assignment, which triggers a FutureWarning in recent pandas 2.x
# releases and no longer updates df_clean once Copy-on-Write is enabled
df_clean['note'] = df_clean['note'].replace('', np.nan)

In [14]:
# show unique values & counts; dropna=False keeps the NaN entries in the tally
df_clean['note'].value_counts(dropna=False)

note
NaN                  4046267
Unofficial figure      81317
Name: count, dtype: int64

In [15]:
# drop the note column (the reduced frame is bound to the same name)
df_clean = df_clean.drop(columns=['note'])

### drop 'flag' (no added value)

:FIXME: :TODO: it has yet to be verified that 'flag' adds no value.

In [16]:
# drop the flag column (the reduced frame is bound to the same name)
df_clean = df_clean.drop(columns=['flag'])

## data wrangling

In [17]:
# wrangle on an independent copy so that df_clean stays available for later
df_wrangled = df_clean.copy(deep=True)

### split 'element' column into separate columns using 'value' and 'unit'

In [18]:
# count rows whose composite key area-year-item-element repeats an earlier row
df_wrangled.duplicated(subset=['area', 'year', 'item', 'element']).value_counts()

False    4093765
True       33819
Name: count, dtype: int64

In [19]:
# repeat the duplicate count with 'unit' added to the composite key
df_wrangled.duplicated(subset=['area', 'year', 'item', 'element', 'unit']).value_counts()

False    4127584
Name: count, dtype: int64

It appears that some area-year-item-element combinations occur more than once, each time with a different 'unit' (and a correspondingly different 'value').

In [20]:
# show the data ordered by the composite key area-year-item-element
df_wrangled.sort_values(by=['area', 'year', 'item', 'element'])

Unnamed: 0,area,item_code,item,element,year,unit,value
0,Afghanistan,221,"Almonds, in shell",Area harvested,1961,ha,0.00
109,Afghanistan,221,"Almonds, in shell",Production,1961,t,0.00
171,Afghanistan,711,"Anise, badian, coriander, cumin, caraway, fenn...",Area harvested,1961,ha,0.00
270,Afghanistan,711,"Anise, badian, coriander, cumin, caraway, fenn...",Production,1961,t,0.00
332,Afghanistan,515,Apples,Area harvested,1961,ha,2220.00
...,...,...,...,...,...,...,...
3101662,Zimbabwe,1735,Vegetables Primary,Production,2022,t,235484.51
3101600,Zimbabwe,1735,Vegetables Primary,Yield,2022,100 g/ha,65856.00
3097858,Zimbabwe,15,Wheat,Area harvested,2022,ha,38700.00
3097982,Zimbabwe,15,Wheat,Production,2022,t,200000.00


In [21]:
# show the rows that repeat an area-year-item-element key of an earlier row
df_wrangled.loc[df_wrangled.duplicated(subset=['area', 'year', 'item', 'element'])]

Unnamed: 0,area,item_code,item,element,year,unit,value
3189,Afghanistan,1062,"Hen eggs in shell, fresh",Yield,1961,No/An,50.0
3190,Afghanistan,1062,"Hen eggs in shell, fresh",Yield,1962,No/An,50.0
3191,Afghanistan,1062,"Hen eggs in shell, fresh",Yield,1963,No/An,50.0
3192,Afghanistan,1062,"Hen eggs in shell, fresh",Yield,1964,No/An,50.0
3193,Afghanistan,1062,"Hen eggs in shell, fresh",Yield,1965,No/An,49.0
...,...,...,...,...,...,...,...
4100478,Net Food Importing Developing Countries,1062,"Hen eggs in shell, fresh",Production,2018,1000 No,95215864.0
4100479,Net Food Importing Developing Countries,1062,"Hen eggs in shell, fresh",Production,2019,1000 No,100018834.0
4100480,Net Food Importing Developing Countries,1062,"Hen eggs in shell, fresh",Production,2020,1000 No,108112821.0
4100481,Net Food Importing Developing Countries,1062,"Hen eggs in shell, fresh",Production,2021,1000 No,110522710.0




----


**From here on, everything is still subject to revision!**

In [22]:
# show every row whose area-year-item-element key occurs more than once
# (keep=False marks all members of a duplicate group), ordered by that key
(
    df_clean
    .loc[df_clean.duplicated(subset=['area', 'year', 'item', 'element'], keep=False)]
    .sort_values(by=['area', 'year', 'item', 'element'])
)

Unnamed: 0,area,item_code,item,element,year,unit,value
3251,Afghanistan,1062,"Hen eggs in shell, fresh",Production,1961,t,10000.00
3313,Afghanistan,1062,"Hen eggs in shell, fresh",Production,1961,1000 No,200000.00
3127,Afghanistan,1062,"Hen eggs in shell, fresh",Yield,1961,100 mg/An,25000.00
3189,Afghanistan,1062,"Hen eggs in shell, fresh",Yield,1961,No/An,50.00
3252,Afghanistan,1062,"Hen eggs in shell, fresh",Production,1962,t,11000.00
...,...,...,...,...,...,...,...
3086761,Zimbabwe,1062,"Hen eggs in shell, fresh",Yield,2021,No/An,93.00
3086824,Zimbabwe,1062,"Hen eggs in shell, fresh",Production,2022,t,22163.89
3086886,Zimbabwe,1062,"Hen eggs in shell, fresh",Production,2022,1000 No,461748.00
3086700,Zimbabwe,1062,"Hen eggs in shell, fresh",Yield,2022,100 mg/An,49076.00


In [23]:
# preview the first five rows of the cleaned data
df_clean.head(n=5)

Unnamed: 0,area,item_code,item,element,year,unit,value
0,Afghanistan,221,"Almonds, in shell",Area harvested,1961,ha,0.0
1,Afghanistan,221,"Almonds, in shell",Area harvested,1962,ha,0.0
2,Afghanistan,221,"Almonds, in shell",Area harvested,1963,ha,0.0
3,Afghanistan,221,"Almonds, in shell",Area harvested,1964,ha,0.0
4,Afghanistan,221,"Almonds, in shell",Area harvested,1965,ha,0.0


### verify that column 'unit' has comparable units of measurement

In [24]:
# list each distinct unit once; the index is the first row in which it appears
df_clean.drop_duplicates(subset=['unit'])[['unit']]

Unnamed: 0,unit
0,ha
62,100 g/ha
109,t
704,An
1642,1000 An
3127,100 mg/An
3189,No/An
3313,1000 No
3871,100 g/An
4243,0.1 g/An


:FIXME: :TODO: the following units still need to be compared with each other: 'An' + '1000 An', '100 mg/An' + '100 g/An' + '0.1 g/An', 'No' + '1000 No'

# hmmmm

Exploring the element column, we see that not all values are relevant for further analysis. According to the element definitions, yield is the production per unit of harvested area.

In [25]:
# list all distinct years in ascending order
np.sort(df_raw['year'].unique())

array([1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020, 2021, 2022])