
# Figure 4.



##  GLOBAL FOOD COMMODITIES


## United Nations FAO data: Production of 'Crops and livestock products' in 2022

In [1]:

from __future__ import division
import numpy as np
import scipy as sc
from itertools import product
import time
import matplotlib.pyplot as plt
import PIL
from numpy import log10
import random
from math import factorial
from scipy.stats import linregress, gaussian_kde, skew
from scipy import stats
from scipy.spatial import distance
import warnings
import pandas as pd
import re
import os
import math
from collections import Counter
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.stats.outliers_influence import summary_table
import statsmodels.api as sm
from scipy.optimize import linear_sum_assignment

warnings.filterwarnings('ignore')

%config InlineBackend.figure_formats = ['svg']
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}

pd.set_option('display.max_columns', None)


  from pandas import Int64Index as NumericIndex


In [2]:
def NSECF(p):
    """
    Normalized sum of exponentiated cumulative frequencies (NSECF).

    For a histogram ``p`` with ``n = len(p)`` bins, the running totals
    ``c_1, ..., c_n`` are each raised to the power ``z = (n + 1) / n``,
    divided by ``N ** z`` (``N`` being the total count), summed, and 1 is
    subtracted from the result.

    Parameters
    ----------
    p : sequence of numbers
        Bin heights (frequencies), ordered from the first bin to the last.

    Returns
    -------
    float
        The NSECF value, or ``nan`` when ``p`` is empty or sums to zero
        (the normalisation is undefined in both cases).
    """
    freqs = np.asarray(p, dtype=float)
    n_bins = freqs.size
    if n_bins == 0:
        return np.nan

    # One linear pass instead of re-summing every prefix of the histogram.
    cumulative = np.cumsum(freqs)
    total = cumulative[-1]
    if total == 0:
        return np.nan

    z_p = (n_bins + 1) / n_bins
    return np.sum(cumulative ** z_p / total ** z_p) - 1


def DS(p):
    """
    Distributional shift (DS) of a histogram.

    DS is the normalized sum of exponentiated cumulative frequencies
    (the same quantity ``NSECF`` returns) divided by ``len(p) - 1``.

    Parameters
    ----------
    p : sequence of numbers
        Bin heights (frequencies), ordered from the first bin to the last.

    Returns
    -------
    float
        The DS value, or ``nan`` when it is undefined: fewer than two bins
        (the ``len(p) - 1`` divisor would be zero) or a total count of zero.
    """
    freqs = np.asarray(p, dtype=float)
    n_bins = freqs.size
    if n_bins < 2:
        return np.nan

    # One linear pass instead of re-summing every prefix of the histogram.
    cumulative = np.cumsum(freqs)
    total = cumulative[-1]
    if total == 0:
        return np.nan

    z_p = (n_bins + 1) / n_bins
    nsecf = np.sum(cumulative ** z_p / total ** z_p) - 1
    return nsecf / (n_bins - 1)


def gini_coefficient(x):
    """
    Compute the Gini coefficient of a collection of values.

    The value returned is ``sum_{i<j} |x_i - x_j| / (n**2 * mean(x))``, the
    pairwise-difference form discussed at
    https://stackoverflow.com/questions/39512260/calculating-gini-coefficient-in-python-numpy
    It is evaluated here through the equivalent sorted-rank identity
    ``sum_i (2*i - n - 1) * x_(i) / (n * sum(x))`` (``x_(i)`` ascending,
    ``i`` starting at 1), which needs a sort instead of all pairwise
    differences.

    Parameters
    ----------
    x : array-like of numbers
        Values to summarise. Lists, tuples, arrays and Series are accepted;
        the input is flattened and is not modified.

    Returns
    -------
    float
        The Gini coefficient, or ``nan`` when ``x`` is empty or sums to zero
        (the coefficient is undefined in both cases).
    """
    values = np.sort(np.asarray(x, dtype=float).ravel())
    n = values.size
    total = values.sum()
    if n == 0 or total == 0:
        return np.nan

    ranks = np.arange(1, n + 1)
    return np.sum((2 * ranks - n - 1) * values) / (n * total)


In [3]:
# Read the FAOSTAT export, discard the columns this analysis does not use,
# and keep only the records whose unit is 't'.
df = pd.read_csv('data/economic/FAO/FAOSTAT_data_en_2-23-2024.csv')
df = df.drop(columns=['Domain', 'Element', 'Year'])
df = df[df['Unit'] == 't']

# Number of distinct commodities, then number of distinct areas.
for column in ('Item', 'Area'):
    print(len(df[column].unique()))

# To list every commodity name, uncomment:
#for d in sorted(df['Item'].unique().tolist()):
#    print(d)

print(df.shape)
df.head()

209
200
(14360, 4)


Unnamed: 0,Area,Item,Unit,Value
0,Afghanistan,"Almonds, in shell",t,64000.0
1,Afghanistan,"Anise, badian, coriander, cumin, caraway, fenn...",t,17883.38
2,Afghanistan,Apples,t,318000.0
3,Afghanistan,Apricots,t,170507.92
4,Afghanistan,Barley,t,110000.0


In [4]:
# For every commodity ('Item'), summarise how its positive production values
# are distributed across areas with three statistics:
#   gini_ls : Gini coefficient of the square-root-transformed values
#   ds_ls1  : distributional shift (DS) of the log2-binned values
#   ds_ls2  : NSECF of the same histogram
# food_ls holds the commodity name for each entry, so the four lists always
# have the same length and order.
gini_ls = []
ds_ls1 = []
ds_ls2 = []
food_ls = []

foods = df['Item'].unique().tolist()

for f in foods:
    tdf = df[df['Item'] == f]
    tdf = tdf[tdf['Value'] > 0]

    # Nothing to summarise if no area reports a positive value.
    if tdf.shape[0] == 0:
        continue

    vals = np.array(sorted(tdf['Value'].astype('float'), reverse=True))

    # Gini coefficient (of the square-root-transformed values)
    gini = gini_coefficient(np.sqrt(vals))

    # Distributional shift (DS)
    # 1. Convert the abundances to logarithmic scale (base 2)
    abundances = np.log2(vals).tolist()

    # 2. Define the bins for the histogram: unit-width bins from 0 up to the
    #    first integer at or above the largest log2 value.
    min_abundance = 0
    max_abundance = np.ceil(max(abundances))
    bins = np.arange(min_abundance, max_abundance + 1, 1)

    # Fewer than two edges means there is no bin to count into (every value
    # is <= 1); skip the commodity instead of failing inside DS/NSECF.
    if len(bins) < 2:
        continue

    # 3. Compute the histogram
    # NOTE(review): np.histogram does not count samples that fall outside the
    # supplied edges, so values below 1 (log2 < 0) are left out of the
    # histogram because the first edge is 0 -- confirm this is intended.
    hist, bin_edges = np.histogram(abundances, bins=bins)

    # 4. Use the right side of the bin edges as bin values
    #    (not used by the DS / NSECF calculations below)
    bin_values = bin_edges[1:]

    # 5. Convert histogram to list
    bin_heights = hist.tolist()

    # Calculate DS
    ds = DS(bin_heights)

    # 6. Normalized sums of exponentiated cumulative frequencies
    nsecf = NSECF(bin_heights)

    # Record everything only once the commodity is known to be usable, so
    # that food_ls stays aligned with the three statistic lists.
    food_ls.append(f)
    gini_ls.append(gini)
    ds_ls1.append(ds)
    ds_ls2.append(nsecf)

In [5]:
def obs_pred_rsquare(obs, pred):
    '''
    Determines the proportion of variability in a data set accounted for by a model
    In other words, this determines the proportion of variation explained by the 1:1 line
    in an observed-predicted plot.

    Computed as 1 - SS_res / SS_tot, where SS_res is the sum of squared
    differences between obs and pred and SS_tot is the sum of squared
    deviations of obs from its mean. The value is negative when the
    predictions fit worse than the mean of the observations.

    Parameters:
        obs : array-like of observed values
        pred : array-like of predicted values, one per observation

    Both inputs are flattened to 1-D before comparison, so a column vector
    and a flat array of the same length are compared element by element
    rather than broadcast against each other.

    Returns:
        The r-square as a NumPy float (nan or inf when obs has zero variance).

    Raises:
        ValueError : if obs and pred do not contain the same number of values.

    Used in various peer-reviewed publications:
        1. Locey, K.J. and White, E.P., 2013. How species richness and total abundance 
        constrain the distribution of abundance. Ecology letters, 16(9), pp.1177-1185.
        2. Xiao, X., McGlinn, D.J. and White, E.P., 2015. A strong test of the maximum 
        entropy theory of ecology. The American Naturalist, 185(3), pp.E70-E80.
        3. Baldridge, E., Harris, D.J., Xiao, X. and White, E.P., 2016. An extensive 
        comparison of species-abundance distribution models. PeerJ, 4, p.e2823.
    '''
    obs = np.asarray(obs, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float).ravel()
    if obs.shape != pred.shape:
        raise ValueError('obs and pred must contain the same number of values '
                         '(got %d and %d)' % (obs.size, pred.size))

    ss_res = np.sum((obs - pred) ** 2)
    ss_tot = np.sum((obs - np.mean(obs)) ** 2)
    r2 = 1 - ss_res / ss_tot
    return r2

# ---------------------------------------------------------------------------
# Regress DS (y) on the Gini coefficient (x) with ordinary least squares and
# collect everything needed to draw the fit: predictions, confidence and
# prediction intervals, and the points lying outside the prediction interval.
# ---------------------------------------------------------------------------

# Pair each Gini value with its DS value and order the pairs by x (ties
# broken by y), so lines and interval bands can be drawn left to right.
# Sorting once keeps x, y, x_o and y_o in exactly the same order.
x_o, y_o = zip(*sorted(zip(np.array(gini_ls), np.array(ds_ls1))))

x_o = np.array(x_o)
y_o = np.array(y_o)

# Column-vector views for the design matrix and the response.
x = x_o[:, np.newaxis]
y = y_o[:, np.newaxis]

# Degree of the fitted polynomial (1 = straight line).
poly_degree = 1
polynomial_features = PolynomialFeatures(degree=poly_degree)
xp = polynomial_features.fit_transform(x)

model = sm.OLS(y, xp).fit()
ypred = model.predict(xp).tolist()

# Build a printable equation, highest power first. params[0] is the
# intercept; params[1:] are the coefficients of x, x^2, ...
poly_coefs = model.params[1:].tolist()
poly_coefs.reverse()

poly_exponents = list(range(1, len(poly_coefs) + 1))
poly_exponents.reverse()

power_labels = {1: 'x', 2: 'x²', 3: 'x³'}

eqn = 'y = '
for i, (coef, power) in enumerate(zip(poly_coefs, poly_exponents)):
    # Powers above 3 have no superscript label; fall back to 'x^n'.
    term = power_labels.get(power, 'x^' + str(power))

    if i == 0:
        # The leading term carries its own sign.
        eqn = eqn + str(round(coef, 4)) + term
    elif coef >= 0:
        eqn = eqn + ' + ' + str(round(coef, 4)) + term
    else:
        eqn = eqn + ' - ' + str(np.abs(round(coef, 4))) + term

b = model.params[0]
intercept_sign = ' + ' if b >= 0 else ' - '
b = round(b, 4)
eqn = eqn + intercept_sign + str(np.abs(b))

print(eqn)

# Downstream code works with y as a flat list.
y = y.flatten().tolist()

# r-square of observed vs. predicted values (about the 1:1 line).
raw_obs_pred_r2 = obs_pred_rsquare(y_o, np.array(ypred))

# Rounded copy for reporting, floored at zero.
op_r2 = round(raw_obs_pred_r2, 4)
if op_r2 < 0:
    op_r2 = 0

r2 = round(model.rsquared, 4)
r2_adj = round(model.rsquared_adj, 4)
print(r2, r2_adj, op_r2)

st, data, ss2 = summary_table(model, alpha=0.05)
predict_mean_ci_low, predict_mean_ci_upp = data[:, 4:6].T # confidence interval
predict_ci_low, predict_ci_upp = data[:, 6:8].T # prediction interval

# Split the points by whether they fall outside the prediction interval.
outside_interval = (y_o > predict_ci_upp) | (y_o < predict_ci_low)

outlier_x = x_o[outside_interval].tolist()
outlier_y = y_o[outside_interval].tolist()
nonoutlier_x = x_o[~outside_interval].tolist()
nonoutlier_y = y_o[~outside_interval].tolist()

# String form (3 decimals, not floored at zero) used in the figure legend.
obs_pred_r2 = str(np.round(raw_obs_pred_r2, 3))

print(obs_pred_r2)

y = 0.6856x - 0.0271
0.8329 0.8321 0.8329
0.833


In [6]:
# Figure 4: DS plotted against the Gini index for every commodity, with the
# fitted line, its confidence interval (darker band) and prediction interval
# (lighter band).
fig, ax = plt.subplots(figsize=(4, 4))

ax.plot(x_o, ypred, c='0.5', label=r'$r^{2}$' + ' = ' + obs_pred_r2)

ax.fill_between(x_o, predict_ci_upp, predict_ci_low, color='k', alpha=0.1, linewidths=0)
ax.fill_between(x_o, predict_mean_ci_upp, predict_mean_ci_low, color='k', alpha=0.2, linewidths=0)

# Points inside and outside the prediction interval are drawn with the same
# style; they are kept as separate calls so either group can be restyled.
ax.scatter(nonoutlier_x, nonoutlier_y, s=5, c='k')
ax.scatter(outlier_x, outlier_y, s=5, c='k')

ax.set_xlabel('Inequality, (Gini Index)', fontsize=14)
ax.set_ylabel('Scarcity, (DS)', fontsize=14)

ax.set_ylim(-0.005, .6)
ax.legend()
fig.patch.set_facecolor('white')
fig.subplots_adjust(hspace=0.35, wspace=0.4)

# Write the figure in both formats, creating the output folder first so that
# saving does not fail on a fresh checkout.
for fig_path, fig_format in (('Final_Figs/manuscript/Fig4.pdf', 'pdf'),
                             ('Final_Figs/manuscript/Fig4.jpg', 'jpg')):
    os.makedirs(os.path.dirname(fig_path), exist_ok=True)
    fig.savefig(fig_path, bbox_inches='tight', format=fig_format, dpi=600)

# Close the figure explicitly; it is written to disk, not shown inline.
plt.close(fig)