In [None]:

from __future__ import division
import numpy as np
import scipy as sc
from itertools import product
import time
import matplotlib.pyplot as plt
import PIL
from numpy import log10
import random
from math import factorial
from scipy.stats import linregress, gaussian_kde, skew
from scipy import stats
from scipy.spatial import distance
import warnings
import pandas as pd
import re
import os
import math
from collections import Counter
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.stats.outliers_influence import summary_table
import statsmodels.api as sm
from scipy.optimize import linear_sum_assignment

warnings.filterwarnings('ignore')

%config InlineBackend.figure_formats = ['svg']
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}

pd.set_option('display.max_columns', None)


In [None]:
def generate_cum_dists(n_obs, n_bins, cum=True):
    """Lazily enumerate every way of placing `n_obs` observations in `n_bins` ordered bins.

    Parameters:
    n_obs -- total number of observations to distribute
    n_bins -- number of ordered bins (empty bins are allowed)
    cum -- if truthy, yield each distribution as a list of cumulative
           frequencies; otherwise yield the raw per-bin counts as a tuple

    Yields one distribution at a time, so the full feasible set is never
    held in memory unless the caller collects it.
    """

    def _compositions(total, parts):
        # Base case: a single remaining bin must absorb everything left over.
        if parts == 1:
            yield (total,)
            return
        # Otherwise try every possible count for the first bin and recurse.
        for head in range(total + 1):
            for tail in _compositions(total - head, parts - 1):
                yield (head,) + tail

    for counts in _compositions(n_obs, n_bins):
        if not cum:
            yield counts
            continue

        # Build the cumulative frequencies with a running total.
        running = 0
        cumulative = []
        for count in counts:
            running += count
            cumulative.append(running)
        yield cumulative

            
def count_pts_within_radius(x, y, radius, scale=0):
    """Count the number of points within a fixed radius in 2D space.

    Parameters:
    x, y -- equal-length sequences of coordinates
    radius -- neighbourhood radius (a point counts as its own neighbour)
    scale -- pass 'sqrt' to measure distances between square-root
             transformed coordinates (the radius is transformed the same
             way); any other value measures distances on the raw values

    Returns a list of (x, y, num_neighbors) tuples, one per unique (x, y)
    pair. The list order follows set iteration order and is not sorted.
    """

    raw_data = np.array([x, y]).transpose()
    x = np.array(x)
    y = np.array(y)

    # Duplicate coordinates are counted only once as focal points.
    unique_points = set(map(tuple, raw_data))

    use_sqrt = scale == 'sqrt'
    if use_sqrt:
        # Previously a bare `sqrt` was called here, which is not defined
        # anywhere in the notebook and raised NameError.
        x_cmp, y_cmp = np.sqrt(x), np.sqrt(y)
        limit = np.sqrt(radius) ** 2
    else:
        x_cmp, y_cmp = x, y
        limit = radius ** 2

    count_data = []
    for a, b in unique_points:
        if use_sqrt:
            a_cmp, b_cmp = np.sqrt(a), np.sqrt(b)
        else:
            a_cmp, b_cmp = a, b
        within = ((x_cmp - a_cmp) ** 2 + (y_cmp - b_cmp) ** 2) <= limit
        count_data.append((a, b, int(np.count_nonzero(within))))
    return count_data



def plot_color_by_pt_dens(x, y, radius, loglog=0, scale=0, plot_obj=None, point_size=10):

    """Plot bivariate relationships with large n using shading for point density

    Inputs:
    x & y -- variables to be plotted
    radius -- the distance within which to count points as neighbors
    loglog -- accepted for backward compatibility; not used by this function
    scale -- forwarded to count_pts_within_radius; 'sqrt' counts neighbors
             on square-root transformed coordinates
    plot_obj -- axes-like object to draw on; a new axes is created when None
    point_size -- marker size passed to scatter

    Each unique point is shaded by the fourth root of its neighbor count,
    subtracted from the maximum and mapped through the 'Greys_r' colormap,
    so the densest points are the darkest. Points are drawn in order of
    increasing density so that dense points end up on top. An unfilled
    outline of every point is drawn first.

    Returns the axes object that was drawn on.
    """

    plot_data = count_pts_within_radius(x, y, radius, scale)
    sorted_plot_data = np.array(sorted(plot_data, key=lambda point: point[2]))

    # Identity test: `== None` would invoke any custom __eq__ on the object.
    if plot_obj is None:
        plot_obj = plt.axes()

    # Outline layer: hollow markers with a dark edge.
    plot_obj.scatter(sorted_plot_data[:, 0],
            sorted_plot_data[:, 1],
            facecolors='none',
            s = point_size,
            edgecolors='0.1',
            linewidths=1.,
            #cmap='Greys'
            )

    # Density layer: filled markers shaded by the transformed neighbor count.
    c = np.array(sorted_plot_data[:, 2])**0.25
    c = np.max(c) - c
    plot_obj.scatter(sorted_plot_data[:, 0],
                    sorted_plot_data[:, 1],
                    c = c,
                    s = point_size,
                    edgecolors='k',
                    linewidths=0.0,
                    cmap='Greys_r',
                    #alpha = 0.5,
                    )

    return plot_obj


----

# 2. Calculating distributional shift (DS)



We define distributional shift (DS) as the concentration of frequencies towards the lowest discrete class, which we measure via the sum of cumulative frequencies.

We begin by simply deriving DS as: 

$$DS = (\sum{F}/n - 1)/(k-1)$$<br><br>

We then refine DS to include exponentiated cumulative frequencies:

$$DS = (\sum{F^{z}}/n^{z} - 1)/(k-1)$$<br><br>

Finally, we refine the exponent (*z*) to take fractional values:

$$DS = (\sum{F^{(k+1)/k}}/n^{(k+1)/k} - 1)/(k-1)$$<br><br>

Below, we provide code that allows users to explore the calculation of DS by varying the number of observations (`n`) and the number of bins (`k`).

In [None]:
n = 5
k = 4

# Enumerate the feasible set: every distribution of n observations among k bins
_set = list(generate_cum_dists(n, k, cum=False))

# Fractional exponent applied to the cumulative frequencies. It depends only
# on k, so it is computed once rather than on every iteration.
z = (k + 1)/k

DS_ls = []
for i, d in enumerate(_set):

    # get cumulative distribution
    cd = np.cumsum(d)

    # DS as text, truncated to 8 characters for display.
    # The result is deliberately not bound to the name `DS`: a `DS` function
    # is defined later in the notebook, and re-running this cell would
    # otherwise replace that function with a string.
    ds_text = str(((sum(cd**z)/(n**z)) - 1) / (k - 1))
    ds_text = ds_text[:8]

    # Show only the members whose normalized cumulative sum is 2.2
    # (tolerant comparison instead of exact float equality).
    if np.isclose(sum(cd)/(n), 2.2):
        #print(i+1, d, tuple(cd), sum(cd**z)/(n**z))
        print(i+1, d, tuple(cd), ds_text)

    DS_ls.append(ds_text)

print('\n')
print(len(DS_ls), 'members of the feasible set for n and k')

# 3. Generate manuscript figures

## Figure 1

In [None]:
# Figure 1: sum of normalized cumulative frequencies (x) against the sum of
# exponentiated, normalized cumulative frequencies (y) across feasible sets.
# Top row: integer exponents z = 2, 4, 7 for n_obs = 10, n_bins = 5.
# Bottom row: fractional exponent z = (n_bins + 1)/n_bins for three
# (n_obs, n_bins) combinations.
n_obs = 10
n_bins = 5
fig = plt.figure(figsize=(10.5, 7))


################  TOP ROW  ###########

# Feasible set of raw (non-cumulative) distributions for the top row
_set = []
for dist in generate_cum_dists(n_obs, n_bins, cum=False):
    _set.append(dist)

plot_num = 1
for z in [2, 4, 7]:

    t1 = []
    t2 = []
    for d in _set:

        # x value: sum of cumulative frequencies divided by their maximum
        cd = [sum(d[:i+1]) for i in range(len(d))]
        ncd = np.array(cd)/max(cd)
        t1.append(sum(ncd))

        # y value: sum of cumulative frequencies raised to z, divided by n_obs**z
        G = [sum(d[:i+1])**(z) for i in range(len(d))]
        G_ = np.array(G)/(n_obs**z)

        t2.append(sum(G_))

    # Unique counts come from a set of floats, so they are subject to
    # floating-point representation.
    print('n:', n_obs, 'k:', n_bins)
    print('cardinality:', len(t1), '| no. of unique ΣF/n:', 
          len(list(set(t1))), '| no. of unique ΣF^z/n^z:', len(list(set(t2))), '\n')

    ax = plt.subplot(2, 3, plot_num)
    plt.scatter(t1, t2, s=1, c='k')
    plt.xlabel(r'$\sum{F/n}$', fontsize= 14)
    plt.ylabel(r'$\sum{F^{' + str(z) + '}/n^{' + str(z) + '}}$', fontsize= 14)

    # Annotation: size of the feasible set and the number of distinct y values
    s = '|A' + r'$_{n' + '=' + str(n_obs) + ', ' + 'k' + '=' + str(n_bins) + '}$' + '| = ' + str(len(_set)) + '\n'
    s += '\nValues of ' + r'$\sum{F^{' + str(z) + '}/n^{' + str(z) + '}}$' + '\n = ' + str(len(list(set(t2))))

    # Text position is given in data coordinates and is hard-coded
    plt.text(1.01, 3.65, s, fontsize=10)
    plt.tick_params(axis='both', labelsize=10)
    plot_num += 1

print('\n')

################  BOTTOM ROW  ###########  

# One feasible set per (n_obs, n_bins) pair; every set is built and kept in
# memory before any plotting starts.
N_obs = [10, 10, 20]
N_bins = [5, 10, 10]
sets = []
for i, n_obs in enumerate(N_obs):
    n_bins = N_bins[i]

    _set = []
    for dist in generate_cum_dists(n_obs, n_bins, cum=False):
        _set.append(dist)
    sets.append(_set)



for i, n_obs in enumerate(N_obs):
    n_bins = N_bins[i]

    _set = sets[i]

    t1 = []
    t2 = []
    for d in _set:

        cd = [sum(d[:ii+1]) for ii in range(len(d))]
        ncd = np.array(cd)/max(cd)
        t1.append(sum(ncd))

        # Fractional exponent that depends on the number of bins
        z = (n_bins + 1)/n_bins
        G = [sum(d[:ii+1])**(z) for ii in range(len(d))]
        G_ = np.array(G)/(n_obs**z)
        t2.append(sum(G_))

    print('n:', n_obs, 'k:', n_bins)
    print('cardinality:', len(t1), '| no. of unique ΣF/n:', 
          len(list(set(t1))), '| no. of unique ΣF^z/n^z:', len(list(set(t2))), '\n')

    # NOTE(review): unlike the top row, the "Values of" annotation below
    # reports len(_set) rather than the computed len(set(t2)) that is printed
    # above -- confirm the two agree before relying on the figure text.
    s = '|A' + r'$_{n' + '=' + str(n_obs) + ', ' + 'k' + '=' + str(n_bins) + '}$' + '| = ' + str(len(_set)) + '\n'
    s += '\nValues of ' + r'$\sum{F_{i}^{' + str(z) + '}/n^{' + str(z) + '}}$' + '\n = ' + str(len(_set))

    # Plot at most 10**5 randomly chosen members; the subsample is drawn
    # without a fixed seed, so it differs between runs.
    if len(_set) > 10**5: 
        indices = np.random.choice(len(_set), 10**5, replace=False)
        t1 = np.array(t1)
        t2 = np.array(t2)
        t1 = t1[indices]
        t2 = t2[indices]
        t1 = t1.tolist()
        t2 = t2.tolist()

    ax = plt.subplot(2, 3, plot_num)
    plt.scatter(t1, t2, s=1, c='k')
    plt.xlabel(r'$\sum{F/n}$', fontsize= 14)
    plt.ylabel(r'$\sum{F^{' + str(z) + '}/n^{' + str(z) + '}}$', fontsize= 14)

    # Panel-specific text placement relative to the plotted data range
    if plot_num == 4:
        plt.text(1. * min(t1), 0.73*max(t2), s, fontsize=10)
    elif plot_num == 5:
        plt.text(1. * min(t1), 0.7*max(t2), s, fontsize=10)
    elif plot_num == 6:
        plt.text(0.9 * min(t1), 0.71*max(t2), s, fontsize=10)

    plt.tick_params(axis='both', labelsize=10)
    plot_num += 1


# Save the figure to disk and close it so it is not rendered inline
fig.patch.set_facecolor('white')
plt.subplots_adjust(hspace=0.45, wspace=0.4)
plt.savefig('Fig1.jpg', bbox_inches='tight', format='jpg', dpi=600)
plt.close()

----
# Comparing RDS to established measures








In [None]:

def histogram_intersection(p, q):
    
    # Calculate histogram intersection
    
    # q is the reference distribution
    # p is the query distribution
    
    minima = np.minimum(p, q)
    hi = np.true_divide(np.sum(minima), np.sum(p))
    
    if hi > 1 or hi < 0:
        print('Error, HI =', hi)
        print(p)
        print(q)
        return
    return hi


def chi_square_distance(p, q):
    """Chi-square distance between two frequency distributions.

    Parameters:
    p -- query distribution (frequencies per bin)
    q -- reference distribution (frequencies per bin)

    Both inputs are normalized to sum to 1 before the distance
    0.5 * sum((p - q)**2 / (p + q)) is computed.

    Bins that are empty in both distributions contribute 0. Previously they
    produced 0/0 = NaN, which turned the whole distance into NaN whenever the
    two histograms shared an empty bin.
    """

    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    p = p / np.sum(p)
    q = q / np.sum(q)

    numer = (p - q) ** 2
    denom = p + q

    # Divide only where the denominator is non-zero; other terms stay 0.
    # (`!= 0` rather than `> 0` so NaN inputs still propagate to the result.)
    terms = np.divide(numer, denom, out=np.zeros_like(numer), where=denom != 0)

    return np.sum(terms) / 2


def kl_divergence(p, q):
    """Kullback-Leibler divergence of query `p` from reference `q`.

    Parameters:
    p -- query distribution (frequencies per bin)
    q -- reference distribution (frequencies per bin)

    Both inputs are copied to float arrays and normalized to sum to 1. The
    natural logarithm is used.

    Bins where p is zero contribute 0 (the usual 0 * log 0 = 0 convention).
    Previously such bins evaluated to NaN and made the whole result NaN.
    Bins where p > 0 but q == 0 still give an infinite divergence.
    """

    p = np.array(p, dtype=float)
    q = np.array(q, dtype=float)

    p /= p.sum()
    q /= q.sum()

    # Restrict the sum to bins with non-zero p
    # (`!= 0` so that NaN inputs still propagate).
    support = p != 0

    # Division by a zero q is expected here and intentionally yields inf.
    with np.errstate(divide='ignore', invalid='ignore'):
        terms = p[support] * np.log(p[support] / q[support])

    kl_div = np.sum(terms)
    return kl_div


def earth_movers_distance(p, q):
    """Earth Mover's Distance between two histograms on the same ordered bins.

    Parameters:
    p -- query distribution (frequencies per bin)
    q -- reference distribution (frequencies per bin)

    Both inputs are copied to float arrays and normalized to sum to 1. The
    distance is the sum of absolute differences between the two cumulative
    distributions, i.e. bins are one unit apart.

    The previous implementation solved a linear sum assignment problem on
    the matrix |P_i - Q_j| of cumulative values. Both cumulative vectors are
    sorted, and for an absolute-difference cost the optimal pairing of two
    sorted sequences is the element-wise pairing, so that minimum equals
    sum(|P - Q|). Computing it directly gives the same value without the
    O(k^3) solver.
    """

    p = np.array(p, dtype=float)
    q = np.array(q, dtype=float)

    # Normalize the distributions to ensure they sum to 1
    p /= p.sum()
    q /= q.sum()

    # Cumulative distributions
    P = np.cumsum(p)
    Q = np.cumsum(q)

    emd = np.sum(np.abs(P - Q))

    return emd


def kolmogorov_smirnov_distance(p, q):
    """Kolmogorov-Smirnov distance between two histograms.

    Parameters:
    p -- query distribution (frequencies per bin)
    q -- reference distribution (frequencies per bin)

    Each input is converted to a float array, normalized to sum to 1 and
    accumulated into a cumulative distribution. The result is the largest
    absolute difference between the two cumulative distributions.
    """

    cdfs = []
    for freqs in (p, q):
        probs = np.array(freqs, dtype=float)
        probs = probs / probs.sum()
        cdfs.append(np.cumsum(probs))

    gap = np.abs(cdfs[0] - cdfs[1])
    return np.max(gap)


def rank_probability_score(p, q):
    """Ranked probability score between two histograms.

    Parameters:
    p -- query distribution (frequencies per bin)
    q -- reference distribution (frequencies per bin)

    Returns the sum of squared differences between the cumulative
    distributions of the normalized inputs.
    """

    def _cdf(freqs):
        # Float copy, normalized to sum to 1, then accumulated.
        probs = np.array(freqs, dtype=float)
        probs /= probs.sum()
        return np.cumsum(probs)

    gap = _cdf(p) - _cdf(q)
    return np.sum(gap**2)


def RDS(p, q):
    """Relative distributional shift of reference `q` with respect to query `p`.

    Parameters:
    p -- query distribution (frequencies per bin)
    q -- reference distribution (frequencies per bin)

    Returns DS(q) - DS(p). The two distributions may have different numbers
    of bins and observations, because each DS is normalized by its own.
    """

    # Delegates to DS so the two functions cannot drift apart.
    return DS(q) - DS(p)


def DS(p):
    """Distributional shift of a frequency distribution.

    Parameters:
    p -- frequencies per bin, ordered from the lowest to the highest class

    With k bins, n observations, cumulative frequencies F and exponent
    z = (k + 1)/k, returns (sum(F**z)/n**z - 1)/(k - 1).

    A single-bin input divides by zero in the final step. The result is kept
    as a numpy float so that this yields NaN (with a runtime warning) rather
    than raising.
    """
    p_bins = len(p)
    p_obs = sum(p)

    z_p = (p_bins + 1)/p_bins

    # Cumulative frequencies via a running total (linear in the number of
    # bins) instead of re-summing a growing slice for every bin.
    running = 0
    powered = []
    for freq in p:
        running += freq
        powered.append(running**z_p)

    Sp = np.sum(np.array(powered)/(p_obs**z_p)) - 1
    ds = Sp/(p_bins - 1)
    return ds


In [None]:
n_obs = 100
n_bins = 5
# Collect every member of the feasible set (raw, non-cumulative frequencies)
feasible_set = list(generate_cum_dists(n_obs, n_bins, cum=False))

num = f"{len(feasible_set):,}"
print('There are', num, 'possible discrete frequency distributions for', n_obs, 'observations distributed among', n_bins, 'bins, when allowing bins to have values of 0.\n')

# Shift of every member. The arithmetic is kept operation-for-operation the
# same, because the unique-value count below depends on the exact floats.
z = (n_bins + 1)/n_bins
ls = []
for d in feasible_set:
    running = 0
    G = []
    for freq in d:
        running += freq
        G.append(running**(z))
    G_ = np.array(G)/(n_obs**z)
    shift = (sum(G_) - 1) / (n_bins - 1)
    ls.append(shift)

num = f"{len(set(ls)):,}"
print('There are ' + num + ' unique values of shift.\n')

print('min shift =', min(ls))
print('max shift =', max(ls))

In [None]:

# Compare RDS with established measures on random pairs of distributions
# drawn from the feasible set.
# NOTE: the pairs come from the `random` module without a fixed seed, so the
# sample differs between runs.
RDS_vals = []
csd = []
ks_dists = []
hist_ints = []
kl_divs = []
em_dists = []
rp_scores = []
d1_ls = []
d2_ls = []
dif_med = []
dif_var = []
dif_skew = []

n_pairs = 10**5
start_time_0 = time.time()
for _ in range(n_pairs):

    d1, d2 = random.sample(feasible_set, 2)

    d1_ls.append(d1)
    d2_ls.append(d2)

    # Relative distributional shift
    RDS_vals.append(RDS(d1, d2))

    # KS distance
    ks_dists.append(kolmogorov_smirnov_distance(d1, d2))

    # Histogram intersection, stored as a dissimilarity
    hist_ints.append(1 - histogram_intersection(d1, d2))

    # chi-square distance
    csd.append(chi_square_distance(d1, d2))

    # KL divergence
    kl_divs.append(kl_divergence(d1, d2))

    # Rank probability score
    rp_scores.append(rank_probability_score(d1, d2))

    # Earth movers distance
    em_dists.append(earth_movers_distance(d1, d2))

    # Expand each histogram into its observations: bin i (1-based) repeated
    # by its frequency.
    v1 = np.repeat(np.arange(1, len(d1) + 1), d1)
    v2 = np.repeat(np.arange(1, len(d2) + 1), d2)

    # Difference in median. This is taken on the expanded observations, like
    # the variance and skewness below; it was previously taken on the bin
    # frequencies themselves, which is not the median of the distribution.
    dif_med.append(np.median(v1) - np.median(v2))

    # difference in variance and skewness
    dif_var.append(np.var(v1) - np.var(v2))
    dif_skew.append(skew(v1) - skew(v2))

num = n_pairs

end_time = time.time()
print('Completed in {:.3f} seconds'.format(end_time - start_time_0))
print(num)

In [None]:
from scipy.stats import linregress
import warnings
warnings.filterwarnings('ignore')

# Figure 3: pairwise scatter plots (lower triangle only) of the measures
# computed on random pairs from the feasible set.
X_lists = [np.sqrt(csd).tolist(), 
           ks_dists, 
           hist_ints, 
           np.sqrt(kl_divs).tolist(), 
           em_dists, 
           np.sqrt(rp_scores).tolist(),
           np.abs(RDS_vals),
          ]

x_labs = [r"$\sqrt{CSD}$",
          'KSD',
          '1-HI',
          r"$\sqrt{KLD}$",
          'EMD',
          r"$\sqrt{RPS}$",
          '|RDS|',
         ]

fig = plt.figure(figsize=(13, 13))

n_metrics = len(X_lists)
for row, y_ls in enumerate(X_lists):
    for col, x_ls in enumerate(X_lists):

        # Only the strictly lower triangle of the grid is drawn. These are
        # the same panels as the former hard-coded list of subplot indices.
        if col >= row:
            continue
        ind = row * n_metrics + col + 1

        # Keep only the pairs where both measures are finite
        x_all = np.asarray(x_ls, dtype=float)
        y_all = np.asarray(y_ls, dtype=float)
        finite = np.isfinite(x_all) & np.isfinite(y_all)
        x = x_all[finite]
        y = y_all[finite]

        plot_color_by_pt_dens(x, y, radius=0.05, loglog=0, plot_obj=plt.subplot(n_metrics, n_metrics, ind), point_size=10)

        slope, intercept, r_val, p_val, std_err = linregress(x, y)
        fitted_vals = slope * np.array(x) + intercept

        s = r'$r^{2}$' + ' = ' + str(round(r_val**2, 2))
        plt.plot(x, fitted_vals, color='k', linewidth=1, label=s)

        plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False, 
                        labelleft=False, labeltop=False, 
                        labelright=False, labelbottom=False,
                       )

        # The legend is used only to display the r^2 text above the panel
        legend = plt.legend(loc='lower center', bbox_to_anchor=(0.1, 0.96), borderaxespad=0., frameon=False)
        legend.set_alpha(0)

        # Remove the line representing data
        for line in legend.get_lines():
            line.set_linestyle('None')
            line.set_marker(None)

        # Label each axis with the measure actually plotted on it: x_ls
        # (column) on the x-axis and y_ls (row) on the y-axis. The two
        # labels were previously swapped.
        plt.xlabel(x_labs[col], fontsize = 10)
        plt.ylabel(x_labs[row], fontsize = 10)


fig.patch.set_facecolor('white')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.savefig('Fig3.jpg', bbox_inches='tight', format='jpg', dpi=600)
plt.close()


In [None]:

# Figure 4: signed RDS plotted against each established measure
X_lists = [ks_dists, 
           em_dists, 
           np.sqrt(rp_scores).tolist(),
           np.sqrt(csd).tolist(), 
           hist_ints, 
           np.sqrt(kl_divs).tolist(),
          ]

x_labs = ['Kolmogorov-Smirnov distance',
          'Earth Mover Distance',
          r"$\sqrt{RPS}$",
          r"$\sqrt{CSD}$",
          '1 - Histogram Intersection',
          r"$\sqrt{KLD}$",
          ]

fig = plt.figure(figsize=(11, 11))

# Horizontal position of the r^2 annotation in each panel (data coordinates)
text_x_vals = [0.05, 0.15, 0.05, 
               0.05, 0.05, 0.1]

for i, x_ls in enumerate(X_lists):

    # Keep pairs whose measure is positive and below 10**10
    # (NaN and inf fail these comparisons and are dropped).
    xv = []
    yv = []
    for val, rds in zip(x_ls, RDS_vals):
        if val > 0 and val < 10**10:
            xv.append(val)
            yv.append(rds)

    plot_color_by_pt_dens(xv, yv, radius=0.05, loglog=0, plot_obj=plt.subplot(3, 3, i+1), point_size=10)

    # r^2 is computed against |RDS| while the panel shows signed RDS.
    # A fitted line was previously computed here from `x`, a variable left
    # over from another cell; it was never drawn and made this cell fail on
    # a fresh kernel, so it has been removed.
    slope, intercept, r_val, p_val, std_err = linregress(xv, np.abs(yv))
    s = r'$r^{2}$' + ' = ' + str(round(r_val**2, 2))

    plt.text(text_x_vals[i], 0.7, s, fontsize=12)
    plt.xlabel(x_labs[i], fontsize= 12)
    plt.ylabel('RDS', fontsize= 12)
    plt.tick_params(axis='both', labelsize=10)

fig.patch.set_facecolor('white')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.savefig('Fig4.jpg', bbox_inches='tight', format='jpg', dpi=600)
plt.close()

In [None]:
# Worked example: a distribution with mass at both ends (f1) versus one
# concentrated in the middle bin (f2).
f1 = [21, 2, 0, 2, 21]
f2 = [1, 1, 42, 1, 1]
print('f1:', f1)
print('f2:', f2, '\n')

ds_f1 = round(DS(f1),3)
print('DS(f1):', ds_f1)
ds_f2 = round(DS(f2),3)
print('DS(f2):', ds_f2, '\n')

rds_f1_f2 = round(RDS(f1, f2),3)
print('RDS = ', rds_f1_f2)

# (A normalized rank probability score was once printed here; it relied on
# reference distributions mf1/mf2 that are not defined in this notebook.)
hi_dissimilarity = round(1 - histogram_intersection(f1, f2),3)
print('1 - Histogram Intersection = ', hi_dissimilarity)

d = chi_square_distance(f1, f2)
print('Chi-square distance = ', d)

In [None]:
# Rate parameter of the Poisson distribution
lambda_parameter = 5  # Adjust this parameter as needed

# Build 10**5 five-bin histograms, each from 100 Poisson draws.
# NOTE(review): np.histogram picks the bin edges from each sample's own
# range, so bin i need not cover the same values in different histograms --
# confirm this is intended.
dists = []
for _ in range(10**5):
    data = np.random.poisson(lambda_parameter, 100)
    counts, bins = np.histogram(data, bins=5, density=False)
    hist_vals = counts.tolist()
    dists.append(hist_vals)


In [None]:
# Same comparison of measures as for the feasible set, now on random pairs
# of the Poisson-derived histograms in `dists`.
RDS_vals, csd, ks_dists = [], [], []
hist_ints, kl_divs = [], []
em_dists, rp_scores = [], []
d1_ls, d2_ls = [], []

n_pairs = 10**5
start_time_0 = time.time()
for _ in range(n_pairs):

    d1, d2 = random.sample(dists, 2)
    d1_ls.append(d1)
    d2_ls.append(d2)

    # Relative distributional shift
    RDS_vals.append(RDS(d1, d2))

    # KS distance
    ks_dists.append(kolmogorov_smirnov_distance(d1, d2))

    # Histogram intersection, stored as a dissimilarity
    hist_ints.append(1 - histogram_intersection(d1, d2))

    # chi-square distance
    csd.append(chi_square_distance(d1, d2))

    # KL divergence
    kl_divs.append(kl_divergence(d1, d2))

    # Rank probability score
    rp_scores.append(rank_probability_score(d1, d2))

    # Earth movers distance
    em_dists.append(earth_movers_distance(d1, d2))

num = n_pairs

end_time = time.time()
print('Completed in {:.3f} seconds'.format(end_time - start_time_0), '\n')
print(num)

In [None]:
from scipy.stats import linregress
import warnings
warnings.filterwarnings('ignore')

# Figure 5: pairwise scatter plots (lower triangle only) of the measures
# computed on random pairs of Poisson-derived histograms.
lists = [np.sqrt(csd).tolist(),
         ks_dists, 
         hist_ints,
         np.sqrt(kl_divs).tolist(), 
         em_dists, 
         np.sqrt(rp_scores).tolist(),
         np.abs(RDS_vals),
        ]

labs = [r"$\sqrt{CSD}$",
          'KSD',
          '1-HI',
          r"$\sqrt{KLD}$",
          'EMD',
          r"$\sqrt{RPS}$",
          '|RDS|',
         ]

fig = plt.figure(figsize=(13, 13))

n_metrics = len(lists)
for row, y_ls in enumerate(lists):
    for col, x_ls in enumerate(lists):

        # Only the strictly lower triangle of the grid is drawn. These are
        # the same panels as the former hard-coded list of subplot indices.
        if col >= row:
            continue
        ind = row * n_metrics + col + 1

        # Keep only the pairs where both measures are finite
        x_all = np.asarray(x_ls, dtype=float)
        y_all = np.asarray(y_ls, dtype=float)
        finite = np.isfinite(x_all) & np.isfinite(y_all)
        x = x_all[finite]
        y = y_all[finite]

        plot_color_by_pt_dens(x, y, radius=0.05, loglog=0, plot_obj=plt.subplot(n_metrics, n_metrics, ind), point_size=10)

        slope, intercept, r_val, p_val, std_err = linregress(x, y)
        fitted_vals = slope * np.array(x) + intercept

        s = r'$r^{2}$' + ' = ' + str(round(r_val**2, 2))
        plt.plot(x, fitted_vals, color='k', linewidth=1, label=s)

        plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False, 
                        labelleft=False, labeltop=False, 
                        labelright=False, labelbottom=False,
                       )

        # The legend is used only to display the r^2 text above the panel
        legend = plt.legend(loc='lower center', bbox_to_anchor=(0.1, 0.96), borderaxespad=0., frameon=False)
        legend.set_alpha(0)

        # Hide the legend's line sample so only the text remains
        for line in legend.get_lines():
            line.set_linestyle('None') 
            line.set_marker(None)

        # Label each axis with the measure actually plotted on it: x_ls
        # (column) on the x-axis and y_ls (row) on the y-axis. The two
        # labels were previously swapped.
        plt.xlabel(labs[col], fontsize = 10)
        plt.ylabel(labs[row], fontsize = 10)


fig.patch.set_facecolor('white')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.savefig('Fig5.jpg', bbox_inches='tight', format='jpg', dpi=600)
plt.close()


In [None]:

# Figure 6: signed RDS plotted against each established measure
# (Poisson-derived histograms)
X_lists = [ks_dists, 
           em_dists, 
           np.sqrt(rp_scores).tolist(),
           np.sqrt(csd).tolist(), 
           hist_ints, 
           np.sqrt(kl_divs).tolist(),
          ]

x_labs = ['Kolmogorov-Smirnov distance',
          'Earth Mover Distance',
          r"$\sqrt{RPS}$",
          r"$\sqrt{CSD}$",
          '1 - Histogram Intersection',
          r"$\sqrt{KLD}$",
          ]

fig = plt.figure(figsize=(11, 11))

# Horizontal position of the r^2 annotation in each panel (data coordinates)
text_x_vals = [0.02, 0.05, 0.02, 
               0.02, 0.02, 0.05]

for i, x_ls in enumerate(X_lists):

    # Keep pairs whose measure is positive and below 10**10
    # (NaN and inf fail these comparisons and are dropped).
    xv = []
    yv = []
    for val, rds in zip(x_ls, RDS_vals):
        if val > 0 and val < 10**10:
            xv.append(val)
            yv.append(rds)

    plot_color_by_pt_dens(xv, yv, radius=0.05, loglog=0, plot_obj=plt.subplot(3, 3, i+1), point_size=10)

    # r^2 is computed against |RDS| while the panel shows signed RDS.
    # A fitted line was previously computed here from `x`, a variable left
    # over from another cell; it was never drawn and made this cell fail on
    # a fresh kernel, so it has been removed.
    slope, intercept, r_val, p_val, std_err = linregress(xv, np.abs(yv))
    s = r'$r^{2}$' + ' = ' + str(round(r_val**2, 2))

    plt.text(text_x_vals[i], 0.35, s, fontsize=12)
    plt.xlabel(x_labs[i], fontsize= 12)
    plt.ylabel('RDS', fontsize= 12)
    plt.tick_params(axis='both', labelsize=10)

fig.patch.set_facecolor('white')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.savefig('Fig6.jpg', bbox_inches='tight', format='jpg', dpi=600)
plt.close()

# Ecological analysis

## Analyzing species rarity
 
The analysis of shift is highly relevant to the study of species-abundance distributions (SADs) (Fig 8a) [10, 12, 14, 21]. These histograms of species abundance underpin thousands of ecological studies spanning all domains of life and major habitats [10, 12, 14, 21]. SADs are often compared to theoretical predictions, to SADs sampled from other communities of similar taxa, and are the basis for many measures of species dominance, diversity, evenness, and rarity [9, 10, 12, 14, 21]. Consequently, the field of ecology is replete with techniques for analyzing SADs and for quantifying the aspects of biodiversity they contain.


RDS provides a means of quantifying the magnitude and direction by which one SAD is concentrated toward lesser or greater abundances relative to another SAD.

Beyond providing yet another means of comparing SADs, DS and RDS provide an intuitive and easily interpretable means of quantifying species rarity, i.e., the concentration of species at low abundances [14, 21]. 

However, 

As the concentration of frequencies away from the discrete class having the greatest value, DS is highly similar in concept to species rarity. Consequently, given its bounded values and intuitive interpretation, we asked whether DS provides a preferable measure of species rarity, in particular because skewness-based measures are not bounded.

Unlike skewness-based measures, RDS is bounded, intuitive, and directly reflects the concentration of species towards the class of lowest abundance. Likewise, RDS represents a better comparative metric of species rarity for the same reasons.

In [None]:
def Rlogskew(sad):
    '''
    Calculation of rarity used in:
        A.E. Magurran and B.J. McGill, eds. Biological diversity: frontiers in measurement and assessment. 
        OUP Oxford, 2010.

    Parameters:
    sad -- sequence of abundances; values are log10-transformed, so they
           are expected to be positive

    Returns the skewness of the log10 abundances, multiplied by the
    correction factor (S/(S - 2)) * sqrt((S - 1)/S), where S = len(sad).

    Raises ValueError when S <= 2, because the correction factor is
    undefined there. This branch previously called sys.exit() without
    `sys` being imported, so it failed with NameError instead.
    '''

    S = len(sad)

    if S <= 2.0:
        raise ValueError('S <= 2, cannot compute log-skew')

    log_abund = np.log10(sad)
    deviations = log_abund - np.mean(log_abund)

    # Third and second central moments of the log abundances
    num = np.mean(deviations**3.0)
    denom = np.mean(deviations**2.0)

    t1 = num/(denom**(3.0/2.0))
    t2 = (S/(S - 2.0)) * np.sqrt((S - 1.0)/S)

    return t1 * t2


def get_RADs(path, name, closedref=True):
    """Read abundance vectors (one per site) from `<path><name>-data.txt`.

    Parameters:
    path -- directory prefix; concatenated directly with the file name, so
            it must end with a path separator
    name -- dataset name; the file read is `<name>-data.txt`
    closedref -- when true, and the dataset is not 'GENTRY', records with
                 any field containing 'unclassified' or 'unidentified' are
                 skipped

    Each non-blank line is split on whitespace. The first field is the site
    and the last field is the abundance. Only abundances greater than zero
    are kept.

    Returns a list of abundance lists, one per site with at least 10
    retained abundances, in order of first appearance of each site.
    """

    RADdict = {}
    DATA = path + name + '-data.txt'
    excluded_labels = ('unclassified', 'unidentified')

    with open(DATA) as f:

        for line in f:

            fields = line.split()
            if not fields:
                continue

            # Closed-reference filter. The original `continue` statements
            # sat inside a loop over the fields, so they only advanced that
            # inner loop and no record was ever skipped.
            if name != 'GENTRY' and closedref:
                if any(label in field for field in fields for label in excluded_labels):
                    continue

            site = fields[0]
            abundance = float(fields[-1])

            if abundance > 0:
                RADdict.setdefault(site, []).append(abundance)

    return [rad for rad in RADdict.values() if len(rad) >= 10]



def EMP_RADs(path, name):
    """Load abundance vectors from `<path>/EMPclosed-SADs.txt`.

    Parameters:
    path -- directory containing the file
    name -- dataset name; accepted but not used, the file name is fixed

    Each line of the file is evaluated as a Python expression and kept when
    the resulting sequence has at least 10 elements.

    Returns the list of retained sequences in file order.
    """

    min_richness = 10
    source = path + '/EMPclosed-SADs.txt'
    # NOTE(review): the file name is always 'EMPclosed-SADs.txt', even when
    # the caller passes name == 'EMPopen' -- confirm this is intended.

    kept = []
    with open(source) as handle:
        for line in handle:
            # SECURITY: eval() executes whatever the line contains. Only use
            # this on trusted data files; ast.literal_eval would be a safer
            # parser if every line is a plain list literal.
            parsed = eval(line)
            if len(parsed) < min_richness:
                continue
            kept.append(parsed)

    return kept

def Louca_RADs(path, name):
    """Read abundance vectors (one per site) from `<path>SSADdata.txt`.

    Parameters:
    path -- directory prefix; concatenated directly with the file name
    name -- dataset name; accepted but not used

    Each non-blank line is split on whitespace. The second field is the
    site and the last field is the abundance. Only abundances greater than
    zero are kept.

    Returns a list of abundance lists, one per site with at least 10
    retained abundances.
    """

    by_site = {}

    with open(path + 'SSADdata.txt') as handle:
        for row in handle:
            if not row.strip():
                continue

            cols = row.split()
            site = cols[1]
            abundance = float(cols[-1])

            # Written as `not > 0` so a NaN abundance is skipped as well.
            if not abundance > 0:
                continue

            if site not in by_site:
                by_site[site] = []
            by_site[site].append(abundance)

    return [rad for rad in by_site.values() if len(rad) >= 10]


def NSECF(p):
    """Normalized sum of exponentiated cumulative frequencies, minus 1.

    Parameters:
    p -- frequencies per bin

    With k bins, n observations, cumulative frequencies F and exponent
    z = (k + 1)/k, returns sum(F**z)/n**z - 1. This is the same quantity as
    DS before its division by (k - 1).
    """
    n_bins = len(p)
    n_obs = sum(p)
    exponent = (n_bins + 1)/n_bins

    # Running total gives the cumulative frequency of each bin in turn
    running = 0
    powered = []
    for freq in p:
        running += freq
        powered.append(running**(exponent))

    scaled = np.array(powered)/(n_obs**exponent)
    return np.sum(scaled) - 1
    
                
def _log_modulo(value):
    """Return sign(value) * log10(|value| + 1) as a Python float."""
    magnitude = np.log10(np.abs(float(value)) + 1)
    if value < 0:
        magnitude = magnitude * -1
    return float(magnitude)


def getMetrics():
    """Compute rarity metrics for every abundance vector of every dataset.

    Datasets are the entries of 'data/ecological/micro' followed by those
    of 'data/ecological/macro'; '.DS_Store' and 'MGRAST' are skipped. For
    each dataset the abundance vectors are loaded with the matching reader
    (EMP_RADs, Louca_RADs or get_RADs). One line per retained vector is
    written to '<dataset folder>/<name>-SADMetricData.txt', which is
    overwritten.

    A vector is retained when, after dropping non-positive abundances, it
    has at least 10 values and they are not all equal.

    Returns ten parallel lists: dataset name, kind ('micro'/'macro'), total
    abundance N, richness S, skewness, log-skew (Rlogskew), log-modulo
    skewness, log-modulo log-skew, DS and NSECF. DS and NSECF are computed
    on a histogram of log2 abundances with unit-width bins starting at 0.
    """

    name_ls = []
    kind_ls = []
    N_ls = []
    S_ls = []
    skew_ls = []
    logskew_ls = []
    log_mod_skew_ls = []
    log_mod_skew_log_ls = []
    ds_ls = []
    nsecf_ls = []

    datasets = []
    for kind in ('micro', 'macro'):
        for name in os.listdir('data/ecological/' + kind):
            datasets.append([name, kind])

    for name, kind in datasets:

        if name == '.DS_Store' or name == 'MGRAST': 
            continue

        folder = 'data/ecological/'+kind+'/'+name+'/'

        # Pick the reader that matches the dataset's file layout
        if kind == 'macro':
            RADs = get_RADs(folder, name)
        elif name == 'EMPclosed' or name == 'EMPopen':
            RADs = EMP_RADs(folder, name)
        elif name == 'Louca':
            RADs = Louca_RADs(folder, name)
        else:
            RADs = get_RADs(folder, name)

        print(kind, name, len(RADs))

        # The context manager closes the output file even if a metric raises
        with open(folder + name + '-SADMetricData.txt', 'w+') as OUT:

            for RAD in RADs:

                RAD = [x for x in RAD if x > 0]

                N = sum(RAD)
                S = len(RAD)

                if S < 10: 
                    continue
                if max(RAD) == min(RAD): 
                    continue

                # Measures of Rarity

                # 1. skewness of abundances
                skewness = skew(RAD)

                # 2. log-modulo transformation of skewness
                log_mod_skew = _log_modulo(skewness)

                # 3. skewness of log-transformed abundances
                logskew = Rlogskew(RAD)

                # 4. log-modulo transformation of the skewness of
                # log-transformed abundances. The sign now follows logskew
                # itself; it was previously taken from `skewness`.
                log_mod_skew_log = _log_modulo(logskew)

                # 5. Distributional shift (DS)
                # Histogram of log2 abundances with unit-width bins from 0
                # up to the ceiling of the largest log2 abundance.
                abundances = np.log2(RAD)
                max_abundance = np.ceil(max(abundances))
                bins = np.arange(0, max_abundance + 1, 1)
                hist, bin_edges = np.histogram(abundances, bins=bins)
                bin_heights = hist.tolist()
                ds = DS(bin_heights)

                # 6. Normalized sums of exponentiated cumulative frequencies
                nsecf = NSECF(bin_heights)

                print(name, kind, N, S, skewness, logskew, log_mod_skew, log_mod_skew_log, ds, nsecf, file=OUT)

                name_ls.append(name)
                kind_ls.append(kind)
                N_ls.append(N)
                S_ls.append(S)
                skew_ls.append(skewness)
                logskew_ls.append(logskew)
                log_mod_skew_ls.append(log_mod_skew)
                log_mod_skew_log_ls.append(log_mod_skew_log)
                ds_ls.append(ds)
                nsecf_ls.append(nsecf)

    return name_ls, kind_ls, N_ls, S_ls, skew_ls, logskew_ls, log_mod_skew_ls, log_mod_skew_log_ls, ds_ls, nsecf_ls


In [None]:
# Compute the metrics for every dataset and assemble them into one table
(name_ls, kind_ls, N_ls, S_ls, skew_ls, logskew_ls,
 log_mod_skew_ls, log_mod_skew_log_ls, ds_ls, nsecf_ls) = getMetrics()

# Column names paired with their values, in display order
metric_columns = [
    ('name', name_ls),
    ('kind', kind_ls),
    ('N', N_ls),
    ('S', S_ls),
    ('skew', skew_ls),
    ('logskew', logskew_ls),
    ('log_mod_skew', log_mod_skew_ls),
    ('log_mod_skew_log', log_mod_skew_log_ls),
    ('DS', ds_ls),
    ('NSECF', nsecf_ls),
]
main_df = pd.DataFrame(dict(metric_columns))

print(main_df.shape)

dataset_names = main_df['name'].unique().tolist()
print(dataset_names)
main_df.head()

In [None]:
print(main_df.shape)

# Build a resampled table `tdf`: for each of `its` rounds, draw rows with
# replacement from every dataset so that large datasets do not dominate.
# NOTE: sampling is unseeded, so `tdf` differs between runs.
its = 20
names = ['CHU', 'CATLIN', 'BOVINE', 'LAUB', 'SED', 'BIGN', 
         'CHINA', 'TARA', 'HMP', 'FUNGI', 'Louca', 'HUMAN', 
         'HYDRO', 'EMPclosed', 'FIA', 'CBC', 'MCDB', 'BBS', 
         'GENTRY']

# Dataset groups (defined once; they do not change between iterations)
small = ['BIGN', 'BOVINE', 'CHU', 'LAUB', 'SED']
big = ['HUMAN', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO']

# Samples are collected in a list and concatenated once at the end, instead
# of re-concatenating a growing frame on every iteration.
samples = []
for it in range(its):

    for name in names:

        tdf_nm = main_df[main_df['name'] == name]
        kind = tdf_nm['kind'].iloc[0]

        # Rows drawn per dataset and round: 1000 for Louca, 100 for macro
        # datasets, 20 for the small micro datasets, and 50 for all other
        # micro datasets (including those in `big` and TARA).
        if name == 'Louca':
            sample_size = 1000
        elif kind == 'macro':
            sample_size = 100
        elif name in small:
            sample_size = 20
        else:
            sample_size = 50

        samples.append(tdf_nm.sample(sample_size, replace=True))

tdf = pd.concat(samples)
            

In [None]:
# Fig 8: regress each rarity metric on log10(N) across the resampled rows in
# `tdf`, print the fit summaries, and draw one panel per metric.
# (numpy, pyplot and linregress are imported in the notebook's first cell.)

tdf.dropna(subset=['DS'], how='any', inplace=True)

# Log scale the 'N' values
log_N = np.log10(tdf['N'])

# log_mod_skew values
log_mod_skew = tdf['log_mod_skew']

# DS values
DS_vals = tdf['DS']

# NSECF
NSECF_vals = tdf['NSECF']

# Plain arrays for linregress
X = log_N.values
y1 = log_mod_skew.values
y2 = DS_vals.values
y3 = NSECF_vals.values


def _report_fit(metric_name, y, blank_line_after=True):
    """Fit y against X by least squares, print the summary and return the fit."""
    fit = linregress(X, y)
    print("Regression for " + metric_name + ":")
    print("Slope:", fit.slope)
    print("Intercept:", fit.intercept)
    if blank_line_after:
        print("r2:", fit.rvalue ** 2, '\n')
    else:
        print("r2:", fit.rvalue ** 2)
    return fit


# Each regression is fitted once here and reused for the matching panel below.
slope1, intercept1, r_value1, p1, se1 = _report_fit('log_mod_skew', y1)
slope2, intercept2, r_value2, p2, se2 = _report_fit('DS', y2)
slope3, intercept3, r_value3, p3, se3 = _report_fit('NSECF', y3, blank_line_after=False)

# Plotting
fig = plt.figure(figsize=(10, 2.5))
x_lab = 'log(N)'


def _draw_panel(position, y, slope, intercept, r_val, y_label, title=None):
    """Draw the density-coloured scatter and fitted line in subplot `position`."""
    plot_color_by_pt_dens(X, y, radius=0.05, loglog=0, plot_obj=plt.subplot(1, 3, position), point_size=10)
    fitted_vals = slope * np.array(X) + intercept
    # r squared: superscript 2 (the former '$r_{2}$' rendered a subscript)
    s = r'$r^{2}$' + ' = ' + str(round(r_val**2, 2))
    plt.plot(X, fitted_vals, color='k', linewidth=2, label=s)
    plt.xlabel(x_lab, fontweight='bold')
    plt.ylabel(y_label, fontweight='bold')
    if title is not None:
        plt.title(title)
    plt.legend(frameon=False)


# Panel 1: log_mod_skew. The title restates the fit of y1 on log10(N) as a
# power law in N.
_draw_panel(1, y1, slope1, intercept1, r_value1, 'log-modulo skewness',
            title=r'Rarity = '+str(round(10**intercept1,2))+'*'+r'$N$'+'$^{'+str(round(slope1,2))+'}$')

# Panel 2: plots the NSECF column (y3).
# NOTE(review): the axis label calls this 'DS, non-standardized' -- confirm
# that NSECF is the intended quantity for that label.
# The fit is linear in log10(N), so the title says log(N); the sign shown
# before |intercept| now follows the fitted intercept instead of always '-'.
intercept_sign = ' - ' if intercept3 < 0 else ' + '
_draw_panel(2, y3, slope3, intercept3, r_value3, 'DS, non-standardized',
            title=r'Rarity = '+str(round(slope3,2))+'*log('+r'$N$'+')'+intercept_sign+str(abs(round(intercept3,2))))

# Panel 3: DS
_draw_panel(3, y2, slope2, intercept2, r_value2, 'DS, standardized')


fig.patch.set_facecolor('white')
plt.subplots_adjust(wspace=0.45, hspace=0.35)
plt.savefig('Fig8.jpg', bbox_inches='tight', format='jpg', dpi=600)
plt.close()


#   

# ECONOMIC ANALYSIS

## FAMILY INCOME AND POVERTY BY RACE/ETHNICITY


## Family income data

In [None]:
# Build `main_df`: one row per year, with the family-income bracket counts of
# every race/ethnicity table merged side by side as
# '<bracket> - <GROUP>' columns.

# Sorted so that the column order does not depend on the directory listing order.
datasets = sorted(os.listdir('data/economic/USCB/FAMILY_INCOME_IN_THE_PAST_12_MONTHS/data/'))

main_df = pd.DataFrame(columns=['Year'])

yrs = ['5Y2010', '5Y2011', '5Y2012', '5Y2013', '5Y2014', 
       '5Y2015', '5Y2016', '5Y2017', '5Y2018', '5Y2019', '5Y2020', 
       '5Y2021', '5Y2022']

# Householder groups recognised in a table title. Every entry is tested and
# the LAST match wins; this matters because the title containing
# 'WHITE ALONE, NOT HISPANIC OR LATINO HOUSEHOLDER' also contains
# 'HISPANIC OR LATINO HOUSEHOLDER', which appears earlier in the list.
R_E = ['TWO OR MORE RACES HOUSEHOLDER',
       'BLACK OR AFRICAN AMERICAN ALONE HOUSEHOLDER',
       'WHITE ALONE HOUSEHOLDER',
       'ASIAN ALONE HOUSEHOLDER',
       'HISPANIC OR LATINO HOUSEHOLDER',
       'WHITE ALONE, NOT HISPANIC OR LATINO HOUSEHOLDER',
       'NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE HOUSEHOLDER',
       'AMERICAN INDIAN AND ALASKA NATIVE ALONE HOUSEHOLDER',
      ]

# Columns whose header matches any of these are not bracket counts.
unwanted_columns = 'Margin of Error|Median|MEDIAN|Mean|MEAN|IMPUTED'


def _householder_group(title_line):
    """Return the householder group named in a table title, or 'ALL' if none is."""
    title = title_line.upper()
    group = None
    for r_e in R_E:
        if r_e in title:
            group = r_e
    return 'ALL' if group is None else group


# Iterate over each year to merge that year's files and add a 'Year' column
for yr in yrs:
    tdf = pd.DataFrame(columns=['Year'])

    files = [d for d in datasets if yr in d]

    for file in files:
        # Read CSV and specify the header row
        df = pd.read_csv('data/economic/USCB/FAMILY_INCOME_IN_THE_PAST_12_MONTHS/data/'+file, header=0)
        df = df.reset_index(drop=True)

        # The matching notes file shares the data file's name stem
        lab = file.replace('-Data.csv', '') + '-Table-Notes.txt'
        path = 'data/economic/USCB/FAMILY_INCOME_IN_THE_PAST_12_MONTHS/notes/'+lab

        # The sixth line of the notes file is used as the table title
        with open(path, "r") as notes_file:
            first_six_lines = [notes_file.readline() for _ in range(6)]

        lab1 = first_six_lines[-1].upper()
        lab2 = _householder_group(lab1)

        # The first data row carries the column labels: promote it to the
        # header, then drop it
        df.columns = df.iloc[0]
        df = df.drop(0).reset_index(drop=True)

        df = df.drop(columns=['Geography', 'Geographic Area Name'])
        df = df.dropna(axis=1, how='all')
        df = df.drop(columns=df.filter(regex=unwanted_columns).columns)

        # Literal (non-regex) clean-up of the remaining headers
        df.columns = df.columns.str.replace('Total:', 'Total', regex=False)
        df.columns = df.columns.str.replace("Estimate!!", "", regex=False)
        df.columns = df.columns.str.replace("Total!!", "", regex=False)

        df.columns = [col + ' - ' + lab2 for col in df.columns]
        df['Year'] = yr[2:]

        df.columns = df.columns.str.replace(" HOUSEHOLDER", "", regex=False)
        df.columns = df.columns.str.replace(" ALONE", "", regex=False)

        # Merge files for same year
        tdf = tdf.merge(df, how='outer', on='Year')
        tdf.reset_index(drop=True, inplace=True)

    # concat different years
    try:
        main_df = pd.concat([main_df, tdf])
    except Exception as err:
        # Stay best-effort, but say so: a silently missing year would leave
        # main_df shorter than the tables later combined with it.
        print('WARNING: could not append ' + yr + ' to main_df: ' + repr(err))

print(main_df.shape)
main_df.head(main_df.shape[0])

In [None]:
# For every race/ethnicity group, compute DS over all family-income brackets
# and over the lower, middle and upper brackets separately, plus the share of
# families below $25K and at or above $200K.

suf_ls = ['WHITE',
          'HISPANIC OR LATINO',
          'WHITE, NOT HISPANIC OR LATINO',
          'BLACK OR AFRICAN AMERICAN',
          'ASIAN',
          'ALL',
          'NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER',
          'AMERICAN INDIAN AND ALASKA NATIVE',
         ]

# Income brackets, lowest to highest
income_brackets = ['Less than $10,000',
                   '$10,000 to $14,999',
                   '$15,000 to $19,999',
                   '$20,000 to $24,999',
                   '$25,000 to $29,999',
                   '$30,000 to $34,999',
                   '$35,000 to $39,999',
                   '$40,000 to $44,999',
                   '$45,000 to $49,999',
                   '$50,000 to $59,999',
                   '$60,000 to $74,999',
                   '$75,000 to $99,999',
                   '$100,000 to $124,999',
                   '$125,000 to $149,999',
                   '$150,000 to $199,999',
                   '$200,000 or more',
                  ]

# Output-column suffix -> slice of income_brackets it summarises
bracket_groups = {
    '_DS_lower': slice(0, 4),     # below $25,000
    '_DS_middle': slice(4, 12),   # $25,000 to $99,999
    '_DS_upper': slice(12, 16),   # $100,000 and above
}


def _ds_per_row(frame, columns):
    """Return DS of each row's values in `columns` (as a list), in row order."""
    return [DS(counts) for counts in frame[columns].to_numpy().tolist()]


for suf in suf_ls:
    print(suf)
    cols = [bracket + ' - ' + suf for bracket in income_brackets]

    # Convert the bracket columns to float before any arithmetic
    main_df[cols] = main_df[cols].astype(float)
    main_df[suf+'_DS'] = _ds_per_row(main_df, cols)

    total = main_df['Total - ' + suf].astype(float)
    main_df[suf + ' < $25K'] = 100 * (main_df[cols[0]] + main_df[cols[1]] + main_df[cols[2]] + main_df[cols[3]]) / total
    main_df[suf + ' ≥ $200K'] = 100 * (main_df[cols[-1]]) / total

    # DS within the lower, middle and upper brackets
    for ds_suffix, group in bracket_groups.items():
        main_df[suf + ds_suffix] = _ds_per_row(main_df, cols[group])

main_df.head(main_df.shape[0])

## Poverty data

In [None]:
# Build `df2` (one row per year of poverty counts by race/ethnicity) and add
# a '<GROUP> - % Poverty' column to main_df for every group.

datasets = []
for name in os.listdir('data/economic/USCB/POVERTY/data/'):
    datasets.append(name)

df2 = pd.DataFrame(columns=['Year'])

yrs = ['1Y2010', '1Y2011', '1Y2012', '1Y2013', 
       '1Y2014', '1Y2015', '1Y2016', 
       '5Y2017', '5Y2018', '5Y2019', '5Y2020', 
       '5Y2021', '5Y2022',
      ]

# Literal header substitutions, applied in this order so that every file
# ends up with the same column names.
header_substitutions = [
    ('One race!!White', 'White'),
    ('One race!!Black or African American', 'Black or African American'),
    ('One race!!American Indian and Alaska Native', 'American Indian and Alaska Native'),
    ('One race!!Asian', 'Asian'),
    ('One race!!Native Hawaiian and Other Pacific Islander', 'Native Hawaiian and Other Pacific Islander'),
    ('White alone', 'White'),
    ('Black or African American alone', 'Black or African American'),
    ('American Indian and Alaska Native alone', 'American Indian and Alaska Native'),
    ('Asian alone', 'Asian'),
    ('Native Hawaiian and Other Pacific Islander alone', 'Native Hawaiian and Other Pacific Islander'),
    # Matched literally. Written as the regex "\(of any race\)" this relies on
    # str.replace treating the pattern as a regex, which is no longer the
    # default in pandas 2.0+.
    ('(of any race)', ''),
    ('Estimate!!Total!!', 'Total!!Estimate!!'),
    ('Estimate!!Below poverty level!!', 'Below poverty level!!Estimate!!'),
    ('Population for whom poverty status is determined!!', ''),
    ('RACE AND HISPANIC OR LATINO ORIGIN!!', ''),
]

# Columns whose header matches any of these fragments are dropped.
unwanted_fragments = ['Margin of Error', 'Median', 'MEDIAN', 'Mean', 'MEAN',
                      'IMPUTED', 'years', 'Female', 'Male', 'housing', 'ork',
                      'raduate', 'egree', 'ndividuals', 'NDIVIDUALS', 'mploy',
                      'other', 'or more race', 'One race', 'Percent']
unwanted_columns = '|'.join(unwanted_fragments)

# Iterate over each year to merge that year's files and add a 'Year' column
for yr in yrs:
    tdf = pd.DataFrame(columns=['Year'])

    files = [d for d in datasets if yr in d]

    for file in files:
        # Read CSV file and specify header row
        df = pd.read_csv('data/economic/USCB/POVERTY/data/'+file, header=0)
        df = df.reset_index(drop=True)

        # The first data row carries the column labels: promote it to the
        # header, then drop it
        df.columns = df.iloc[0]
        df = df.drop(0).reset_index(drop=True)

        for old, new in header_substitutions:
            df.columns = df.columns.str.replace(old, new, regex=False)

        df = df.drop(columns=['Geography', 'Geographic Area Name'])
        df = df.drop(columns=df.filter(regex=unwanted_columns).columns)
        df = df.dropna(how='all', axis=1)

        df['Year'] = yr[2:]

        # Merge files for same year
        tdf = tdf.merge(df, how='outer', on='Year')
        tdf.reset_index(drop=True, inplace=True)

    # Now concat different years
    try:
        df2 = pd.concat([df2, tdf])
    except Exception as err:
        # Stay best-effort, but say so: a silently missing year would leave
        # df2 shorter than main_df.
        print('WARNING: could not append ' + yr + ' to df2: ' + repr(err))

print(df2.shape)


suf1_ls = ['White',
           'Hispanic or Latino origin ',
           'White, not Hispanic or Latino',
           'Black or African American',
           'Asian',
           'Population for whom poverty status is determined',
           'Native Hawaiian and Other Pacific Islander',
           'American Indian and Alaska Native',
          ]

suf2_ls = ['WHITE',
          'HISPANIC OR LATINO',
          'WHITE, NOT HISPANIC OR LATINO',
          'BLACK OR AFRICAN AMERICAN',
          'ASIAN',
          'ALL',
          'NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER',
          'AMERICAN INDIAN AND ALASKA NATIVE',
         ]

# df2 and main_df are both built by concatenating one frame per year in the
# same year order, so their rows are matched by position. Check the lengths
# and assign plain arrays so the result does not depend on the index labels
# the two frames happen to carry.
if len(df2) != len(main_df):
    raise ValueError('df2 has ' + str(len(df2)) + ' rows but main_df has '
                     + str(len(main_df)) + '; the years do not line up')

for i, suf in enumerate(suf1_ls):
    
    pvr = 100 * df2['Below poverty level!!Estimate!!' + suf].astype(float) / df2['Total!!Estimate!!' + suf].astype(float)
    
    main_df[suf2_ls[i] + ' - % Poverty'] = pvr.to_numpy()
    
main_df.head(main_df.shape[0])     

### FIGURE 3

In [None]:
# Figure 3: yearly poverty rate, DS over all income brackets, and DS over the
# brackets below $25K, one line per race/ethnicity group.

# Line colour per group (matplotlib grey levels, 'k' = black)
race_colors = {
    'BLACK OR AFRICAN AMERICAN': '0.8',
    'AMERICAN INDIAN AND ALASKA NATIVE': '0.8',
    'HISPANIC OR LATINO': '0.8',
    'ALL': 'k',
    'WHITE, NOT HISPANIC OR LATINO': '0.1',
    'ASIAN': '0.1',
}

# Line style per group
race_styles = {
    'BLACK OR AFRICAN AMERICAN': 'solid',
    'AMERICAN INDIAN AND ALASKA NATIVE': 'dashed',
    'HISPANIC OR LATINO': 'dotted',
    'ALL': 'dotted',
    'WHITE, NOT HISPANIC OR LATINO': 'solid',
    'ASIAN': 'dashed',
}

# Legend text per group
race_labels = {
    'HISPANIC OR LATINO': 'Hispanic or Latinx',
    'WHITE, NOT HISPANIC OR LATINO': 'White, non-hispanic/latinx',
    'BLACK OR AFRICAN AMERICAN': 'Black or African American',
    'ASIAN': 'Asian',
    'ALL': 'All',
    'NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER': 'Native Hawaiian or Pacific Islander',
    'AMERICAN INDIAN AND ALASKA NATIVE': 'Native American or Alaska Native',
}

# Groups drawn, in plotting order ('ALL' and 'NATIVE HAWAIIAN AND OTHER
# PACIFIC ISLANDER' are not plotted)
race_eth = ['AMERICAN INDIAN AND ALASKA NATIVE',
            'BLACK OR AFRICAN AMERICAN',
            'HISPANIC OR LATINO',
            'WHITE, NOT HISPANIC OR LATINO',
            'ASIAN',
           ]

# Tick labels: every second year is left blank
years = ['2010', '', '2012', '', '2014', 
         '', '2016', '', '2018', '', '2020',
         '', '2022',
        ]

fig, axs = plt.subplots(1, 3, figsize=(13, 4))

# (main_df column template, y-axis label, whether the lines feed the legend)
fig3_panels = [
    ('{} - % Poverty', 'US Poverty Rate (%)', True),
    ('{}_DS', 'US Poverty (DS)', False),
    ('{}_DS_lower', 'Poverty (DS) of families\nmaking <$25K/year', False),
]

for c, (column_template, y_label, in_legend) in enumerate(fig3_panels):
    panel = axs[c]
    for race in race_eth:
        line_kwargs = {
            'color': race_colors[race],
            'linewidth': 3,
            'linestyle': race_styles[race],
        }
        if in_legend:
            # Only the first panel's lines carry labels, so the shared
            # figure legend lists each group once.
            line_kwargs['label'] = race_labels[race]
        panel.plot(main_df['Year'], main_df[column_template.format(race)], **line_kwargs)

    panel.set_xlabel('Year', fontsize=14, fontweight='bold')
    panel.set_ylabel(y_label, fontsize=14, fontweight='bold')
    panel.tick_params(axis='x', rotation=45)
    panel.set_xticklabels(years)


fig.patch.set_facecolor('white')
plt.tight_layout()
fig.legend(bbox_to_anchor=(0.055, 1.05, 0.937, .1), 
           loc=10, 
           ncol=3, 
           mode="expand",
           prop={'size':14},
          )

plt.subplots_adjust(wspace=0.4, hspace=0.4)
plt.savefig('Fig3.jpg', bbox_inches='tight', format='jpg', dpi=600)
plt.close()


### FIGURE 10

### DS for lower, middle, upper classes


In [None]:

# Figure 10: DS within the lower, middle and upper income brackets, one line
# per race/ethnicity group. Uses race_eth, race_labels, race_colors,
# race_styles and years from the Figure 3 cell.
fig, axs = plt.subplots(1, 3, figsize=(10, 4))

# (main_df column suffix, y-axis label, panel title, plot 1 - DS instead of DS)
fig10_panels = [
    ('_DS_lower', 'Poverty, DS', '<$25K', False),
    # Raw string: '\$' is not a valid escape sequence in an ordinary literal.
    # The text passed to matplotlib is unchanged.
    ('_DS_middle', 'Poverty, DS', r'\$25K to $100K', False),
    ('_DS_upper', 'Affluency, 1 - DS', '>$100K', True),
]

for c, (ds_suffix, y_label, panel_title, invert) in enumerate(fig10_panels):
    for race in race_eth:
        values = main_df[f'{race}{ds_suffix}']
        if invert:
            values = 1 - values

        line_kwargs = {
            'color': race_colors[race],
            'linewidth': 3,
            'linestyle': race_styles[race],
        }
        if c == 0:
            # Only the first panel's lines carry labels, so the shared
            # figure legend lists each group once.
            line_kwargs['label'] = race_labels[race]
        axs[c].plot(main_df['Year'], values, **line_kwargs)

    axs[c].set_xlabel('Year', fontsize=14, fontweight='bold')
    axs[c].set_ylabel(y_label, fontsize=14, fontweight='bold')
    axs[c].tick_params(axis='x', rotation=45) 
    axs[c].set_title(panel_title, fontsize=14, fontweight='bold')
    axs[c].set_xticklabels(years) 


plt.tight_layout()
fig.legend(bbox_to_anchor=(0.0725, 1.05, 0.915, .1), 
           loc=10, 
           ncol=3, 
           mode="expand",
           prop={'size':12},
          )

fig.patch.set_facecolor('white')
plt.subplots_adjust(wspace=0.4, hspace=0.4)
plt.savefig('Fig10.jpg', bbox_inches='tight', format='jpg', dpi=600)
plt.close()


##  GLOBAL FOOD COMMODITIES

## United Nations FAO data: Production of 'Crops and livestock products' in 2022

In [None]:
# Load the FAOSTAT export, discard the columns this analysis does not use,
# and keep only the rows whose unit is 't'.
fao_raw = pd.read_csv('data/economic/FAO/FAOSTAT_data_en_2-23-2024.csv')
fao_trimmed = fao_raw.drop(columns=['Domain', 'Element', 'Year'])
df = fao_trimmed.loc[fao_trimmed['Unit'] == 't']

# Number of distinct items and areas (a missing value counts as one)
for column in ('Item', 'Area'):
    print(df[column].nunique(dropna=False))

print(df.shape)
df.head()

In [None]:
def gini_coefficient(x):
    """
    Compute Gini coefficient of array of values
    From: https://stackoverflow.com/questions/39512260/calculating-gini-coefficient-in-python-numpy

    Parameters
    ----------
    x : array-like of numbers
        Any sequence NumPy can convert to a float array (list, tuple,
        ndarray, pandas Series). It is flattened to one dimension.

    Returns
    -------
    float
        Sum of |x_i - x_j| over all unordered pairs, divided by
        (n**2 * mean(x)). NaN when `x` is empty or its mean is zero, where
        the ratio is undefined.
    """
    values = np.asarray(x, dtype=float).ravel()
    n = values.size
    if n == 0:
        return np.nan

    mean = values.mean()
    if mean == 0:
        # 0/0: report NaN explicitly rather than through a runtime warning
        return np.nan

    # Each unordered pair is visited once: element i against everything after it
    diffsum = 0.0
    for i, xi in enumerate(values[:-1], 1):
        diffsum += np.sum(np.abs(xi - values[i:]))
    return diffsum / (n**2 * mean)


# For every FAO item, compute the Gini coefficient, DS and NSECF of its
# production values. The four lists are parallel: position k in each refers
# to the same item.
gini_ls = []
ds_ls1 = []
ds_ls2 = []
food_ls = []

foods = df['Item'].unique().tolist()

for f in foods:
    # Keep only strictly positive production values for this item
    tdf = df[(df['Item'] == f) & (df['Value'] > 0)]

    if tdf.shape[0] == 0:
        continue

    # Record the item only once it is known to contribute, so that food_ls
    # stays aligned with gini_ls, ds_ls1 and ds_ls2.
    food_ls.append(f)

    vals = tdf['Value'].astype('float')
    vals = np.array(sorted(list(vals), reverse=True))

    # Gini coefficient of the square-root transformed values
    gini_ls.append(gini_coefficient(np.sqrt(vals)))

    # Distributional shift (DS)
    # 1. Convert the values to logarithmic scale (base 2)
    abundances = np.log2(vals).tolist()

    # 2. Unit-width bins from 0 up to the largest log2 value.
    #    NOTE(review): values below 1 have a negative log2 and therefore fall
    #    outside these bins, so np.histogram does not count them -- confirm
    #    that this is intended.
    min_abundance = 0
    max_abundance = np.ceil(max(abundances))
    bins = np.arange(min_abundance, max_abundance + 1, 1)

    # 3. Compute the histogram and convert the counts to a list
    hist, bin_edges = np.histogram(abundances, bins=bins)
    bin_heights = hist.tolist()

    # 4. Calculate DS
    ds = DS(bin_heights)
    ds_ls1.append(ds)

    # 5. Normalized sums of exponentiated cumulative frequencies
    nsecf = NSECF(bin_heights)
    ds_ls2.append(nsecf)

In [None]:


def obs_pred_rsquare(obs, pred):
    '''
    Determines the proportion of variability in a data set accounted for by a model
    In other words, this determines the proportion of variation explained by the 1:1 line
    in an observed-predicted plot.
    
    Used in various peer-reviewed publications:
        1. Locey, K.J. and White, E.P., 2013. How species richness and total abundance 
        constrain the distribution of abundance. Ecology letters, 16(9), pp.1177-1185.
        2. Xiao, X., McGlinn, D.J. and White, E.P., 2015. A strong test of the maximum 
        entropy theory of ecology. The American Naturalist, 185(3), pp.E70-E80.
        3. Baldridge, E., Harris, D.J., Xiao, X. and White, E.P., 2016. An extensive 
        comparison of species-abundance distribution models. PeerJ, 4, p.e2823.

    Parameters
    ----------
    obs : array-like of numbers
        Observed values.
    pred : array-like of numbers
        Predicted values, in the same order as `obs`.

    Returns
    -------
    float
        1 - sum((obs - pred)**2) / sum((obs - mean(obs))**2). It can be
        negative when the predictions fit worse than the mean of `obs`.
    '''
    # Flatten both inputs so that lists are accepted and an (n, 1) column
    # paired with an (n,) vector is compared element by element instead of
    # being broadcast to an n x n matrix.
    obs = np.asarray(obs, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float).ravel()

    r2 = 1 - np.sum((obs - pred) ** 2) / np.sum((obs - np.mean(obs)) ** 2)
    return r2

# Regress DS on the Gini coefficient across food items with ordinary least
# squares, print the fitted equation and fit statistics, and split the points
# by whether they fall inside the 95% prediction interval.

y_o = np.array(ds_ls1)
x_o = np.array(gini_ls)

# Order the (Gini, DS) pairs by Gini so that lines and bands plot cleanly
x_o, y_o = zip(*sorted(zip(x_o, y_o)))
x_o = np.array(x_o)
y_o = np.array(y_o)

# Column vectors for the design matrix; x_o is already sorted, so no second
# sort is needed
x = x_o[:, np.newaxis]
y = y_o[:, np.newaxis]

# Polynomial degree of the fit (1 = straight line)
degree = 1
polynomial_features = PolynomialFeatures(degree=degree)
xp = polynomial_features.fit_transform(x)

model = sm.OLS(y, xp).fit()
ypred = model.predict(xp).tolist()

# Coefficients of the x terms, highest power first; params[0] is the intercept
poly_coefs = model.params[1:].tolist()
poly_coefs.reverse()

poly_exponents = list(range(1, len(poly_coefs)+1))
poly_exponents.reverse()

power_labels = {1: 'x', 2: 'x²', 3: 'x³'}


def _signed(value):
    """Format value as ' + <|v|>' or ' - <|v|>'; the sign is taken before rounding."""
    sign = ' + ' if value >= 0 else ' - '
    return sign + str(abs(round(value, 4)))


eqn = 'y = '
for i, p in enumerate(poly_coefs):
    power = poly_exponents[i]
    term = power_labels.get(power, 'x^' + str(power))

    if i == 0:
        eqn = eqn + str(round(p, 4)) + term
    else:
        eqn = eqn + _signed(p) + term

b = model.params[0]
eqn = eqn + _signed(b)
b = round(b, 4)

print(eqn)


# Flatten the observations so they compare element-wise with the predictions
y = y.flatten().tolist()

op_r2 = obs_pred_rsquare(np.array(y), np.array(ypred))
op_r2 = round(op_r2, 4)

# A negative value is reported as 0
if op_r2 < 0:
    op_r2 = 0
    
r2 = round(model.rsquared, 4)
r2_adj = round(model.rsquared_adj, 4)
print(r2, r2_adj, op_r2)

st, data, ss2 = summary_table(model, alpha=0.05)
predict_mean_ci_low, predict_mean_ci_upp = data[:, 4:6].T # confidence interval
predict_ci_low, predict_ci_upp = data[:, 6:8].T # prediction interval

# Points outside the prediction interval are kept separately
is_outlier = (y_o > predict_ci_upp) | (y_o < predict_ci_low)

outlier_y = list(y_o[is_outlier])
outlier_x = list(x_o[is_outlier])
nonoutlier_y = list(y_o[~is_outlier])
nonoutlier_x = list(x_o[~is_outlier])
                
obs_pred_r2 = obs_pred_rsquare(y_o, np.array(ypred))
obs_pred_r2 = str(np.round(obs_pred_r2, 3))

print(obs_pred_r2)

In [None]:
# Figure 4: DS against the Gini coefficient for food items, with the fitted
# line, the 95% prediction band (lighter) and the 95% confidence band (darker).
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(1, 1, 1)

r2_label = r'$r^{2}$' + ' = ' + obs_pred_r2
ax.plot(x_o, ypred, c='0.5', label=r2_label)

# Wider band first so the narrower one is drawn over it
interval_bands = [
    (predict_ci_upp, predict_ci_low, 0.1),
    (predict_mean_ci_upp, predict_mean_ci_low, 0.2),
]
for upper, lower, band_alpha in interval_bands:
    ax.fill_between(x_o, upper, lower, color='k', alpha=band_alpha, linewidths=0)

# Points inside and outside the prediction interval share one marker style
for xs, ys in ((nonoutlier_x, nonoutlier_y), (outlier_x, outlier_y)):
    ax.scatter(xs, ys, s=5, c='k')

ax.set_xlabel('Inequality, (Gini Index)', fontsize= 14)
ax.set_ylabel('Scarcity, (DS)', fontsize= 14)

ax.set_ylim(-0.005, .6)
ax.legend()
fig.patch.set_facecolor('white')
fig.subplots_adjust(hspace=0.35, wspace=0.4)
fig.savefig('Fig4.jpg', bbox_inches='tight', format='jpg', dpi=600)
plt.close(fig)