In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm, pearsonr

  from pandas.core import (


In [2]:
# From "Comparing correlated correlation coefficients" - Meng, Rosenthal, Rubin
def compare_correlations(Y, X1, X2):
    """Test whether X1 and X2 are equally correlated with the same outcome Y.

    Implements the Z test for two dependent (overlapping) Pearson
    correlations: r(Y, X1) versus r(Y, X2), accounting for r(X1, X2).

    Parameters
    ----------
    Y, X1, X2 : sequences of numbers with equal length
        Paired observations; element i of each refers to the same sample.

    Returns
    -------
    Z : float
        Test statistic; positive when the Fisher-transformed r(Y, X1)
        exceeds that of r(Y, X2).
    p_value : float
        Two-tailed p-value of Z under the standard normal distribution.

    Notes
    -----
    Degenerate inputs are not guarded: N <= 3, |r(Y, X)| == 1, or
    r(X1, X2) == 1 lead to divisions by zero / infinite Fisher transforms
    and a non-finite or meaningless result.
    """
    # number of observations
    N = len(Y)

    # correlation coefficients
    r_yx1, _ = pearsonr(Y, X1)
    r_yx2, _ = pearsonr(Y, X2)
    r_x1x2, _ = pearsonr(X1, X2)

    # avg r2
    r2_avg = (r_yx1**2 + r_yx2**2) / 2

    # f is capped at 1
    f = np.minimum(1.0, (1 - r_x1x2) / (2 * (1 - r2_avg)))

    # h
    h = (1 - f * r2_avg) / (1 - r2_avg)

    # Fisher Z-transform
    z_yx1 = np.arctanh(r_yx1)
    z_yx2 = np.arctanh(r_yx2)

    # Z score
    Z = (z_yx1 - z_yx2) * np.sqrt((N - 3) / (2 * (1 - r_x1x2) * h))

    # Two-tailed p-value. Use the survival function rather than 1 - cdf:
    # for large |Z| the cdf rounds to exactly 1.0 and 1 - cdf collapses to 0.
    p_value = 2 * norm.sf(abs(Z))

    return Z, p_value

In [3]:
# For each strain, compare per-species prediction performance of the gLV and
# HOI models on the k-fold results and write one summary table per strain.
for strain in ['CDanc', 'CDevo']:

    # import k-fold results from gLV model
    kfold_df = pd.read_csv(f"kfold/{strain}_20_fold.csv")

    # import k-fold results from HOI model
    kfold_df_3 = pd.read_csv(f"kfold3/{strain}_kfold3.csv")

    # names of species in communities
    species = np.unique(kfold_df.species.values)

    # gLV: species label, measured and predicted value per row
    all_pred_species = kfold_df['species'].values
    all_true = kfold_df['true'].values
    all_pred = kfold_df['pred'].values

    # HOI: species label, measured and predicted value per row
    all_pred_species_3 = kfold_df_3['species'].values
    all_true_3 = kfold_df_3['true'].values
    all_pred_3 = kfold_df_3['pred'].values

    # prediction performance of individual species
    glv_pearsons = []
    hoi_pearsons = []
    z_values = []
    p_values = []
    for sp in species:

        # gLV
        sp_inds = all_pred_species == sp
        measured = all_true[sp_inds]
        predicted = all_pred[sp_inds]

        # HOI: select rows with the HOI table's own species column rather
        # than reusing the gLV mask, which is only valid if both files have
        # identical length and row order
        sp_inds_3 = all_pred_species_3 == sp
        measured_3 = all_true_3[sp_inds_3]
        predicted_3 = all_pred_3[sp_inds_3]

        # The comparison is only meaningful if both models were evaluated on
        # the same measurements in the same order; check before computing
        # anything. array_equal also returns False on a shape mismatch.
        assert np.array_equal(measured, measured_3), "measured values must be the same!"

        glv_pearsons.append(pearsonr(measured, predicted)[0])
        hoi_pearsons.append(pearsonr(measured_3, predicted_3)[0])

        # compare correlations
        z, p = compare_correlations(measured, predicted, predicted_3)
        z_values.append(z)
        p_values.append(p)

    # save to .csv
    compare_df = pd.DataFrame({
        'Species': species,
        'gLV Pearson': glv_pearsons,
        'HOI Pearson': hoi_pearsons,
        'Z statistic': z_values,
        'p-value': p_values,
    })
    compare_df.to_csv(f"tables/{strain}_HOI_performance.csv", index=False)