In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from scipy.stats import norm, linregress

import os
import time
import itertools

from glove.model3 import *

from sklearn.model_selection import KFold

  from pandas.core import (


In [2]:
# number of folds used for cross-validation
n_splits = 20

# directory holding the data files
data_dir = "data/SET3_Thirdtrial/"

# import file names
# os.listdir returns entries in arbitrary order and also lists hidden entries and
# sub-directories (e.g. .DS_Store, .ipynb_checkpoints); sort for a reproducible run
# order and keep only regular, non-hidden files so every entry can be read as data
files = sorted(
    name
    for name in os.listdir(data_dir)
    if not name.startswith(".") and os.path.isfile(os.path.join(data_dir, name))
)

# fit gLV models

In [3]:
def predict_df(df, species, glv_model=None):
    """Predict every trajectory in ``df`` and pair predictions with measurements.

    ``process_df(df, species)`` is iterated as ``(exp, t_span, Y_m)`` triples. For each
    triple the model predicts from ``(Y_m, t_span)``; the first row of ``Y_m`` is only
    used to decide which species were present (value > 0) and every later row is
    compared against the prediction (presumably row 0 is the initial condition --
    confirm against ``process_df``).

    Parameters
    ----------
    df : DataFrame passed straight to ``process_df``.
    species : sequence of species names; its order must match the columns of ``Y_m``.
    glv_model : optional object exposing ``predict(Y_m, t_span) -> (mean, stdv)``.
        When omitted, the notebook-level variable ``model`` is used, so existing
        two-argument calls keep working.

    Returns
    -------
    exp_names, pred_species, true, pred, stdv : five 1-D arrays of equal length, one
        entry per (later time point, present species) pair, ordered time-major within
        each experiment. All five are empty when ``process_df`` yields nothing.
    """
    # fall back to the notebook-level `model` for backward compatibility
    fitted = model if glv_model is None else glv_model

    # plain array of names so a boolean mask can select the present species
    species_names = np.array(list(species))

    # save measured and predicted values
    exp_names = []
    pred_species = []
    pred = []
    stdv = []
    true = []

    # loop over each experiment in the processed data
    for exp, t_span, Y_m in process_df(df, species):

        # predict
        Y_p, Y_std = fitted.predict(Y_m, t_span)

        # set NaN to zero
        Y_p = np.nan_to_num(Y_p)
        Y_std = np.nan_to_num(Y_std)

        ### prediction results for species that were present ###
        inds_present = np.asarray(Y_m[0] > 0)
        n_present = int(inds_present.sum())
        n_steps = Y_m.shape[0] - 1  # rows after the first one

        # labels follow the same time-major order as the raveled value blocks below:
        # all present species at step 1, then all present species at step 2, ...
        exp_names.append([exp] * (n_present * n_steps))
        pred_species.append(np.tile(species_names[inds_present], n_steps))
        true.append(Y_m[1:, inds_present].ravel())
        pred.append(Y_p[1:, inds_present].ravel())
        stdv.append(Y_std[1:, inds_present].ravel())

    # np.concatenate raises on an empty list, so handle "nothing to predict" explicitly
    if not true:
        return (np.array([], dtype=str), np.array([], dtype=str),
                np.array([], dtype=float), np.array([], dtype=float),
                np.array([], dtype=float))

    # concatenate list
    exp_names = np.concatenate(exp_names)
    pred_species = np.concatenate(pred_species)
    true = np.concatenate(true)
    pred = np.concatenate(pred)
    stdv = np.concatenate(stdv)

    return exp_names, pred_species, true, pred, stdv

In [None]:
# run kfold for each file

# create the output folders up front so a missing directory is caught before the
# first (expensive) model fit rather than after it
os.makedirs("kfold", exist_ok=True)
os.makedirs("figures", exist_ok=True)

for file in files:
    # the part of the file name before the first underscore labels all outputs
    strain = file.split("_")[0]

    # import data, ordered so each treatment's rows are contiguous and in time order
    df = pd.read_csv(f"data/SET3_Thirdtrial/{file}")
    df = df.sort_values(by=["Treatments", "Time"])

    # keep only conditions that have more than one row
    dfs = [df_t for _, df_t in df.groupby("Treatments") if df_t.shape[0] > 1]
    df = pd.concat(dfs)

    # determine species names (every column after the first two)
    species = df.columns.values[2:]

    # separate mono culture data
    mono_dfs = []
    dfs = []
    treatments = []
    for treatment, df_i in df.groupby("Treatments"):
        # hyphen is only in community conditions
        if "-" in treatment:
            dfs.append(df_i)
            # save treatment name without the replicate identifier
            treatments.append([treatment.split("_")[0]] * df_i.shape[0])
        else:
            mono_dfs.append(df_i)
    # one label per row of the community dataframe, in the same row order
    treatments = np.concatenate(treatments)
    unique_treatments = np.unique(treatments)
    mono_df = pd.concat(mono_dfs)
    df = pd.concat(dfs)

    # init kfold object; folds are drawn over unique treatments so that all
    # replicates of a treatment land on the same side of the split
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=21)

    # keep track of all predictions
    all_exp_names = np.array([], dtype=str)
    all_pred_species = np.array([], dtype=str)
    all_true = np.array([], dtype=float)
    all_pred = np.array([], dtype=float)
    all_stdv = np.array([], dtype=float)

    # run Kfold
    for train_index, test_index in kf.split(unique_treatments):

        # get train df: mono culture data is always included in training
        train_inds = np.isin(treatments, unique_treatments[train_index])
        train_df = pd.concat((mono_df, df.iloc[train_inds].copy()))

        # average replicates in the test_df
        test_dfs = []
        for test_treatment in unique_treatments[test_index]:
            # pull dataframe with all replicates of same test treatment
            df_treatment = df.iloc[np.isin(treatments, test_treatment)]

            # mean of every species at each unique measurement time
            avg_df = df_treatment.groupby("Time")[list(species)].mean().reset_index()
            avg_df.insert(0, "Treatments", test_treatment)
            test_dfs.append(avg_df)

        # combine averaged dataframes for test dataframe
        test_df = pd.concat(test_dfs)

        # instantiate gLV fit; `model` stays a notebook-level name because
        # predict_df reads it when no model is passed explicitly
        model = gLV(species, train_df)

        # fit to data
        model.fit()

        # predict the held-out treatments
        exp_names, pred_species, true, pred, stdv = predict_df(test_df, species)

        # append predictions
        all_exp_names = np.append(all_exp_names, exp_names)
        all_pred_species = np.append(all_pred_species, pred_species)
        all_true = np.append(all_true, true)
        all_pred = np.append(all_pred, pred)
        all_stdv = np.append(all_stdv, stdv)

        # save prediction results to a .csv; rewritten after every fold so partial
        # results are kept if a later fold fails
        kfold_df = pd.DataFrame({
            'Treatments': all_exp_names,
            'species': all_pred_species,
            'true': all_true,
            'pred': all_pred,
            'stdv': all_stdv,
        })
        kfold_df.to_csv(f"kfold/{strain}_{n_splits}_fold_3.csv", index=False)

    # show prediction performance of individual species on a fresh figure per
    # strain, so points from a previous file can never leak into this plot
    fig, ax = plt.subplots()
    for sp in species:
        sp_inds = all_pred_species == sp
        n_points = int(np.sum(sp_inds))
        if n_points == 0:
            # species never appeared in a held-out community: nothing to plot
            continue

        sp_true = all_true[sp_inds]
        sp_pred = all_pred[sp_inds]

        # linregress needs at least two points and non-identical x values
        if n_points >= 2 and np.ptp(sp_true) > 0:
            R = linregress(sp_true, sp_pred).rvalue
        else:
            R = np.nan

        ax.scatter(sp_true, sp_pred, label=f"{sp} " + "R={:.3f}".format(R))
        ax.errorbar(sp_true, sp_pred, yerr=all_stdv[sp_inds],
                    fmt='.', capsize=3)

    ax.set_xlabel("Measured OD")
    ax.set_ylabel("Predicted OD")
    ax.legend()
    ax.set_title(strain)
    fig.savefig(f"figures/{strain}_{n_splits}_fold_3.pdf", dpi=300)
    plt.show()
    # release the figure so memory does not grow with the number of files
    plt.close(fig)

Total samples: 258, Initial regularization: 1.00e-03
Loss: 29.841, Residuals: -0.336
Loss: 15.817, Residuals: 0.189
Loss: 12.964, Residuals: 0.130
Loss: 8.996, Residuals: 0.081
Loss: 6.676, Residuals: 0.055
Loss: 6.154, Residuals: 0.042
Loss: 5.757, Residuals: 0.034
Loss: 5.264, Residuals: 0.039
Loss: 4.945, Residuals: 0.034
Loss: 4.913, Residuals: 0.034
Loss: 4.865, Residuals: 0.036
Loss: 4.804, Residuals: 0.037
Loss: 4.689, Residuals: 0.033
Loss: 4.485, Residuals: 0.026
Loss: 4.451, Residuals: 0.041
Loss: 4.384, Residuals: 0.037
Loss: 4.265, Residuals: 0.029
Loss: 4.230, Residuals: 0.034
Loss: 4.164, Residuals: 0.030
Loss: 4.054, Residuals: 0.023
Loss: 4.051, Residuals: 0.025
Loss: 4.010, Residuals: 0.027
Loss: 4.002, Residuals: 0.029
Loss: 3.935, Residuals: 0.025
Loss: 3.933, Residuals: 0.027
Loss: 3.854, Residuals: 0.021
Loss: 3.852, Residuals: 0.021
Loss: 3.851, Residuals: 0.024
Loss: 3.807, Residuals: 0.021
Loss: 3.807, Residuals: 0.021
Optimization terminated successfully.
Evide

Loss: 966.796, Residuals: 0.018
Loss: 965.252, Residuals: 0.011
Optimization terminated successfully.
Evidence 4696.421
Updating hyper-parameters...
Total samples: 254, Updated regularization: 3.52e-01
Loss: 1004.035, Residuals: 0.006
Loss: 995.428, Residuals: 0.008
Loss: 983.884, Residuals: 0.012
Loss: 983.727, Residuals: 0.012
Optimization terminated successfully.
Evidence 4728.230
Updating hyper-parameters...
Total samples: 254, Updated regularization: 3.50e-01
Loss: 1007.575, Residuals: 0.011
Loss: 991.850, Residuals: 0.012
Loss: 991.688, Residuals: 0.013
Optimization terminated successfully.
Evidence 4749.910
Updating hyper-parameters...
Total samples: 254, Updated regularization: 3.42e-01
Loss: 992.884, Residuals: 0.015
Loss: 992.209, Residuals: 0.013
Optimization terminated successfully.
Evidence 4767.152
Updating hyper-parameters...
Total samples: 254, Updated regularization: 3.19e-01
Loss: 1008.099, Residuals: 0.014
Loss: 1003.623, Residuals: 0.014
Loss: 1002.573, Residuals: 0

Total samples: 257, Updated regularization: 3.69e-01
Loss: 1022.474, Residuals: 0.012
Loss: 1014.124, Residuals: 0.012
Loss: 1009.518, Residuals: 0.013
Loss: 1001.819, Residuals: 0.014
Loss: 1001.714, Residuals: 0.015
Optimization terminated successfully.
Evidence 4854.806
Updating hyper-parameters...
Total samples: 257, Updated regularization: 3.51e-01
Loss: 1004.105, Residuals: 0.012
Loss: 1003.945, Residuals: 0.012
Optimization terminated successfully.
Evidence 4875.853
Updating hyper-parameters...
Total samples: 257, Updated regularization: 2.85e-01
Loss: 1020.871, Residuals: 0.013
Optimization terminated successfully.
Evidence 4884.240
Updating hyper-parameters...
Total samples: 257, Updated regularization: 2.81e-01
Loss: 1019.930, Residuals: 0.013
Loss: 1019.721, Residuals: 0.013
Optimization terminated successfully.
Evidence 4887.959
Pass count  1
Total samples: 258, Initial regularization: 1.00e-03
Loss: 30.203, Residuals: -0.322
Loss: 16.522, Residuals: 0.196
Loss: 13.572, Res

Loss: 4.782, Residuals: 0.027
Loss: 4.714, Residuals: 0.023
Loss: 4.711, Residuals: 0.023
Loss: 4.605, Residuals: 0.018
Loss: 4.604, Residuals: 0.018
Loss: 4.602, Residuals: 0.019
Loss: 4.599, Residuals: 0.020
Loss: 4.572, Residuals: 0.019
Loss: 4.566, Residuals: 0.021
Loss: 4.519, Residuals: 0.018
Loss: 4.519, Residuals: 0.020
Loss: 4.501, Residuals: 0.019
Loss: 4.496, Residuals: 0.019
Loss: 4.485, Residuals: 0.018
Loss: 4.466, Residuals: 0.017
Loss: 4.466, Residuals: 0.017
Optimization terminated successfully.
Evidence -79.931
Updating hyper-parameters...
Total samples: 256, Updated regularization: 4.15e-01
Loss: 189.165, Residuals: 0.012
Optimization terminated successfully.
Evidence 3551.984
Updating hyper-parameters...
Total samples: 256, Updated regularization: 3.43e-01
Loss: 773.774, Residuals: 0.013
Optimization terminated successfully.
Evidence 4670.451
Updating hyper-parameters...
Total samples: 256, Updated regularization: 3.82e-01
Loss: 976.961, Residuals: 0.011
Loss: 972.9

Evidence 4862.486
Updating hyper-parameters...
Total samples: 258, Updated regularization: 3.43e-01
Loss: 1025.920, Residuals: 0.009
Loss: 1016.448, Residuals: 0.010
Loss: 1014.947, Residuals: 0.010
Loss: 1002.829, Residuals: 0.010
Loss: 1002.790, Residuals: 0.010
Optimization terminated successfully.
Evidence 4892.383
Updating hyper-parameters...
Total samples: 258, Updated regularization: 3.29e-01
Loss: 1023.040, Residuals: 0.011
Loss: 1019.197, Residuals: 0.011
Loss: 1018.882, Residuals: 0.011
Loss: 1007.553, Residuals: 0.011
Loss: 1007.499, Residuals: 0.010
Optimization terminated successfully.
Evidence 4913.375
Updating hyper-parameters...
Total samples: 258, Updated regularization: 3.06e-01
Loss: 1018.851, Residuals: 0.011
Loss: 1018.507, Residuals: 0.012
Optimization terminated successfully.
Evidence 4920.991
Updating hyper-parameters...
Total samples: 258, Updated regularization: 3.04e-01
Loss: 1014.813, Residuals: 0.011
Loss: 1014.137, Residuals: 0.013
Optimization terminated 

Evidence 5003.129
Updating hyper-parameters...
Total samples: 256, Updated regularization: 2.42e-01
Loss: 1021.053, Residuals: 0.013
Loss: 1016.980, Residuals: 0.012
Loss: 1016.978, Residuals: 0.012
Optimization terminated successfully.
Evidence 5008.084
Pass count  1
Total samples: 260, Initial regularization: 1.00e-03
Loss: 30.143, Residuals: -0.316
Loss: 17.467, Residuals: 0.210
Loss: 14.168, Residuals: 0.146
Loss: 9.743, Residuals: 0.083
Loss: 7.611, Residuals: 0.057
Loss: 7.171, Residuals: 0.035
Loss: 6.842, Residuals: 0.036
Loss: 6.373, Residuals: 0.042
Loss: 6.023, Residuals: 0.028
Loss: 6.002, Residuals: 0.036
Loss: 5.814, Residuals: 0.032
Loss: 5.498, Residuals: 0.024
Loss: 5.484, Residuals: 0.035
Loss: 5.356, Residuals: 0.035
Loss: 5.302, Residuals: 0.043
Loss: 5.203, Residuals: 0.038
Loss: 5.036, Residuals: 0.029
Loss: 5.028, Residuals: 0.036
Loss: 5.013, Residuals: 0.035
Loss: 4.885, Residuals: 0.027
Loss: 4.877, Residuals: 0.027
Loss: 4.863, Residuals: 0.028
Loss: 4.742, R

Loss: 4.380, Residuals: 0.024
Loss: 4.379, Residuals: 0.024
Loss: 4.379, Residuals: 0.024
Loss: 4.379, Residuals: 0.024
Loss: 4.379, Residuals: 0.024
Loss: 4.378, Residuals: 0.025
Loss: 4.378, Residuals: 0.025
Loss: 4.378, Residuals: 0.025
Loss: 4.378, Residuals: 0.025
Loss: 4.378, Residuals: 0.025
Loss: 4.378, Residuals: 0.025
Loss: 4.377, Residuals: 0.025
Loss: 4.377, Residuals: 0.025
Loss: 4.377, Residuals: 0.025
Loss: 4.377, Residuals: 0.025
Loss: 4.377, Residuals: 0.025
Loss: 4.376, Residuals: 0.025
Loss: 4.376, Residuals: 0.025
Loss: 4.376, Residuals: 0.025
Loss: 4.376, Residuals: 0.025
Loss: 4.376, Residuals: 0.025
Loss: 4.376, Residuals: 0.025
Loss: 4.376, Residuals: 0.025
Loss: 4.376, Residuals: 0.025
Optimization terminated successfully.
Evidence -80.738
Updating hyper-parameters...
Total samples: 260, Updated regularization: 2.94e-01
Loss: 193.133, Residuals: 0.017
Optimization terminated successfully.
Evidence 3598.097
Updating hyper-parameters...
Total samples: 260, Update

Loss: 757.006, Residuals: 0.008
Loss: 756.929, Residuals: 0.008
Optimization terminated successfully.
Evidence 4879.653
Updating hyper-parameters...
Total samples: 264, Updated regularization: 2.39e-01
Loss: 1019.270, Residuals: 0.009
Optimization terminated successfully.
Evidence 4966.606
Updating hyper-parameters...
Total samples: 264, Updated regularization: 2.43e-01
Loss: 1042.990, Residuals: 0.008
Loss: 1042.838, Residuals: 0.008
Optimization terminated successfully.
Evidence 4973.767
Updating hyper-parameters...
Total samples: 264, Updated regularization: 2.42e-01
Loss: 1048.578, Residuals: 0.007
Loss: 1044.136, Residuals: 0.007
Loss: 1036.270, Residuals: 0.008
Loss: 1036.118, Residuals: 0.008
Optimization terminated successfully.
Evidence 4989.008
Updating hyper-parameters...
Total samples: 264, Updated regularization: 2.38e-01
Loss: 1029.866, Residuals: 0.009
Loss: 1028.733, Residuals: 0.008
Optimization terminated successfully.
Evidence 5009.832
Updating hyper-parameters...
To