In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress
from scipy.stats import multivariate_normal as mvn

from cr.mcr import *

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse

# Global matplotlib defaults applied to every figure created in this notebook
params = {'legend.fontsize': 18,
          'figure.figsize': (16, 12),
          'lines.linewidth': 4,
          'axes.labelsize': 24,
          'axes.titlesize':24,
          'axes.linewidth':5,
          'xtick.labelsize':20,
          'ytick.labelsize':20}
plt.rcParams.update(params)

# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in matplotlib 3.6
# (the old names were deprecated and later removed). Use whichever name this
# installation provides; if neither exists the default color cycle is kept.
for style_name in ('seaborn-v0_8-colorblind', 'seaborn-colorblind'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Type 42 (TrueType) fonts in saved PDFs instead of the default Type 3
plt.rcParams['pdf.fonttype'] = 42

# Seed NumPy's global RNG so the random training subsets drawn later are reproducible
np.random.seed(123)

# Set parameters of script

In [None]:
# Number of independent random training subsets drawn for each training-set size
n_trials = 10

# Import data

In [None]:
# Read the full dataset; the path is relative to the notebook's working directory
df = pd.read_csv("Data/2021_02_19_MultifunctionalDynamicData.csv")

# Experiment label of every row, plus the distinct labels among them
all_experiments = df["Experiments"].values
unique_experiments = np.unique(all_experiments)

# Count of distinct experiments
n_conditions = unique_experiments.size

# Preview the first rows
df.head()

### Define which variables are species and which are metabolites

In [None]:
# Split the column names by position: the last four columns are treated as
# metabolites, and everything after the first two columns up to those is species
column_names = df.columns.values
metabolites = column_names[-4:]
species = column_names[2:-4]

# No control variables are defined here
controls = []

# All modeled variables, species first and then metabolites
system_variables = np.concatenate([species, metabolites])
system_variables

In [None]:
# For each training-set size, repeatedly fit the model on a random subset of
# experiments, predict every held-out experiment, and save the predictions.

# Create the output directory up front so a finished fit is never lost at save time
out_dir = "Fig2/Bootstrap"
os.makedirs(out_dir, exist_ok=True)

# for each number of samples to train on
for n_train in [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70]:

    # repeat n_trials times with an independent random train/test partition each time
    # (repeated random subsampling, not a k-fold split)
    for k in range(n_trials):

        # random set of n_train experiments for training, drawn without replacement;
        # np.random.choice raises ValueError if n_train exceeds the number of experiments
        train_exps = np.random.choice(unique_experiments, n_train, replace=False)
        # boolean row mask: True where the row belongs to a training experiment
        train_inds = np.isin(all_experiments, train_exps)

        # pull train and test dataframes
        train_df = df.iloc[train_inds].copy()
        test_df  = df.iloc[~train_inds].copy()

        # per-metabolite scale = maximum over the training rows only, computed once.
        # A maximum of exactly 0 would give 0/0 = NaN, so such columns are left unscaled.
        scale = train_df[metabolites].max()
        scale = scale.where(scale != 0, 1.0)

        # scale metabolites in both sets by the training-data maxima
        train_df_scaled = train_df.copy()
        test_df_scaled  = test_df.copy()
        train_df_scaled[metabolites] = train_df[metabolites] / scale
        test_df_scaled[metabolites]  = test_df[metabolites] / scale

        # init model
        model = CR(dataframe=train_df_scaled, species=species, resources=metabolites, r0=1.)

        # fit to data
        model.fit(lr=1e-1, map_tol=1e-3, evd_tol=1e-3)

        # make predictions on the held-out experiments
        df_boot_cr = model.predict_df(test_df_scaled, species, metabolites)

        # scale back resources: undo the normalization on the true, predicted and
        # standard-deviation columns of every metabolite
        for variable in metabolites:
            for suffix in (" true", " pred", " stdv"):
                df_boot_cr[variable + suffix] *= scale[variable]

        # save this trial's held-out predictions
        df_boot_cr.to_csv(f"{out_dir}/MCR_{n_train}samples_trial_{k+1}.csv", index=False)