In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from scipy.stats import norm

import os
import time
import itertools

from glove.model import *

# Import data

In [2]:
# import file names
files = os.listdir("data/")
files = [f for f in files if "processed" in f and "passage" not in f]

# Design experiments for each strain

In [3]:
def gen_exp_name(species_names):
    exp_name = ""
    for s in species_names:
        exp_name += s
        exp_name += "-"
    return exp_name[:-1]

In [4]:
# define possible evaluation times 
t_eval = np.array([0, 24])

In [5]:
exp_names = []
for file in files:
    # import data
    df = pd.read_csv(f"data/{file}")

    # determine name of strain
    strain = file.split("_")[1]
    
    # make sure that experiment names are sorted alphabetically 
    exp_names = [gen_exp_name(np.sort(name.split("_")[0].replace(strain, "CD").split("-"))) for name in df.Treatments.values]
    df['Treatments'] = exp_names
    
    # determine species names 
    species = df.columns.values[2:]

    # instantiate gLV fit 
    model = gLV(species, df)

    # fit to data 
    model.fit()

    ### design experiment ###

    # create matrix of all possible communities
    dim = len(species)
    Xlist = [np.reshape(np.array(i), (1, dim)) for i in itertools.product([0, 1], repeat = dim)]
    # remove all zeros community
    X = np.array(np.concatenate(Xlist)[1:, :][::-1], float)
    # exclude mono cultures
    non_mono_inds = np.sum(X, 1) > 1
    X = X[non_mono_inds]

    # scale initial conditions 
    total_OD = .01 
    X = total_OD * np.einsum("ij,i->ij", X, 1/np.sum(X, 1))

    # generate design matrix 
    design_df = pd.DataFrame()
    for i,x in enumerate(X):
        exp_name = gen_exp_name(np.sort(species[x>0]))
        if exp_name not in exp_names:
            # eval time [0, 24]
            x_mat = np.empty([2, dim])
            x_mat[:] = np.nan
            x_mat[0] = x
            df_exp = pd.DataFrame()
            df_exp['Treatments'] = 2*[exp_name]
            df_exp['Time'] = t_eval
            df_exp[species] = x_mat
            design_df = pd.concat((design_df, df_exp))
            
    # remove samples that don't have C. diff
    CD_inds = np.in1d(design_df.Treatments.values, [exp_name for exp_name in np.unique(design_df.Treatments.values) if "CD" in exp_name])
    design_df = design_df.iloc[CD_inds].copy()
            
    # remove samples that have already been collected 
    prev_exp = np.unique(df.Treatments.values)
    dup_inds = np.in1d(design_df.Treatments.values, prev_exp)
    design_df = design_df.iloc[~dup_inds].copy()

    # determine best set of new experiments to collect 
    # N is the total number of measurements (corresponds to number of wells in 96 well plate)
    new_exp = model.design(design_df, N=16)
    exp_names += new_exp

    # new data to collect
    inds = np.in1d(design_df.Treatments.values, new_exp)
    new_df = design_df.iloc[inds].copy()

    # save design
    fname = f"designs/" + strain + "_design.csv"
    new_df.to_csv(fname, index=False)

No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


Total samples: 38, Initial regularization: 1.00e-03
Loss: 9.201, Residuals: -0.311
Loss: 5.629, Residuals: 0.134
Loss: 5.099, Residuals: 0.124
Loss: 4.127, Residuals: 0.090
Loss: 3.579, Residuals: 0.072
Loss: 2.831, Residuals: 0.007
Loss: 2.577, Residuals: 0.028
Loss: 2.387, Residuals: 0.021
Loss: 2.194, Residuals: -0.021
Loss: 2.160, Residuals: -0.013
Loss: 2.108, Residuals: -0.019
Loss: 2.107, Residuals: -0.009
Loss: 2.066, Residuals: -0.021
Loss: 2.065, Residuals: -0.016
Loss: 2.054, Residuals: -0.019
Loss: 2.034, Residuals: -0.025
Loss: 2.026, Residuals: -0.022
Loss: 2.013, Residuals: -0.027
Loss: 1.992, Residuals: -0.037
Loss: 1.992, Residuals: -0.038
Loss: 1.991, Residuals: -0.038
Loss: 1.990, Residuals: -0.038
Loss: 1.989, Residuals: -0.038
Loss: 1.986, Residuals: -0.038
Loss: 1.981, Residuals: -0.040
Loss: 1.979, Residuals: -0.041
Loss: 1.979, Residuals: -0.041
Loss: 1.974, Residuals: -0.043
Loss: 1.974, Residuals: -0.043
Optimization terminated successfully.
Evidence -40.170
U

Loss: 112.562, Residuals: 0.018
Loss: 112.100, Residuals: 0.019
Loss: 111.629, Residuals: 0.017
Loss: 111.548, Residuals: 0.016
Loss: 111.521, Residuals: 0.016
Loss: 111.498, Residuals: 0.016
Loss: 111.480, Residuals: 0.016
Loss: 111.478, Residuals: 0.016
Loss: 111.476, Residuals: 0.016
Optimization terminated successfully.
Evidence 411.256
Updating hyper-parameters...
Total samples: 31, Updated regularization: 2.66e+00
Loss: 117.293, Residuals: 0.009
Loss: 117.163, Residuals: 0.008
Loss: 117.140, Residuals: 0.010
Loss: 117.098, Residuals: 0.009
Loss: 117.033, Residuals: 0.009
Loss: 116.988, Residuals: 0.007
Loss: 116.987, Residuals: 0.007
Optimization terminated successfully.
Evidence 416.464
Updating hyper-parameters...
Total samples: 31, Updated regularization: 2.40e+00
Loss: 119.830, Residuals: 0.005
Loss: 119.790, Residuals: 0.004
Loss: 119.738, Residuals: 0.004
Loss: 119.717, Residuals: 0.003
Loss: 119.715, Residuals: 0.003
Loss: 119.714, Residuals: 0.002
Optimization terminated 

Dropped BU-CA-CD-CS-DP
Picked BU-CA-CD-CH-CS
Total samples: 38, Initial regularization: 1.00e-03
Loss: 8.534, Residuals: -0.361
Loss: 5.497, Residuals: 0.186
Loss: 4.583, Residuals: 0.143
Loss: 4.162, Residuals: 0.112
Loss: 3.636, Residuals: 0.041
Loss: 3.525, Residuals: 0.011
Loss: 3.323, Residuals: 0.001
Loss: 3.002, Residuals: 0.001
Loss: 2.750, Residuals: 0.009
Loss: 2.673, Residuals: 0.025
Loss: 2.550, Residuals: 0.006
Loss: 2.518, Residuals: 0.011
Loss: 2.463, Residuals: -0.001
Loss: 2.390, Residuals: -0.023
Loss: 2.380, Residuals: -0.010
Loss: 2.363, Residuals: -0.015
Loss: 2.349, Residuals: -0.024
Loss: 2.327, Residuals: -0.029
Loss: 2.325, Residuals: -0.023
Loss: 2.309, Residuals: -0.028
Loss: 2.283, Residuals: -0.034
Loss: 2.275, Residuals: -0.026
Loss: 2.275, Residuals: -0.023
Loss: 2.254, Residuals: -0.030
Loss: 2.248, Residuals: -0.023
Loss: 2.248, Residuals: -0.023
Loss: 2.234, Residuals: -0.027
Loss: 2.234, Residuals: -0.023
Loss: 2.220, Residuals: -0.025
Loss: 2.220, Re

In [6]:
for file in files:
    fname = f"designs/" + file.split("_")[1] + "_design.csv"
    eval_times = pd.read_csv(fname)['Time'].values
    print(fname, sum(eval_times>0))

designs/MS001_design.csv 16
designs/MS014_design.csv 16
designs/MS008_design.csv 16
designs/DSM_design.csv 16


# Design non-CD conditions

In [7]:
non_cd = np.in1d(df.Treatments.values, [exp for exp in df.Treatments.values if "CD" not in exp])
species = np.array([s for s in species if "CD" not in s])
species

array(['BT', 'BV', 'CH', 'BU', 'CS', 'CA', 'DP'], dtype='<U2')

In [8]:
non_cd_df = df.iloc[non_cd][["Treatments", "Time"] + list(species)].copy()
non_cd_df

Unnamed: 0,Treatments,Time,BT,BV,CH,BU,CS,CA,DP
0,BT-BU-BV-CA-CH-CS-DP,0,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429,0.001429
1,BT-BU-BV-CA-CH-CS-DP,12,1.104770,0.179778,0.212016,0.401491,0.021472,0.018854,0.046787
2,BT-BU-BV-CA-CH-CS-DP,24,1.043519,0.026562,0.191175,0.303696,0.039351,0.028246,0.059284
18,BT-CA,0,0.005000,0.000000,0.000000,0.000000,0.000000,0.005000,0.000000
19,BT-CA,12,1.715623,0.000000,0.000000,0.000000,0.000000,0.048044,0.000000
...,...,...,...,...,...,...,...,...,...
169,MonocultureDP,12,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.120400
170,MonocultureDP,15,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.126867
171,MonocultureDP,18,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.138367
172,MonocultureDP,21,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.156267


In [9]:
# instantiate gLV fit 
model = gLV(species, non_cd_df)

# fit to data 
model.fit()

### design experiment ###

# create matrix of all possible communities
dim = len(species)
Xlist = [np.reshape(np.array(i), (1, dim)) for i in itertools.product([0, 1], repeat = dim)]
# remove all zeros community
X = np.array(np.concatenate(Xlist)[1:, :][::-1], float)
# exclude mono cultures
non_mono_inds = np.sum(X, 1) > 1
X = X[non_mono_inds]

# scale initial conditions 
total_OD = .01 
X = total_OD * np.einsum("ij,i->ij", X, 1/np.sum(X, 1))

# generate design matrix 
design_df = pd.DataFrame()
for i,x in enumerate(X):
    exp_name = gen_exp_name(np.sort(species[x>0]))
    if exp_name not in exp_names:
        # eval time [0, 24]
        x_mat = np.empty([2, dim])
        x_mat[:] = np.nan
        x_mat[0] = x
        df_exp = pd.DataFrame()
        df_exp['Treatments'] = 2*[exp_name]
        df_exp['Time'] = t_eval
        df_exp[species] = x_mat
        design_df = pd.concat((design_df, df_exp))

# remove samples that have already been collected 
prev_exp = np.unique(non_cd_df.Treatments.values)
dup_inds = np.in1d(design_df.Treatments.values, prev_exp)
design_df = design_df.iloc[~dup_inds].copy()

# determine best set of new experiments to collect 
# N is the total number of measurements (corresponds to number of wells in 96 well plate)
new_exp = model.design(design_df, N=30)

# new data to collect
inds = np.in1d(design_df.Treatments.values, new_exp)
new_df = design_df.iloc[inds].copy()

# save design
fname = f"designs/universal_design.csv"
new_df.to_csv(fname, index=False)

Total samples: 22, Initial regularization: 1.00e-03
Loss: 4.334, Residuals: -0.044
Loss: 3.006, Residuals: 0.075
Loss: 2.328, Residuals: 0.035
Loss: 2.048, Residuals: 0.015
Loss: 1.802, Residuals: 0.003
Loss: 1.706, Residuals: 0.040
Loss: 1.660, Residuals: 0.030
Loss: 1.641, Residuals: 0.017
Loss: 1.614, Residuals: 0.021
Loss: 1.596, Residuals: 0.037
Loss: 1.564, Residuals: 0.031
Loss: 1.506, Residuals: 0.023
Loss: 1.504, Residuals: 0.038
Loss: 1.441, Residuals: 0.030
Loss: 1.439, Residuals: 0.032
Loss: 1.436, Residuals: 0.042
Loss: 1.413, Residuals: 0.034
Loss: 1.413, Residuals: 0.041
Loss: 1.400, Residuals: 0.036
Loss: 1.400, Residuals: 0.036
Optimization terminated successfully.
Evidence -28.111
Updating hyper-parameters...
Total samples: 22, Updated regularization: 9.66e-01
Loss: 14.553, Residuals: 0.037
Optimization terminated successfully.
Evidence 98.516
Updating hyper-parameters...
Total samples: 22, Updated regularization: 1.07e+00
Loss: 28.959, Residuals: 0.038
Optimization t

In [14]:
new_df.iloc[new_df.Time.values == 0].to_csv("designs/unversal_t0.csv", index=False)

In [15]:
new_df.iloc[new_df.Time.values == 0]

Unnamed: 0,Treatments,Time,BT,BV,CH,BU,CS,CA,DP
0,BT-BV-CH-DP,0,0.0025,0.0025,0.0025,0.0,0.0,0.0,0.0025
0,BT-BU-BV-CA-CS-DP,0,0.001667,0.001667,0.0,0.001667,0.001667,0.001667,0.001667
0,BT-BU-BV-CA-DP,0,0.002,0.002,0.0,0.002,0.0,0.002,0.002
0,BT-BV-CA-CS-DP,0,0.002,0.002,0.0,0.0,0.002,0.002,0.002
0,BT-BU-CA-CH-CS,0,0.002,0.0,0.002,0.002,0.002,0.002,0.0
0,BT-BU-CA-CH,0,0.0025,0.0,0.0025,0.0025,0.0,0.0025,0.0
0,BT-BU-CH-DP,0,0.0025,0.0,0.0025,0.0025,0.0,0.0,0.0025
0,BT-CA-CH-CS-DP,0,0.002,0.0,0.002,0.0,0.002,0.002,0.002
0,BT-BU-CA-CS-DP,0,0.002,0.0,0.0,0.002,0.002,0.002,0.002
0,BU-BV-CA-CH-CS-DP,0,0.0,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667
