In [1]:
import pandas as pd
import numpy as np
import os

from scipy.stats import pearsonr

  from pandas.core import (


In [2]:
# Candidate L1 penalty strengths to compare.
# NOTE: each value is interpolated as-is into the prediction file names read
# further down (f"..._{lmbda}.csv"), so the spelling of these floats matters:
# computing them another way (e.g. 10.0**-5) could change their repr and
# therefore the file that is looked up.
lmbdas = [0., 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1.]

# C. diff strain identifiers; each one is used to build the input folder
# names (folds_<strain>/, <strain>_preds/) and the output file name
# (<strain>_hyper_df.csv).
strains = ['CDanc', 'CDevo']

In [3]:
# For every strain and every outer cross-validation fold, pick the L1 penalty
# whose inner-fold predictions correlate best (mean Pearson r over species)
# with the replicate-averaged training data, then save one table per strain.


def _load_averaged_train(strain, outer):
    """Read one outer-fold training set and average its replicates.

    Parameters
    ----------
    strain : str
        Strain identifier used in the folder name ``folds_<strain>/``.
    outer : int
        Outer fold index used in the file name ``train_<outer>.csv``.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The averaged frame with columns ``Treatments``, ``Time`` and one
        column per species, and the array of species column names.
    """
    raw_df = pd.read_csv(f"folds_{strain}/train_{outer}.csv")

    # every column from the third one onward is treated as a species
    species = raw_df.columns.values[2:]

    # remove rep number from exp names (everything after the first "_")
    raw_df['Treatments'] = [name.split("_")[0] for name in raw_df.Treatments.values]

    # average replicates: one row per (treatment, time point)
    averaged = []
    for exp_name, df_exp in raw_df.groupby("Treatments"):
        df_avg = df_exp.groupby("Time")[species].mean().reset_index()
        df_avg.insert(0, "Treatments", [exp_name] * df_avg.shape[0])
        averaged.append(df_avg)

    return pd.concat(averaged), species


def _load_inner_predictions(strain, outer, lmbda, train_avg_df, n_inner=10):
    """Collect the inner-fold predictions available for one L1 penalty.

    Missing prediction files are reported and skipped. Any other error
    (malformed file, missing column, ...) is allowed to propagate instead of
    being misreported as a missing file.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame) or (None, None)
        Concatenated predictions and the matching rows of ``train_avg_df``;
        ``(None, None)`` when no inner fold had a prediction file.
    """
    pred_dfs = []
    true_dfs = []
    for inner in range(n_inner):
        try:
            pred_df = pd.read_csv(f"{strain}_preds/{strain}_pred_{outer}_{inner}_{lmbda}.csv")
        except FileNotFoundError:
            print(f"missing {outer}_{inner}_{lmbda}.csv")
            continue

        # keep only the measured treatments this inner fold predicted
        # (np.isin replaces np.in1d, which is deprecated in NumPy 2.0)
        sample_inds = np.isin(train_avg_df.Treatments.values, pred_df.Treatments.values)
        true_dfs.append(train_avg_df.iloc[sample_inds].copy())
        pred_dfs.append(pred_df)

    if not pred_dfs:
        return None, None
    return pd.concat(pred_dfs), pd.concat(true_dfs)


def _mean_species_correlation(true_df, pred_df, species):
    """Return the mean over species of Pearson r between data and predictions.

    For each species, only treatments whose first row holds a positive value
    for that species are scored (presumably the species was absent otherwise
    -- confirm against the experimental design), and the first row of every
    treatment is excluded from the comparison. Rows of ``pred_df`` are matched
    to ``true_df`` by treatment name and by position within the treatment.
    """
    r_values = []
    for s in species:
        true_vals = []
        pred_vals = []
        for exp_name, exp_true_df in true_df.groupby("Treatments"):
            if exp_true_df[s].values[0] > 0:
                exp_pred_df = pred_df.iloc[pred_df.Treatments.values == exp_name]
                true_vals.append(exp_true_df[s].values[1:])
                pred_vals.append(exp_pred_df[s].values[1:])

        r_values.append(pearsonr(np.concatenate(true_vals), np.concatenate(pred_vals))[0])

    return np.mean(r_values)


# for each CD strain
for strain in strains:

    # one record of optimized hyperparameters per outer fold
    records = []
    # best inner-CV score per outer fold (not used further in this cell)
    r_vals_k = []

    for outer in range(20):

        # the averaged training data depends only on (strain, outer), so it is
        # loaded once here rather than once per penalty
        train_avg_df, species = _load_averaged_train(strain, outer)

        # r_vals[i] is the score of lmbdas[i]; NaN marks "could not be scored"
        r_vals = []
        for lmbda in lmbdas:
            pred_df, true_df = _load_inner_predictions(strain, outer, lmbda, train_avg_df)
            if pred_df is None:
                print(f"no predictions found for {strain} outer fold {outer} with L1={lmbda}; skipping")
                r_vals.append(np.nan)
                continue
            r_vals.append(_mean_species_correlation(true_df, pred_df, species))

        # determine best hyperparams.
        # np.argmax would return the position of the first NaN, i.e. it would
        # select a penalty whose score is undefined (pearsonr yields NaN for
        # constant input). nanargmax ignores those entries and raises
        # ValueError if no penalty could be scored at all.
        best = np.nanargmax(r_vals)
        r_vals_k.append(np.nanmax(r_vals))

        # 'k' is stored as float to keep the CSV identical to what the
        # previous row-by-row DataFrame assignment produced
        records.append({'k': float(outer), 'L1': lmbdas[best]})

    # save to dataframe
    hyper_df = pd.DataFrame(records, columns=['k', 'L1'])
    hyper_df.to_csv(f"{strain}_hyper_df.csv", index=False)

In [4]:
# Reload the per-fold L1 penalties saved for the CDanc strain and display them.
hyper_df = pd.read_csv("CDanc_hyper_df.csv")
hyper_df

Unnamed: 0,k,L1
0,0.0,1e-05
1,1.0,1e-05
2,2.0,1e-05
3,3.0,1e-07
4,4.0,1e-05
5,5.0,1e-05
6,6.0,1e-06
7,7.0,1e-06
8,8.0,1e-06
9,9.0,1e-05


In [5]:
# Reload the per-fold L1 penalties saved for the CDevo strain and display them.
hyper_df = pd.read_csv("CDevo_hyper_df.csv")
hyper_df

Unnamed: 0,k,L1
0,0.0,0.0001
1,1.0,0.0
2,2.0,1e-06
3,3.0,1e-05
4,4.0,1e-06
5,5.0,0.0
6,6.0,1e-06
7,7.0,1e-06
8,8.0,1e-05
9,9.0,0.0001
