In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from scipy.stats import norm

import os
import time
import itertools

# Import raw data file names

In [2]:
# collect the names of the raw data files that belong to experiment EXP0012
files = [fname for fname in os.listdir("data/") if "EXP0012" in fname]
files

['EXP0012_P2_MS001_12h.xlsx',
 'EXP0012_P1_DSM_24h.xlsx',
 'EXP0012_P2_MS001_24h.xlsx',
 'EXP0012_P5_UNIV_24h.xlsx',
 'EXP0012_P4_MS014_12h.xlsx',
 'EXP0012_P3_MS008_12h.xlsx',
 'EXP0012_P1_DSM_12h.xlsx',
 'EXP0012_P5_UNIV_12h.xlsx',
 'EXP0012_P4_MS014_24h.xlsx',
 'EXP0012_P3_MS008_24h.xlsx']

# Preprocess data

In [3]:
# species identifiers and the matching absolute-abundance column headers ("Abs <species>")
sp_names = ["CA", "BT", "BU", "CS", "CD", "DP", "CH", "BV"]
sp_abs_abundance = [f"Abs {sp}" for sp in sp_names]
n_species = len(sp_names)

# parameter labels: one per species, followed by every ordered pair "s1*s2"
# (first species varies slowest, self-pairs included)
pairwise_names = [f"{first}*{second}" for first in sp_names for second in sp_names]
param_names = sp_names + pairwise_names

def get_ic(exp_name, species=None, total=.01):
    """Compute the initial condition for a sample that contains Cdiff ("CD").

    The treatment name is split on "-". The first token is discarded
    (presumably the strain/plate label -- confirm against the raw sheets) and
    "CD" is always appended, so CD counts as inoculated whatever the name says.
    The total inoculum is divided equally among the species that are present.

    Parameters
    ----------
    exp_name : str
        Dash-separated treatment name, e.g. "<label>-BT-BU".
    species : sequence of str, optional
        Species order of the returned vector. Defaults to the notebook-level
        `sp_names`.
    total : float, optional
        Total inoculation abundance shared by the present species
        (default 0.01, the value originally hard-coded here).

    Returns
    -------
    numpy.ndarray
        One entry per species: `total / n_present` for species that are
        present, 0 otherwise.
    """
    if species is None:
        species = sp_names

    # determine which species were in the experiment (CD is always added)
    sp_exp = exp_name.split("-")[1:] + ["CD"]

    # 0/1 indicator per species; np.isin replaces np.in1d, deprecated since NumPy 2.0
    present = np.isin(species, sp_exp).astype(int)

    # split the total inoculum evenly among the present species
    return present / present.sum() * total

def get_univ_ic(exp_name, species=None, total=.01):
    """Compute the initial condition for a sample without Cdiff.

    Every dash-separated token of the treatment name is treated as a species
    identifier; nothing is dropped and nothing is added. The total inoculum is
    divided equally among the species that are present.

    Parameters
    ----------
    exp_name : str
        Dash-separated species identifiers, e.g. "BT-BU".
    species : sequence of str, optional
        Species order of the returned vector. Defaults to the notebook-level
        `sp_names`.
    total : float, optional
        Total inoculation abundance shared by the present species
        (default 0.01, the value originally hard-coded here).

    Returns
    -------
    numpy.ndarray
        One entry per species: `total / n_present` for species that are
        present, 0 otherwise. If no token matches a known species the division
        is 0/0 and the vector is all-NaN (NumPy emits a RuntimeWarning).
    """
    if species is None:
        species = sp_names

    # determine which species were in the experiment
    sp_exp = exp_name.split("-")

    # 0/1 indicator per species; np.isin replaces np.in1d, deprecated since NumPy 2.0
    present = np.isin(species, sp_exp).astype(int)

    # split the total inoculum evenly among the present species
    return present / present.sum() * total

# Hours at which every assembled trajectory is sampled: inoculation, then the two measurements
TIME_POINTS = (0, 12, 24)


def _build_trajectories(df_12, df_24, ic_fn):
    """Assemble one three-row (0 h, 12 h, 24 h) trajectory per "Combination" group.

    Parameters
    ----------
    df_12, df_24 : pandas.DataFrame
        12 h and 24 h measurements. Both need a "Combination" column and the
        `sp_abs_abundance` columns.
    ic_fn : callable
        Maps a combination name to its initial-condition vector
        (`get_ic` or `get_univ_ic`).

    Returns
    -------
    list of pandas.DataFrame
        One frame per combination with columns "Treatments", "Time" and
        `sp_names`; rows are the initial condition followed by the column-wise
        means of the 12 h and 24 h groups.

    Raises
    ------
    AssertionError
        If the i-th 12 h group and the i-th 24 h group have different names.
    """
    trajectories = []
    paired_groups = zip(df_12.groupby("Combination"), df_24.groupby("Combination"))
    for (name_12, df_12_i), (name_24, df_24_i) in paired_groups:
        # Groups are paired by position, so a combination missing from one time
        # point would silently pair different treatments without this check.
        # NOTE(review): zip still stops at the shorter side, so extra trailing
        # groups in one workbook are dropped without an error.
        assert name_12 == name_24, "Incorrect experimental conditions at time points"

        exp_data = np.stack([
            ic_fn(name_12),
            np.mean(df_12_i[sp_abs_abundance].values, 0),
            np.mean(df_24_i[sp_abs_abundance].values, 0),
        ])
        df_exp = pd.DataFrame({
            "Treatments": [name_12] * len(TIME_POINTS),
            "Time": np.array(TIME_POINTS, float),
        })
        df_exp[sp_names] = exp_data
        trajectories.append(df_exp)
    return trajectories


# The universal plate (P5) is the same for every strain, so read and assemble it
# once instead of re-parsing both workbooks on every pass through the loop.
univ_trajectories = _build_trajectories(
    pd.read_excel("data/EXP0012_P5_UNIV_12h.xlsx").dropna(),
    pd.read_excel("data/EXP0012_P5_UNIV_24h.xlsx").dropna(),
    get_univ_ic,
)

# loop through the strain-specific plates (P1-P4)
for plate in range(1, 5):
    # workbooks of this plate in lexicographic order, which puts 12h before 24h
    strain_files = sorted(f for f in files if f"P{plate}" in f)

    # make sure that the first file corresponds to the 12h measurement
    assert strain_files[0].split("_")[-1].split(".")[0] == '12h', "Incorrect order of time points"

    # strain specific samples: 12 and 24 hour data
    df_12 = pd.read_excel(f"data/{strain_files[0]}").dropna()
    df_24 = pd.read_excel(f"data/{strain_files[1]}").dropna()

    # strain-specific trajectories first, then the shared universal ones;
    # a single concat avoids growing the frame one experiment at a time
    df_strain = pd.concat(_build_trajectories(df_12, df_24, get_ic) + univ_trajectories)

    # add monoculture data
    # strain label is the second-to-last "_" token of the file name (e.g. "DSM")
    strain = strain_files[0].split("_")[-2]
    df_mono = pd.read_excel(f"data/Data_monoculture_community_{strain}_Jordy.xlsx").fillna(0.).drop(columns=['EL'])

    # replace specific name of strain with just CD: it is the single monoculture
    # column that df_strain does not have (.item() raises unless exactly one is left)
    strain_name = df_mono.columns[~df_mono.columns.isin(df_strain.columns)].item()
    df_mono = df_mono.rename(columns={strain_name: "CD"})

    # pull only monoculture data up to 24 hours
    mono_frames = [
        df_treatment[df_treatment["Time"] <= TIME_POINTS[-1]]
        for treatment, df_treatment in df_mono.groupby("Treatments")
        if "Mono" in treatment
    ]
    df_mono = pd.concat(mono_frames)

    # append to data in right column order
    df_strain = pd.concat([df_strain, df_mono[df_strain.columns]])

    # save dataframe (for the "EXP0012_P<n>_<strain>_<time>.xlsx" names listed
    # above, the strain label is also the third "_" token used originally)
    df_strain.to_csv(f"data/{strain}_processed_mono.csv", index=False)