In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from scipy.stats import norm, linregress

import os
import time
import itertools

from sklearn.model_selection import KFold

  from pandas.core import (


In [2]:
# number of folds for the outer and inner cross-validation loops
n_splits_outer = 20
n_splits_inner = 10

# Collect the input file names.
# os.listdir returns entries in arbitrary order and also lists hidden files
# and sub-directories (e.g. ".DS_Store", ".ipynb_checkpoints"), so keep only
# regular, non-hidden files and sort them for a reproducible processing order.
data_dir = "../data/SET3_Thirdtrial/"
files = sorted(
    name
    for name in os.listdir(data_dir)
    if not name.startswith(".") and os.path.isfile(os.path.join(data_dir, name))
)

In [3]:
# Build nested K-fold train/test splits for every input file.
#
# For each file, the outer loop splits the community treatments into
# n_splits_outer folds; each outer training set is split again into
# n_splits_inner folds. Mono culture rows are always kept in the training
# data, and replicates of each held-out treatment are averaged per time point.


def split_mono_and_community(frame):
    """Separate mono culture rows from community rows.

    A treatment is considered a community condition when its name contains a
    hyphen; every other treatment is treated as a mono culture.

    Parameters
    ----------
    frame : pd.DataFrame
        Data with a "Treatments" column.

    Returns
    -------
    mono_df : pd.DataFrame
        Rows of all treatments without a hyphen in their name.
    community_df : pd.DataFrame
        Rows of all treatments with a hyphen in their name.
    base_names : np.ndarray
        One entry per row of ``community_df``: the treatment name up to the
        first underscore (i.e. without the replicate identifier).

    Raises
    ------
    ValueError
        If ``frame`` contains no community treatments or no mono cultures
        (raised by ``np.concatenate`` / ``pd.concat`` on an empty list).
    """
    mono_parts = []
    community_parts = []
    base_names = []
    for treatment, group in frame.groupby("Treatments"):
        if "-" in treatment:
            community_parts.append(group)
            # repeat the replicate-free name once per row so it lines up
            # with the rows of the concatenated community dataframe
            base_names.append([treatment.split("_")[0]] * group.shape[0])
        else:
            mono_parts.append(group)
    base_names = np.concatenate(base_names)
    mono_df = pd.concat(mono_parts)
    community_df = pd.concat(community_parts)
    return mono_df, community_df, base_names


def average_replicates(community_df, base_names, test_treatment, species):
    """Average all replicates of one treatment at each measurement time.

    Parameters
    ----------
    community_df : pd.DataFrame
        Community rows; must have a "Time" column and the ``species`` columns.
    base_names : np.ndarray
        Replicate-free treatment name for each row of ``community_df``.
    test_treatment : str
        Replicate-free name of the treatment to average.
    species : array-like of str
        Names of the columns to average.

    Returns
    -------
    pd.DataFrame
        One row per unique time, with columns "Treatments", "Time" and one
        column per entry of ``species`` holding the mean over replicates.
    """
    replicate_rows = community_df.iloc[np.isin(base_names, test_treatment)]
    sample_times = np.unique(replicate_rows.Time.values)

    averaged = pd.DataFrame()
    averaged["Treatments"] = [test_treatment] * len(sample_times)
    averaged["Time"] = sample_times

    means = np.zeros([len(sample_times), len(species)])
    for row, sample_time in enumerate(sample_times):
        at_time = replicate_rows.Time.values == sample_time
        means[row] = replicate_rows.iloc[at_time][species].mean()
    averaged[species] = means
    return averaged


def make_fold(mono_df, community_df, base_names, unique_names,
              train_idx, test_idx, species):
    """Assemble the train and test dataframes for a single fold.

    Parameters
    ----------
    mono_df, community_df, base_names
        Output of ``split_mono_and_community``.
    unique_names : np.ndarray
        Unique replicate-free community treatment names (what KFold split).
    train_idx, test_idx : np.ndarray
        Integer positions into ``unique_names`` for this fold.
    species : array-like of str
        Names of the species columns.

    Returns
    -------
    train_df : pd.DataFrame
        All mono culture rows followed by the raw (un-averaged) rows of the
        training community treatments.
    test_df : pd.DataFrame
        Replicate-averaged rows of the held-out community treatments.
    """
    train_mask = np.isin(base_names, unique_names[train_idx])
    train_df = pd.concat((mono_df, community_df.iloc[train_mask].copy()))
    test_df = pd.concat([
        average_replicates(community_df, base_names, name, species)
        for name in unique_names[test_idx]
    ])
    return train_df, test_df


# run kfold for each file
for file in files:
    strain = file.split("_")[0]

    # import data
    df = pd.read_csv(f"../data/SET3_Thirdtrial/{file}")
    df = df.sort_values(by=["Treatments", "Time"])

    # keep only treatments that have more than one row
    df = pd.concat([
        df_t for _, df_t in df.groupby("Treatments") if df_t.shape[0] > 1
    ])

    # determine species names (every column after the first two)
    species = df.columns.values[2:]

    # separate mono culture data; from here on df holds community rows only
    mono_df, df, treatments = split_mono_and_community(df)
    unique_treatments = np.unique(treatments)

    # to_csv does not create missing directories, so make sure it exists
    out_dir = f"folds_{strain}"
    os.makedirs(out_dir, exist_ok=True)

    # init kfold object
    kf = KFold(n_splits=n_splits_outer, shuffle=True, random_state=21)

    # run outer Kfold over the unique community treatments
    for outer_idx, (train_index, test_index) in enumerate(kf.split(unique_treatments)):

        train_df, test_df = make_fold(
            mono_df, df, treatments, unique_treatments,
            train_index, test_index, species,
        )

        # save train / test splits
        train_df.to_csv(f"{out_dir}/train_{outer_idx}.csv", index=False)
        test_df.to_csv(f"{out_dir}/test_{outer_idx}.csv", index=False)

        # separate mono culture data within the outer training set
        inner_mono_df, inner_df, inner_treatments = split_mono_and_community(train_df)
        inner_unique_treatments = np.unique(inner_treatments)

        # init kfold object
        inner_kf = KFold(n_splits=n_splits_inner, shuffle=True, random_state=21)

        # run inner Kfold; distinct index names keep the outer fold indices intact
        for inner_idx, (inner_train_index, inner_test_index) in enumerate(
            inner_kf.split(inner_unique_treatments)
        ):
            inner_train_df, inner_test_df = make_fold(
                inner_mono_df, inner_df, inner_treatments, inner_unique_treatments,
                inner_train_index, inner_test_index, species,
            )

            # save train / test splits
            inner_train_df.to_csv(f"{out_dir}/train_{outer_idx}_{inner_idx}.csv", index=False)
            inner_test_df.to_csv(f"{out_dir}/test_{outer_idx}_{inner_idx}.csv", index=False)