In [1]:
import pandas as pd
import numpy as np
import os 

In [2]:
def one_hot_encoder(p_vals):
    """One-hot encode a sequence of categorical values.

    Parameters
    ----------
    p_vals : array-like
        Values to encode, one entry per output row.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(p_vals), n_categories)`` where
        ``n_categories`` is the number of distinct values in ``p_vals``.
        Columns follow the order returned by ``np.unique(p_vals)`` (sorted
        unique values); a row holds 1.0 in the column matching its value
        and 0.0 everywhere else.
    """
    categories = np.unique(p_vals)

    hot_p_vals = np.zeros([len(p_vals), len(categories)])
    for i, p_val in enumerate(p_vals):
        # np.isin replaces np.in1d, which is deprecated since NumPy 2.0;
        # for a 1-D `categories` array both yield the same boolean mask.
        hot_p_vals[i, np.isin(categories, p_val)] = 1.

    return hot_p_vals

In [3]:
# Directory holding the raw CSV files; processed copies are written next to them.
DATA_DIR = "data"

# Columns carried over from each raw file into its processed copy.
save_columns = ["Treatments", "Time", "BT", "BV", "CD", "CH", "BU", "CS", "CA", "DP"]

# Names of the one-hot passage indicator columns, matched positionally to the
# sorted unique passage values.
# NOTE(review): presumably passages are numbered 1-3 in every file -- confirm.
PASSAGE_COLUMNS = ["P1", "P2", "P3"]

# Collect raw input file names. Skipped on purpose:
#   * outputs of earlier runs (anything with "processed" in the name),
#   * hidden entries such as .DS_Store / .ipynb_checkpoints,
#   * sub-directories, which pd.read_csv cannot open.
# Sorting makes the processing order deterministic (os.listdir order is arbitrary).
files = sorted(
    f for f in os.listdir(DATA_DIR)
    if "processed" not in f
    and not f.startswith(".")
    and os.path.isfile(os.path.join(DATA_DIR, f))
)

for file in files:
    # import df
    df = pd.read_csv(os.path.join(DATA_DIR, file))

    # remove extra columns (slice first, then copy, so only the kept columns are duplicated)
    df_save = df[save_columns].copy()

    # rename treatment to include passage info, e.g. "<treatment>_<passage>"
    Treatments = [t + "_" + str(p) for t, p in zip(df["Treatments"].values, df["Passage"].values)]
    df_save["Treatments"] = Treatments

    # one hot encode passage number
    hot_passages = one_hot_encoder(df["Passage"].values)
    if hot_passages.shape[1] != len(PASSAGE_COLUMNS):
        # Fail with the offending file name instead of pandas' generic
        # "Columns must be same length as key" error.
        raise ValueError(
            f"{file}: expected {len(PASSAGE_COLUMNS)} distinct passages, "
            f"found {hot_passages.shape[1]}"
        )
    df_save[PASSAGE_COLUMNS] = hot_passages

    # save processed df; splitext strips only the final extension, so names
    # containing extra dots ("run.v2.csv") keep their full stem
    fname = os.path.join(DATA_DIR, os.path.splitext(file)[0] + "_processed.csv")
    df_save = df_save.sort_values(by=["Treatments", "Time"])
    df_save.to_csv(fname, index=False)