In [6]:
import pandas as pd
import numpy as np
from dataclasses import dataclass


#define data classes:
@dataclass
class Run:
    """Parameters and per-tick series belonging to one simulation run.

    Instances are created in the run-building cell, one per distinct value of
    the ``run`` index of the simulated data.
    """

    # Run identifier (a value of the ``run`` index).
    id: int
    # First ``decayRate`` value recorded for the run.
    decay: float
    # First ``surfaceTransferFraction`` value recorded for the run.
    touchTransferFraction: float

    # Per-tick columns of the run, filled from ``count``, ``occupancy``,
    # ``CDIFF`` and ``anyCP`` respectively.
    counts: list[int]
    occupancies: list[int]
    cdffs: list[int]  # spelling is part of the constructor interface; Sample uses ``cdiffs``
    anyCps: list[int]

@dataclass
class Sample:
    """One fixed-length window cut out of a simulation run.

    Instances are produced by ``split_sequences``; each one carries the run's
    parameters plus the slice of every per-tick series that falls in the window.
    """

    # Identifier of the run the window was taken from.
    run: int
    # Tick at which the window starts.
    startDay: int
    # Decay rate of the originating run.
    decay: float
    # Surface transfer fraction of the originating run (plural here, singular on Run).
    touchTransferFractions: float

    # Windowed slices of the run's per-tick series.
    counts: list[int]
    occupancies: list[int]
    cdiffs: list[int]
    anyCps: list[int]
        


In [7]:
# Load the simulated data with `run` as the index.
# The trailing sort_values call decides the final row order (ascending tick);
# rows that share a tick are not kept in run order, as the head() output shows.

data = (
    pd.read_csv('data/sim_data.csv', index_col=['run'])
    .sort_index()
    .sort_values(by="tick")
)
data.head()

Unnamed: 0_level_0,count,tick,decayRate,surfaceTransferFraction,CDIFF,occupancy,anyCP
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
16,23.0,90.0,0.590993,0.939448,0.0,15,2.0
8,0.0,90.0,0.789146,0.665336,0.0,15,5.0
6,2.0,90.0,0.728805,0.799395,0.0,16,3.0
19,5.0,90.0,0.781537,0.443818,2.0,12,2.0
2,0.0,90.0,0.539657,0.258963,0.0,20,4.0


In [41]:
# How many runs do we have and what are their lengths?
#for i in data.index.unique():
    #print(i, ": ", len(data.loc[i]))
    
# Build one Run object per simulated run, in order of first appearance in `data`.
# Each Run carries the run's scalar parameters plus its per-tick columns
# (kept as pandas Series, in the row order of `data`).
runs = []

# groupby(..., sort=False) visits each run id once, keeps first-appearance
# order, and always hands back a DataFrame (data.loc[i] collapses to a Series
# when a run has a single row, which would break the .iloc[0] lookups).
for run_id, run_rows in data.groupby(level=0, sort=False):
    run = Run(
        id=run_id,
        # The parameters are constant within a run, so the first row is enough.
        decay=run_rows["decayRate"].iloc[0],
        touchTransferFraction=run_rows["surfaceTransferFraction"].iloc[0],
        # Bracket access is required here: attribute access `.count` resolves
        # to the DataFrame.count method rather than the `count` column.
        counts=run_rows["count"],
        occupancies=run_rows["occupancy"],
        cdffs=run_rows["CDIFF"],
        anyCps=run_rows["anyCP"],
    )
    runs.append(run)

# print(runs)

In [26]:
#define a function to split the sequences into lists of n steps (we have 56 days of observed data)
#we will split the simulated data into sequences n long (90-136, 91-137, 92-138, etc.)

def split_sequences(run, number_of_steps):
    """Cut one run into every complete window of ``number_of_steps`` consecutive rows.

    Parameters
    ----------
    run : pandas.DataFrame
        Rows of a single simulation run, indexed by run id, with the columns
        ``tick``, ``decayRate``, ``surfaceTransferFraction``, ``count``,
        ``occupancy``, ``CDIFF`` and ``anyCP``.
    number_of_steps : int
        Window length in rows (one row per tick is assumed -- TODO confirm
        there are no gaps in the simulated ticks).

    Returns
    -------
    list[Sample]
        One Sample per window, ordered by start tick. ``startDay`` is the tick
        of the window's first row; the series fields are pandas Series slices
        of length ``number_of_steps``. Empty when the run has fewer rows than
        ``number_of_steps``.

    Raises
    ------
    ValueError
        If ``number_of_steps`` is not positive.

    Notes
    -----
    Every complete window is returned, including the ones ending on the last
    tick (the earlier ``arange(start, max_tick - n)`` bound stopped two windows
    short).
    """
    window = int(number_of_steps)
    if window <= 0:
        raise ValueError("number_of_steps must be a positive integer")

    n_rows = len(run)
    if n_rows < window:
        # Not enough rows for even one complete window.
        return []

    # Work in tick order so that each positional slice is a contiguous stretch.
    ordered = run.sort_values(by="tick", kind="stable")

    run_id = ordered.index[0]
    ticks = ordered["tick"]
    # Per-run parameters: take the scalar from the first row, as the
    # run-building cell does, instead of passing the whole column.
    decay = ordered["decayRate"].iloc[0]
    transfer_fraction = ordered["surfaceTransferFraction"].iloc[0]
    # Bracket access: `.count` is the DataFrame method, not the column.
    counts = ordered["count"]
    occupancies = ordered["occupancy"]
    cdiffs = ordered["CDIFF"]
    any_cps = ordered["anyCP"]

    run_samples = []
    # Slice by row position, not by tick value: ticks start at 90 in the
    # simulated data while row positions start at 0.
    for first in range(n_rows - window + 1):
        last = first + window
        run_samples.append(
            Sample(
                run=run_id,
                startDay=int(ticks.iloc[first]),
                decay=decay,
                touchTransferFractions=transfer_fraction,
                counts=counts.iloc[first:last],
                occupancies=occupancies.iloc[first:last],
                cdiffs=cdiffs.iloc[first:last],
                anyCps=any_cps.iloc[first:last],
            )
        )
    return run_samples

In [42]:
# define a function to split the list of runs into a list of samples

def split_runs(runs, number_of_steps):
    """Split every run into fixed-length samples and return them as one flat list.

    Parameters
    ----------
    runs : iterable
        Runs to split. Each element is passed unchanged to ``split_sequences``,
        which reads DataFrame-style columns (``tick``, ``decayRate``, ...).
        NOTE(review): the ``Run`` dataclass has no ``tick`` field, so a list of
        ``Run`` objects is not a valid input as written -- confirm what callers
        are meant to pass.
    number_of_steps : int
        Window length forwarded to ``split_sequences``.

    Returns
    -------
    list
        The samples of all runs, concatenated in the order of ``runs``
        (empty when ``runs`` is empty).
    """
    samples = []
    for run in runs:
        samples.extend(split_sequences(run, number_of_steps))
    # Hand the accumulated list back; a bare `return` would yield None.
    return samples