# 01 – Data Preparation

Generate or load a dummy dataset with columns `[sample_id, timestep, throughput]`,
containing exactly `seq_len + 1` rows per sample (`seq_len` feature timesteps plus one target row).


In [None]:
# 1) Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scripts.prepare_data import generate_dummy

## 2 – Generate Dummy Data In-Memory


In [None]:
# Dataset dimensions used by this notebook.
SEQ_LEN = 10       # feature timesteps per sample; each sample carries SEQ_LEN + 1 rows
NUM_SAMPLES = 50   # number of distinct sample_id values requested

df = generate_dummy(num_samples=NUM_SAMPLES, seq_len=SEQ_LEN)

# Preview exactly SEQ_LEN + 1 rows, derived from the constant rather than a
# hard-coded count so the preview tracks SEQ_LEN. Presumably this is the whole
# first sample, assuming rows come back ordered by sample_id (TODO confirm).
print(df.head(SEQ_LEN + 1))


## 3 – Inspect Sequence Length


In [None]:
# Verify each sample has SEQ_LEN + 1 rows.
# `group` is a Series of row counts indexed by sample_id.
group = df.groupby("sample_id").size()
print("Unique counts per sample_id:", group.unique())

# Fail loudly instead of relying on a visual check of the printed counts.
assert (group == SEQ_LEN + 1).all(), (
    f"expected {SEQ_LEN + 1} rows per sample_id, found counts {sorted(group.unique())}"
)


## 4 – Quick Plot of Throughput Trace (sample 0)


In [None]:
# Isolate the rows belonging to sample 0 and draw throughput against timestep.
sample0 = df.loc[df["sample_id"] == 0]

# Explicit figure/axes handles instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.plot(sample0["timestep"], sample0["throughput"])
ax.set_xlabel("Timestep")
ax.set_ylabel("Throughput")
ax.set_title(f"Dummy Throughput Trace (sample 0, length={SEQ_LEN+1})")
plt.show()


## 5 – Save to Parquet via CLI


In [None]:
# In Colab or Jupyter shell
!python scripts/prepare_data.py --dummy \
    --num-samples 50 \
    --seq-len 10 \
    --output data/training_data.parquet


## 6 – Load & Verify Parquet


In [None]:
# Read back the parquet file from the path given as --output in the CLI step.
df2 = pd.read_parquet("data/training_data.parquet")
print("Loaded parquet, shape:", df2.shape)

# Distinct per-sample row counts found in the reloaded frame.
print(
    df2.groupby("sample_id")
    .size()
    .unique()
)

# Last expression of the cell: rich display of five randomly chosen rows.
df2.sample(5)
