In [26]:
import xml.etree.ElementTree as ET
from collections import defaultdict
import os

In [27]:
# Load the XML file into an ElementTree so its structure can be inspected.
tree = ET.parse('data_xml/559-ws-testing.xml')

In [28]:
# Top-level element of the parsed document; the tag scan walks the tree from here.
root = tree.getroot()

In [31]:
# Collect every distinct element tag in the document, in first-seen order.
# dict.fromkeys de-duplicates with a hash lookup per element while preserving
# insertion order, which avoids re-scanning a list for each element visited.
unique_elements = list(dict.fromkeys(element.tag for element in root.iter()))
for tag in unique_elements:
    print(tag)

patient
glucose_level
event
finger_stick
basal
temp_basal
bolus
meal
sleep
work
stressors
hypo_event
illness
exercise
basis_heart_rate
basis_gsr
basis_skin_temperature
basis_air_temperature
basis_steps
basis_sleep


In [32]:
# NOTE(review): the wildcard import hides which names this notebook takes from
# helper_functions; only extract_all_features_to_csv is used in this cell.
# Later cells may rely on other names it brings in — confirm before narrowing it.
from helper_functions import *
# Flatten the XML export into a DataFrame; per the recorded output, the call
# also writes data_xml/559-ws-testing_all_features.csv.
df = extract_all_features_to_csv('data_xml/559-ws-testing.xml')

✅ Successfully extracted features!
   Input file: data_xml/559-ws-testing.xml
   Output file: data_xml/559-ws-testing_all_features.csv
   Total rows: 6734
   Columns: timestamp, patient, glucose_level, finger_stick, basal, meal_type, meal_carbs, exercise_intensity, exercise_duration, basis_heart_rate, basis_gsr, basis_skin_temperature, basis_air_temperature, basis_steps


In [49]:
# Re-import helper_functions so the current version of the module is used
# without restarting the kernel. The from-import must come after the reload so
# the name is bound to the function from the reloaded module.
import importlib
import helper_functions
importlib.reload(helper_functions)
from helper_functions import extract_duration_events_to_csv

# Extract duration events (per the recorded preview: sleep, work, bolus and
# temp-basal columns) into their own frame.
df_duration = extract_duration_events_to_csv('data_xml/559-ws-testing.xml')


In [50]:
# Preview the first rows of the duration-event frame.
df_duration.head()

Unnamed: 0,patient,timestamp,duration_minutes,sleep,basis_sleep,work_intensity,bolus_type,bolus_dose,bolus_carb_input,temp_basal_value
0,559,2022-01-17 21:30:00,480.0,3.0,,,,,,
1,559,2022-01-18 06:26:05,0.0,,,,normal,6.3,40.0,
2,559,2022-01-18 11:10:13,0.0,,,,normal,3.3,30.0,
3,559,2022-01-18 13:30:00,0.0,,,5.0,,,,
4,559,2022-01-18 17:55:42,0.0,,,,normal,0.2,0.0,


In [35]:
# Convert every XML export to a feature CSV in one pass.
# The exports are read from data_xml/ everywhere else in this notebook, so scan
# that directory (not the working directory) and hand the helper a path that
# includes it; a bare filename would only resolve for files sitting in the cwd.
xml_dir = 'data_xml'
for file in sorted(os.listdir(xml_dir)):  # sorted: deterministic processing order
    if file.endswith('.xml'):
        extract_all_features_to_csv(os.path.join(xml_dir, file))

In [47]:
# Preview the per-timestamp feature frame returned by extract_all_features_to_csv.
df.head()

Unnamed: 0,timestamp,patient,glucose_level,finger_stick,basal,meal_type,meal_carbs,exercise_intensity,exercise_duration,basis_heart_rate,basis_gsr,basis_skin_temperature,basis_air_temperature,basis_steps
0,2022-01-18 00:00:00,559,,,,,,,,57.0,,92.3,89.6,
1,2022-01-18 00:01:00,559,179.0,,,,,,,,0.000183,,,
2,2022-01-18 00:04:00,559,,,,,,,,,,,,0.0
3,2022-01-18 00:05:00,559,,,,,,,,57.0,,92.3,89.6,
4,2022-01-18 00:06:00,559,183.0,,,,,,,,0.000182,,,


In [48]:
# Preview the duration-event frame.
# NOTE(review): the output recorded for this cell labels the time column `date`,
# while the other preview of df_duration shows `timestamp`. This cell's execution
# count (48) precedes the helper reload (49), so its output looks stale — re-run.
df_duration.head()

Unnamed: 0,patient,date,duration_minutes,sleep,basis_sleep,work_intensity,bolus_type,bolus_dose,bolus_carb_input,temp_basal_value
0,559,2022-01-17 21:30:00,480.0,3.0,,,,,,
1,559,2022-01-18 06:26:05,0.0,,,,normal,6.3,40.0,
2,559,2022-01-18 11:10:13,0.0,,,,normal,3.3,30.0,
3,559,2022-01-18 13:30:00,0.0,,,5.0,,,,
4,559,2022-01-18 17:55:42,0.0,,,,normal,0.2,0.0,


In [53]:
# pandas is not imported by any earlier cell, so bring it into scope here;
# otherwise this cell raises NameError on a fresh Restart & Run All unless the
# wildcard import from helper_functions happens to leak a `pd` name.
import pandas as pd

# Left-join the duration events onto the per-timestamp features. Every row of
# `df` is kept; only rows whose (timestamp, patient) pair matches a duration
# event exactly receive values in the duration columns, the rest get NaN there.
# NOTE(review): assumes both frames store `timestamp` with the same dtype — confirm.
df_new = pd.merge(df, df_duration, on=['timestamp', 'patient'], how='left')

In [54]:
# Preview the merged frame (feature columns followed by the duration-event columns).
df_new.head()

Unnamed: 0,timestamp,patient,glucose_level,finger_stick,basal,meal_type,meal_carbs,exercise_intensity,exercise_duration,basis_heart_rate,...,basis_air_temperature,basis_steps,duration_minutes,sleep,basis_sleep,work_intensity,bolus_type,bolus_dose,bolus_carb_input,temp_basal_value
0,2022-01-18 00:00:00,559,,,,,,,,57.0,...,89.6,,,,,,,,,
1,2022-01-18 00:01:00,559,179.0,,,,,,,,...,,,,,,,,,,
2,2022-01-18 00:04:00,559,,,,,,,,,...,,0.0,,,,,,,,
3,2022-01-18 00:05:00,559,,,,,,,,57.0,...,89.6,,,,,,,,,
4,2022-01-18 00:06:00,559,183.0,,,,,,,,...,,,,,,,,,,


In [56]:
# Persist the training table without the index column.
# NOTE(review): this writes `df` (features only), not the merged `df_new`, so the
# duration-event columns never reach training_data.csv — confirm that is intended.
df.to_csv('training_data.csv', index=False)

In [1]:
import pandas as pd
from timeGAN import train_time_series_gan, generate_synthetic_data

# Tunable settings, named once so training and generation cannot drift apart.
TRAINING_CSV = 'training_data.csv'
SYNTHETIC_CSV = 'synthetic_patient_data.csv'
SEQ_LEN = 24       # timesteps (rows) per training / generated sequence
N_SAMPLES = 100    # number of synthetic sequences to generate
# NOTE(review): 24 rows only span 2 hours if rows are 5 minutes apart. The frame
# previewed earlier in this notebook has rows 1-3 minutes apart, so confirm
# whether timeGAN resamples before relying on that interpretation.

# Step 1: Load the training data
df = pd.read_csv(TRAINING_CSV)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Step 2: Select features to generate
# Candidate names; any that are not columns of the CSV are skipped below.
candidate_features = [
    'glucose_level', 'finger_stick', 'basal', 'temp_basal', 'bolus', 'meal',
    'sleep', 'work', 'stressors', 'hypo_event', 'illness', 'exercise',
    'basis_heart_rate', 'basis_gsr', 'basis_skin_temperature',
    'basis_air_temperature', 'basis_steps', 'basis_sleep'
]

# Keep only the candidates that exist in the DataFrame, and say which ones were
# dropped so a renamed or missing column does not disappear silently.
features = [f for f in candidate_features if f in df.columns]
missing_features = [f for f in candidate_features if f not in df.columns]
print(f"Using {len(features)} features: {features}")
if missing_features:
    print(f"Skipping {len(missing_features)} features not found in '{TRAINING_CSV}': {missing_features}")
if not features:
    # Fail here with a clear message rather than deep inside model construction.
    raise ValueError(f"None of the candidate features are columns of '{TRAINING_CSV}'")

# Step 3: Train the GAN
# NOTE(review): the recorded run of this cell failed during "[3/4] Training GAN"
# with `mat1 and mat2 shapes cannot be multiplied (1536x2 and 8x128)`.
# 1536 == batch_size * seq_len (64 * 24) and 8 is the reported input dimension,
# so a tensor carrying 2 features reached a layer sized for 8. The cause is
# presumably inside timeGAN, which is not visible from this notebook — TODO investigate.
print("\n" + "="*80)
print("TRAINING TIME SERIES GAN")
print("="*80)

gan, scaler, feature_names = train_time_series_gan(
    df,
    features=features,
    seq_len=SEQ_LEN,   # rows per training sequence
    epochs=200,        # Number of training epochs
    batch_size=64,     # Batch size
    hidden_dim=128,    # LSTM hidden dimension
    num_layers=2       # Number of LSTM layers
)

# Step 4: Generate synthetic data
print("\n" + "="*80)
print("GENERATING SYNTHETIC DATA")
print("="*80)

df_synthetic = generate_synthetic_data(
    gan,
    scaler,
    feature_names,
    n_samples=N_SAMPLES,   # Number of sequences to generate
    seq_len=SEQ_LEN        # Same length the model was trained on
)

# Step 5: Save synthetic data
df_synthetic.to_csv(SYNTHETIC_CSV, index=False)
print(f"\n✅ Generated {len(df_synthetic)} synthetic data points")
print(f"   Saved to '{SYNTHETIC_CSV}'")

Using 8 features: ['glucose_level', 'finger_stick', 'basal', 'basis_heart_rate', 'basis_gsr', 'basis_skin_temperature', 'basis_air_temperature', 'basis_steps']

TRAINING TIME SERIES GAN
TIME SERIES GAN TRAINING PIPELINE

[1/4] Preparing data...
   Dataset size: 6711
   Features: ['glucose_level', 'finger_stick', 'basal', 'basis_heart_rate', 'basis_gsr', 'basis_skin_temperature', 'basis_air_temperature', 'basis_steps']
   Sequence length: 24

[2/4] Initializing Time Series GAN...


  df_features = df_features.fillna(method='ffill', limit=10).fillna(method='bfill', limit=10).fillna(0)


   Device: cpu
   Input dimension: 8

[3/4] Training GAN...


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1536x2 and 8x128)