In [30]:
import json
import os
import logging
import numpy as np
import pandas as pd
from scipy import stats
from scipy.interpolate import CubicSpline


class Config:
    """Shared constants for the OSDB processing pipeline and the downstream model."""

    # --- Reproducibility ---
    RANDOM_SEED = 333

    # --- Sequence geometry ---
    N_TIME_STEPS = 125    # rows per sequence; main() passes this as the interpolation interval
    length_time_step = 5  # time step handed to the interpolator (seconds)
    step = 100            # NOTE(review): presumably the stride between windows - confirm against the model code
    N_FEATURES = 3        # NOTE(review): presumably the number of input features per row - confirm

    # --- Labels ---
    num_classes = 2       # seizure / no seizure
    N_CLASSES = 2         # same value as num_classes

    # --- Training hyper-parameters (not read anywhere in this section) ---
    batch_size = 128
    epochs = 10           # epoch iterations
    row_hidden = 128      # NOTE(review): hidden units, conv layers per the original comment - confirm
    col_hidden = 128      # NOTE(review): hidden units, Bi-LSTM layers per the original comment - confirm
  

In [31]:
class OsdbLabelGenerator:
    """Build a labelled, one-row-per-datapoint DataFrame from an OSDB JSON file.

    The pipeline (see ``process_data``) flattens the JSON, attaches a
    fixed-length FFT magnitude vector to every row, assigns each row a
    timestep and marks rows around ``ALARM`` datapoints with ``label = 1``.
    """

    # Number of FFT magnitude bins stored per row; shorter spectra are
    # zero-padded and longer ones are truncated.
    FFT_LENGTH = 125
    # Spacing assigned between consecutive rows by add_timestep_and_label().
    TIMESTEP_SECONDS = 5
    # Column order of the flattened frame. Passed explicitly so that an export
    # without any datapoints still yields a frame with every expected column.
    _RAW_COLUMNS = ['eventId', 'userId', 'hr', 'o2Sat', 'rawData',
                    'rawData3D', 'seizure_times', 'alarmPhrase']

    def __init__(self, file_path, sampling_rate=25):
        """
        Parameters:
        - file_path: Path to the OSDB JSON file.
        - sampling_rate: Sampling rate (Hz) used to compute FFT frequencies.
        """
        self.file_path = file_path
        self.sampling_rate = sampling_rate
        self.df_sensordata = None  # Filled by load_data()

    def load_data(self):
        """Load the JSON file and flatten it into ``self.df_sensordata``.

        One row is produced per entry of each top-level object's
        ``datapoints`` list; ``userId`` and ``seizureTimes`` of the enclosing
        object are copied onto every row. A sequential ``Id`` column is added.
        """
        with open(self.file_path, 'r') as file:
            raw_json = json.load(file)

        flattened_data = []
        for attribute in raw_json:
            user_id = attribute.get('userId', None)
            # `or []` also covers keys that are present but null in the JSON.
            seizure_times = attribute.get('seizureTimes') or []

            for point in attribute.get('datapoints') or []:
                flattened_data.append({
                    'eventId': point.get('eventId', None),
                    'userId': user_id,
                    'hr': point.get('hr', []),
                    'o2Sat': point.get('o2Sat', []),
                    'rawData': point.get('rawData') or [],
                    'rawData3D': point.get('rawData3D') or [],
                    'seizure_times': seizure_times,
                    'alarmPhrase': point.get('alarmPhrase', None),
                })

        # reset_index guarantees a 0..n-1 integer index even for an empty
        # frame, which add_timestep_and_label() relies on.
        self.df_sensordata = pd.DataFrame(
            flattened_data, columns=self._RAW_COLUMNS
        ).reset_index(drop=True)

        # Add a sequential 'Id' column
        self.df_sensordata['Id'] = range(len(self.df_sensordata))

    def calculate_fft(self, raw_data):
        """Return ``(frequencies, magnitudes)`` for the first half of the spectrum.

        The mean is subtracted first (removes the DC component). Empty or
        ``None`` input returns two empty arrays instead of raising.
        """
        samples = np.asarray([] if raw_data is None else raw_data, dtype=float)
        if samples.size == 0:
            return np.array([], dtype=float), np.array([], dtype=float)

        samples = samples - np.mean(samples)
        fft_magnitude = np.abs(np.fft.fft(samples))
        frequencies = np.fft.fftfreq(len(samples), d=1/self.sampling_rate)
        half = len(frequencies) // 2  # fftfreq lists the non-negative bins first
        return frequencies[:half], fft_magnitude[:half]

    def add_fft_column(self):
        """Add an ``FFT`` column holding exactly ``FFT_LENGTH`` magnitudes per row.

        Spectra shorter than ``FFT_LENGTH`` are zero-padded; longer ones are
        truncated (a negative pad width would make ``np.pad`` raise).
        """
        fft_results = []
        for raw_data in self.df_sensordata['rawData']:
            _, magnitude = self.calculate_fft(raw_data)
            fixed = np.zeros(self.FFT_LENGTH)
            kept = min(len(magnitude), self.FFT_LENGTH)
            fixed[:kept] = magnitude[:kept]
            fft_results.append(list(fixed))
        self.df_sensordata['FFT'] = fft_results

    def add_timestep_and_label(self):
        """Add ``timestep`` (row index * ``TIMESTEP_SECONDS``) and ``label`` (all 0)."""
        self.df_sensordata['timestep'] = self.df_sensordata.index * self.TIMESTEP_SECONDS
        self.df_sensordata['label'] = 0

    def label_alarm_events(self):
        """Set ``label = 1`` on rows around each ``ALARM`` datapoint.

        For every row whose ``alarmPhrase`` is ``'ALARM'``, each offset in that
        row's ``seizure_times`` is added to the alarm's timestep to obtain
        ``start_time``. Rows with ``start_time <= timestep < alarm_time``
        (negative offset) or ``alarm_time <= timestep <= start_time``
        (non-negative offset) are labelled 1.

        NOTE(review): offsets are assumed to be in seconds relative to the
        alarm, as the original comment states - confirm against the OSDB schema.
        """
        df = self.df_sensordata
        timesteps = df['timestep']
        # Only ALARM rows can trigger labelling, so scan those alone.
        alarms = df[df['alarmPhrase'] == 'ALARM']

        for alarm_time, seizure_times in zip(alarms['timestep'], alarms['seizure_times']):
            if not isinstance(seizure_times, (list, tuple)):
                continue  # missing / malformed offsets: nothing to label
            for seizure in seizure_times:
                start_time = alarm_time + seizure
                before = (timesteps >= start_time) & (timesteps < alarm_time)
                after = (timesteps >= alarm_time) & (timesteps <= start_time)
                df.loc[before | after, 'label'] = 1

    def process_data(self):
        """Run every stage and return the final DataFrame.

        Stages: load, FFT column, timestep/label columns, alarm labelling,
        drop ``seizure_times``, sort by ``Id`` and reset the index. An export
        with no datapoints yields an empty frame that still has all columns.
        """
        self.load_data()
        self.add_fft_column()
        self.add_timestep_and_label()
        self.label_alarm_events()

        self.df_sensordata = (
            self.df_sensordata
            .drop(columns=['seizure_times'])
            .sort_values(by='Id')
            .reset_index(drop=True)
        )
        return self.df_sensordata

In [32]:
class OsdbDataLoader:
    """Load an OSDB JSON file into an unlabelled DataFrame with an FFT column.

    The file is read immediately on construction; the result is available as
    ``self.df_sensordata``.
    """

    # Number of FFT magnitude bins stored per row.
    FFT_LENGTH = 125
    # Columns of the frame produced when the export contains no datapoints.
    _EMPTY_COLUMNS = ['eventId', 'userId', 'hr', 'rawData', 'FFT']

    def __init__(self, file_path, time_steps):
        """
        Parameters:
        - file_path: Path to the OSDB JSON file.
        - time_steps: Stored on the instance; not read by the code in this class.
        """
        self.file_path = file_path
        self.time_steps = time_steps
        self.df_sensordata = None
        self.load_and_process_data_from_json()

    def load_and_process_data_from_json(self):
        """
        Flatten the JSON into one row per datapoint and attach its FFT.

        Every ``FFT`` entry is a float ``numpy`` array of exactly
        ``FFT_LENGTH`` values: shorter spectra are zero-padded, longer ones are
        truncated. An export without datapoints produces an empty frame that
        still has the expected columns.
        """
        with open(self.file_path, 'r') as file:
            raw_json = json.load(file)

        flattened_data = []
        for attribute in raw_json:
            user_id = attribute.get('userId', None)

            # `or []` also covers keys that are present but null in the JSON.
            for point in attribute.get('datapoints') or []:
                rawData = point.get('rawData') or []

                # Uncomment the sensor data that you want to load
                flattened_data.append({
                    'eventId': point.get('eventId', None),
                    'userId': user_id,
                    'hr': point.get('hr', None),
                    #'o2Sat': point.get('o2Sat', None),
                    'rawData': rawData,
                    #'rawData3D': point.get('rawData3D', []),
                    'FFT': self.calculate_fft(rawData),
                })

        if not flattened_data:
            # pd.DataFrame([]) would have no 'FFT' column to post-process.
            self.df_sensordata = pd.DataFrame(columns=self._EMPTY_COLUMNS)
            return

        self.df_sensordata = pd.DataFrame(flattened_data)
        self.df_sensordata['FFT'] = self.df_sensordata['FFT'].apply(self._fit_fft)

    def _fit_fft(self, fft):
        """Return ``fft`` as a float array of exactly ``FFT_LENGTH`` values."""
        fixed = np.zeros(self.FFT_LENGTH)
        kept = min(len(fft), self.FFT_LENGTH)
        fixed[:kept] = fft[:kept]
        return fixed

    def calculate_fft(self, raw_data):
        """Return the FFT magnitudes of the first half of the spectrum as a list.

        The mean is subtracted first (removes the DC component). ``None`` or
        empty input returns ``[]``.
        """
        if raw_data is None or len(raw_data) == 0:
            return []

        samples = np.asarray(raw_data, dtype=float)
        samples = samples - np.mean(samples)
        fft_magnitude = np.abs(np.fft.fft(samples))
        # Keep the first half of the spectrum (non-negative frequencies).
        return fft_magnitude[:len(fft_magnitude) // 2].tolist()

In [33]:
class DataReshaper:
    """Expand a one-row-per-datapoint frame into one row per raw sample."""

    def __init__(self, dataframe):
        """
        Parameters:
        - dataframe: Frame with the columns ``Id``, ``eventId``, ``userId``,
          ``hr``, ``o2Sat``, ``rawData``, ``rawData3D``, ``FFT`` and ``label``.
        """
        self.df = dataframe

    @staticmethod
    def _fit(values, length):
        """Return ``values`` as a list of exactly ``length`` items.

        Longer input is truncated; shorter (or ``None``) input is padded with
        ``None`` so indexing can never run past the end.
        """
        items = [] if values is None else list(values)[:length]
        return items + [None] * (length - len(items))

    def reshape_data(self, samples_per_row=125):
        """
        Return a new DataFrame with ``samples_per_row`` rows per input row.

        Scalar columns (``Id``, ``eventId``, ``userId``, ``hr``, ``o2Sat``,
        ``label``) are repeated on every output row. ``rawData`` and ``FFT``
        contribute one element per output row; ``rawData3D`` is grouped into
        chunks of three consecutive values and contributes one chunk per row.
        Missing samples become ``None``.

        Parameters:
        - samples_per_row: Number of output rows generated per input row.
        """
        reshaped_rows = []

        for _, row in self.df.iterrows():
            raw_samples = self._fit(row['rawData'], samples_per_row)
            fft_values = self._fit(row['FFT'], samples_per_row)

            # Group the flat rawData3D list into chunks of three values.
            raw_3d = row['rawData3D']
            flat_3d = [] if raw_3d is None else list(raw_3d)
            triples = self._fit(
                [flat_3d[i:i + 3] for i in range(0, len(flat_3d), 3)],
                samples_per_row,
            )

            # Values shared by every output row generated from this input row.
            shared = {
                'Id': row['Id'],
                'eventId': row['eventId'],
                'userId': row['userId'],
                'hr': row['hr'],
                'o2Sat': row['o2Sat'],
            }
            label = row['label']

            for i in range(samples_per_row):
                reshaped_rows.append({
                    **shared,
                    'rawData': raw_samples[i],
                    'rawData3D': triples[i],
                    'FFT': fft_values[i],
                    'label': label,
                })

        # Create a new DataFrame from the reshaped rows
        return pd.DataFrame(reshaped_rows)
    

In [34]:
class Interpolator:
    """Smooth a step-wise column with a clamped cubic spline."""

    def __init__(self, df, column_to_interpolate):
        """
        Initialize the Interpolator class with a DataFrame and the column to interpolate.
        """
        self.df = df
        self.column_to_interpolate = column_to_interpolate

    def interpolate_column(self, new_column_name='interpolated_hr', interval=125, time_step=5):
        """
        Interpolate the configured column and store the result in a new column.

        Every ``interval``-th value of the column (rows 0, interval,
        2*interval, ...) is taken as a knot, with knot ``k`` placed at time
        ``k * time_step``. Row ``i`` is evaluated at ``i * time_step / interval``
        so each knot row reproduces its original value. Rows after the last
        knot hold the last knot's value (no extrapolation). With fewer than two
        knots the column is filled with the single knot value.

        Errors are caught and printed rather than raised.

        Parameters:
        - new_column_name: Name of the new column to store interpolated values.
        - interval: Interval to sample the original column (e.g., every 125th element).
        - time_step: Time step in seconds between two consecutive knots.

        Returns:
        - True when the column was added, False when an error was caught.
        """
        try:
            # Step 1: every `interval`-th row (positional) is a knot
            source = self.df[self.column_to_interpolate]
            knots = np.asarray(source.iloc[0::interval], dtype=float)
            n_knots = len(knots)
            n_rows = len(self.df)

            if n_knots >= 2:
                # Step 2: knot k sits at time k * time_step
                knot_times = np.arange(n_knots) * time_step

                # Step 3: row i sits at i * time_step / interval. A linspace
                # over n_knots * interval points would compress this grid and
                # shift every knot row away from its recorded value.
                row_times = np.arange(n_rows) * time_step / interval
                # Rows past the last knot are held at its time instead of
                # extrapolating the final spline segment.
                row_times = np.minimum(row_times, knot_times[-1])

                # Step 4: clamped spline (zero slope at both ends)
                spline = CubicSpline(knot_times, knots, bc_type='clamped')
                interpolated_values = spline(row_times)
            else:
                # CubicSpline needs at least two knots; a single knot can only
                # be held constant (an empty frame yields an empty column).
                fill_value = knots[0] if n_knots else np.nan
                interpolated_values = np.full(n_rows, fill_value)

            # Step 5: add the interpolated values to the DataFrame
            self.df[new_column_name] = interpolated_values

            # Step 6: rearrange columns so that 'label' is always last
            columns = list(self.df.columns)
            if 'label' in columns:
                columns.remove('label')
                columns.append('label')
            self.df = self.df[columns]

            print(f"Interpolation completed. New column '{new_column_name}' added to the DataFrame.")
            return True
        except Exception as e:
            print("An error occurred during interpolation:", e)
            return False

    def get_dataframe(self):
        """
        Return the updated DataFrame with interpolated values.
        """
        return self.df


In [35]:
# Configure logging for Jupyter Notebook
def log_message(level, message):
    """Write one line of the form ``<level>: <message>`` to standard output."""
    line = "{}: {}".format(level, message)
    print(line)

# Main function
def main(file_path):
    """Run the OSDB pipeline: load and label, reshape, interpolate ``hr``.

    Progress and failures are reported through ``log_message``; the first
    five rows of the result are printed.

    Parameters:
    - file_path: Path to the OSDB JSON file.

    Returns:
    - The final DataFrame on success, ``None`` when any stage fails.
    """
    interpolated_column = "interpolated_hr"
    try:
        # Step 1: Load labeled data
        log_message("INFO", "1. Connecting to Dataset")
        if not os.path.exists(file_path):
            log_message("ERROR", f"File not found: {file_path}")
            return None

        processor = OsdbLabelGenerator(file_path)
        df_result = processor.process_data()
        if df_result.empty:
            log_message("ERROR", "Loaded dataset is empty.")
            return None
        log_message("INFO", "Successfully connected to and processed dataset.")

        # Step 2: Reshape the flattened DataFrame
        log_message("INFO", "2. Reshaping Dataset")
        reshaper = DataReshaper(df_result)
        reshaped_df = reshaper.reshape_data()
        if reshaped_df.empty:
            log_message("ERROR", "Reshaping failed. DataFrame is empty after reshaping.")
            return None
        log_message("INFO", "Successfully reshaped dataset.")

        # Step 3: Interpolate missing data
        log_message("INFO", "3. Interpolating Dataset")
        interpolator = Interpolator(reshaped_df, column_to_interpolate="hr")
        interpolator.interpolate_column(
            new_column_name=interpolated_column,
            interval=Config.N_TIME_STEPS,
            time_step=Config.length_time_step,
        )
        dataset_df = interpolator.get_dataframe()
        if dataset_df.empty:
            log_message("ERROR", "Interpolation failed. DataFrame is empty after processing.")
            return None
        # The interpolator prints its own errors instead of raising, so a
        # missing output column is the only reliable sign that it failed.
        if interpolated_column not in dataset_df.columns:
            log_message("ERROR", f"Interpolation failed. Column '{interpolated_column}' was not added.")
            return None
        log_message("INFO", "Successfully interpolated dataset.")

        # Step 4: Display final DataFrame
        log_message("INFO", "4. Generated DataFrame")
        print(dataset_df.head(5))
        log_message("INFO", "5. Tasks Complete")
        return dataset_df

    except Exception as e:
        log_message("ERROR", f"An error occurred: {e}")
        return None

# Path to the OSDB JSON file, relative to the notebook's working directory
file_path = "../Data/osdb_3min_allSeizures.json"
# Run the full pipeline; progress messages and the first rows are printed below
main(file_path)


INFO: 1. Connecting to Dataset
INFO: Successfully connected to and processed dataset.
INFO: 2. Reshaping Dataset
INFO: Successfully reshaped dataset.
INFO: 3. Interpolating Dataset
Interpolation completed. New column 'interpolated_hr' added to the DataFrame.
INFO: Successfully interpolated dataset.
INFO: 4. Generated DataFrame
   Id  eventId  userId  hr  o2Sat  rawData rawData3D           FFT  \
0   0      407      39  67     -1   1496.0      None  1.296030e-11   
1   0      407      39  67     -1   1480.0      None  1.430513e+02   
2   0      407      39  67     -1   1500.0      None  5.417937e+01   
3   0      407      39  67     -1   1492.0      None  1.404751e+02   
4   0      407      39  67     -1   1496.0      None  1.208051e+02   

   interpolated_hr  label  
0        67.000000      0  
1        66.999973      0  
2        66.999893      0  
3        66.999761      0  
4        66.999579      0  
INFO: 5. Tasks Complete


## 📁 **OSDB Data Processor Component** 

The **OSDB Data Processor** component is designed to process raw JSON data from the **OpenSeizure Database (OSDB)**. The component includes several modular classes located in the `OsdbDataProcessor` folder, which form a data processing pipeline. The pipeline loads raw **JSON**, reshapes the data for timeseries analysis, runs an interpolator and outputs a dataframe that can be integrated with the **AMBER Model**. 

### 📂 **OsdbDataProcessor Overview**

- **`Config`**: Defines the configuration parameters required throughout the data pipeline (e.g., batch size, time steps, and sampling rate).
- **`OsdbDataLoader`**: Connects to the **OSDB** and reads raw **JSON** data to generate an unlabelled dataframe of sensor data: acceleration (1D vector magnitude and its fast Fourier transform, **FFT**), heart rate and SpO2.
- **`OsdbLabelGenerator`**:  Connects to the **OSDB** and reads raw **JSON** data to generate a labelled dataframe using the **seizureTimes** attribute
- **`OsdbDataReshaper`**: Reshapes the generated dataframe into a time series format for input into the **AMBER** model.
- **`Interpolator`**: Interpolates duplicate **hr** values, applying cubic spline interpolation to ensure smooth and accurate data continuity between timesteps t and t+1.


---

### **`How to Run the Data Processor (main.py)`** 🔍
#### 1. **Import OSDB Data Processing Classes**:
``` python
from config import Config
from OsdbDataProcessor.osdb_data_label_generator import OsdbDataLabelGenerator
from OsdbDataProcessor.osdb_data_reshaper import OsdbDataReshaper
from OsdbDataProcessor.osdb_interpolator import OsdbInterpolator
from OsdbDataProcessor.osdb_data_loader import OsdbDataLoader
```
#### 2. **Set path to the OSDB JSON file**:
``` python
file_path = 'Data/osdb_3min_allSeizures.json'  # Replace with your JSON file path
```

#### 3. **Load the OSDB data label generator and pass the file path**:
``` python
osdb_processor = OsdbDataLabelGenerator(file_path)
df_result = osdb_processor.process_data()
```

#### 4. **Reshape the Dataframe for Timeseries Analysis**:
``` python
data_reshaper = OsdbDataReshaper(df_result)
reshaped_df = data_reshaper.reshape_data()
```

#### 5. **Initialise the Interpolator and interpolate the 'hr' column**:
``` python
interpolator = OsdbInterpolator(reshaped_df, column_to_interpolate="hr")
interpolator.interpolate_column(new_column_name="interpolated_hr", interval=Config.N_TIME_STEPS, time_step=Config.length_time_step)
# Retrieve the updated DataFrame
dataset_df = interpolator.get_dataframe()
print(dataset_df.sample())  # print a sample row
```
---

2. **Open the project root in an integrated terminal and run `main.py`**:
```bash
cd path/AMBER/AMBER
python main.py
```

