In [2]:
import os
import pandas as pd
import re

## Data Loader Class

In [3]:
import os
import pandas as pd
import re

class CasasDataLoader:
    """
    Handles loading and initial inspection of the CASAS-Kyoto dataset.

    Attributes:
        data_path (str): Directory scanned for files named like 'p01.t1.csv'.
        raw_df (pandas.DataFrame | None): Consolidated events; stays None until
            load_and_consolidate() succeeds.
    """
    def __init__(self, data_path):
        """
        Initializes the DataLoader with the path to the dataset.

        Args:
            data_path (str): The path to the 'adl_noerror' directory.

        Raises:
            FileNotFoundError: If data_path is not an existing directory.
        """
        if not os.path.isdir(data_path):
            raise FileNotFoundError(f"Data path not found: {data_path}")
        self.data_path = data_path
        self.raw_df = None


    def load_and_consolidate(self):
        """
        Loads all CSV files from the data path, consolidates them into a
        single DataFrame, and extracts participant and task IDs.

        Only files whose *entire* name matches 'p<digits>.t<digits>.csv' are
        read; anything else in the directory (backups, editor temp files, ...)
        is ignored. Files that cannot be parsed are reported and skipped.

        Returns:
            pandas.DataFrame | None: Frame with columns 'Timestamp', 'SensorID',
            'SensorValue', 'participant_id' and 0-based 'task_id', or None when
            the directory is empty or no file could be loaded.
        """
        print("--- Loading and Consolidating Data ---")
        all_data = []
        # Regex to extract pXX and tX from filenames like 'p01.t1.csv'
        file_pattern = re.compile(r'p(\d+)\.t(\d+)\.csv')

        files_to_load = sorted(os.listdir(self.data_path))
        if not files_to_load:
            print("No files found to load.")
            return

        for filename in files_to_load:
            # fullmatch (not match) so that e.g. 'p01.t1.csv.bak' is not picked up
            match = file_pattern.fullmatch(filename)
            if match:
                participant_id = int(match.group(1))
                task_id = int(match.group(2))

                file_path = os.path.join(self.data_path, filename)
                try:

                    df = pd.read_csv(file_path, sep=',')
                    df['participant_id'] = participant_id
                    # File names number tasks from 1; store them 0-based
                    df['task_id'] = task_id-1
                    all_data.append(df)
                except Exception as e:
                    print(f"Could not read {filename}: {e}")

        if not all_data:
            print("No data was loaded. Check file formats or naming convention.")
            return

        self.raw_df = pd.concat(all_data, ignore_index=True)
        # Correction: Combine date and time, and assign correct column names.
        # Positional rename: raises ValueError unless the frame has exactly six columns.
        self.raw_df.columns = ['date', 'time', 'sensor', 'value', 'participant_id', 'task_id']
        # Correction 2: Use format='mixed' to handle inconsistent microsecond precision.
        self.raw_df['timestamp'] = pd.to_datetime(self.raw_df['date'] + ' ' + self.raw_df['time'], format='mixed')

        # Reorder and drop original date/time columns
        self.raw_df = self.raw_df[['timestamp', 'sensor', 'value', 'participant_id', 'task_id']]

        print(f"Successfully loaded {len(all_data)} files.")
        print(f"Total sensor events loaded: {len(self.raw_df)}")
        print("\nConsolidated DataFrame head:")
        print(self.raw_df.head())
        print("\nConsolidated DataFrame tail:")
        print(self.raw_df.tail())
        print("\n")
        # Rename columns to be consistent with the previous logic
        self.raw_df.columns = ['Timestamp', 'SensorID', 'SensorValue', 'participant_id', 'task_id']


        return self.raw_df


In [6]:
# Path to the dataset
DATA_PATH = 'Datasets/adl_noerror'

data_loader = CasasDataLoader(DATA_PATH)
df = data_loader.load_and_consolidate()

if df is not None:
    # Write the CSV only on success: load_and_consolidate() returns None when
    # nothing could be loaded, and calling .to_csv on None would raise.
    df.to_csv('combined_dataset.csv', index=False)
    print("\nCombined Dataset Created Successfully!")
else:
    print("\nCombined Dataset formation failed!")

--- Loading and Consolidating Data ---
Successfully loaded 120 files.
Total sensor events loaded: 6425

Consolidated DataFrame head:
                   timestamp sensor value  participant_id  task_id
0 2008-02-27 12:43:27.416392    M08    ON               1        0
1 2008-02-27 12:43:27.848100    M07    ON               1        0
2 2008-02-27 12:43:28.487061    M09    ON               1        0
3 2008-02-27 12:43:29.222889    M14    ON               1        0
4 2008-02-27 12:43:29.499828    M23   OFF               1        0

Consolidated DataFrame tail:
                      timestamp sensor      value  participant_id  task_id
6420 2008-05-21 16:47:02.139875    M17         ON              51        4
6421 2008-05-21 16:47:04.533100  AD1-B   0.316997              51        4
6422 2008-05-21 16:47:04.533100  AD1-C   0.175393              51        4
6423 2008-05-21 16:47:06.322697    M18         ON              51        4
6424 2008-05-21 16:47:07.849500  AD1-B  0.0651572           

In [7]:
# Read back the CSV written by the loader cell to inspect the saved result.
# No parse_dates is passed, so 'Timestamp' comes back as plain strings here.
df = pd.read_csv('combined_dataset.csv')
df

Unnamed: 0,Timestamp,SensorID,SensorValue,participant_id,task_id
0,2008-02-27 12:43:27.416392,M08,ON,1,0
1,2008-02-27 12:43:27.848100,M07,ON,1,0
2,2008-02-27 12:43:28.487061,M09,ON,1,0
3,2008-02-27 12:43:29.222889,M14,ON,1,0
4,2008-02-27 12:43:29.499828,M23,OFF,1,0
...,...,...,...,...,...
6420,2008-05-21 16:47:02.139875,M17,ON,51,4
6421,2008-05-21 16:47:04.533100,AD1-B,0.316997,51,4
6422,2008-05-21 16:47:04.533100,AD1-C,0.175393,51,4
6423,2008-05-21 16:47:06.322697,M18,ON,51,4


## Data Processor Class

In [8]:
import pandas as pd

class DataProcessor:
    """
    Handles feature engineering: encoding sensor IDs/values and calculating time deltas.

    Operates on a private copy of the input frame, which is expected to provide
    the columns 'Timestamp', 'SensorID', 'SensorValue', 'participant_id' and
    'task_id'. Each public step updates and returns self.df; process() runs all
    steps in the required order.

    Attributes:
        df (pandas.DataFrame): Working copy that accumulates the engineered columns.
        sensor_id_map (dict | None): SensorID -> integer code, set by encode_sensor_id().
    """
    def __init__(self, df):
        """
        Args:
            df (pandas.DataFrame): Consolidated sensor events. It is copied, so
                the caller's frame is never modified.
        """
        self.df = df.copy()
        self.sensor_id_map = None

    def _ensure_datetime(self):
        """Coerces 'Timestamp' to datetime; unparseable values become NaT."""
        self.df['Timestamp'] = pd.to_datetime(self.df['Timestamp'], errors='coerce')

    def encode_sensor_id(self):
        """
        Encodes SensorID as integers and stores the mapping.

        Codes follow the sorted order of the distinct non-null sensor IDs.
        Missing sensor IDs are encoded as -1, matching the convention of the
        other encoders in this class.
        """
        # dropna(): sorted() cannot compare missing values with strings
        unique_sensors = sorted(self.df['SensorID'].dropna().unique())

        self.sensor_id_map = {sid: idx for idx, sid in enumerate(unique_sensors)}
        self.df['SensorID_enc'] = self.df['SensorID'].map(self.sensor_id_map).fillna(-1).astype(int)

        return self.df

    def calculate_time_deltas(self):
        """
        Calculates time difference (in seconds) between consecutive events for each participant-task group.

        Sorts self.df by participant, task and timestamp as a side effect. The
        first event of every group gets a delta of 0.
        """
        # Ensure Timestamp is datetime
        self._ensure_datetime()

        self.df = self.df.sort_values(['participant_id', 'task_id', 'Timestamp'])

        grouped = self.df.groupby(['participant_id', 'task_id'])['Timestamp']
        time_diff = grouped.diff()
        self.df['TimeDelta'] = time_diff.dt.total_seconds()

        # diff() leaves NaN on the first row of each group
        self.df['TimeDelta'] = self.df['TimeDelta'].fillna(0)
        return self.df

    def add_time_features(self):
        """
        Adds time-based features: time_since_start, event_count, total_duration, progress_ratio.
        Grouped by participant_id and task_id.
        """
        # Ensure Timestamp is datetime
        self._ensure_datetime()
        self.df = self.df.sort_values(['participant_id', 'task_id', 'Timestamp'])
        group_cols = ['participant_id', 'task_id']
        # time_since_start
        self.df['time_since_start'] = self.df.groupby(group_cols)['Timestamp'].transform(lambda x: (x - x.iloc[0]).dt.total_seconds())
        # event_count
        self.df['event_count'] = self.df.groupby(group_cols).cumcount()
        # total_duration
        self.df['total_duration'] = self.df.groupby(group_cols)['time_since_start'].transform('max')
        # progress_ratio (a zero duration is replaced by 1 to avoid division by zero)
        self.df['progress_ratio'] = self.df['time_since_start'] / self.df['total_duration'].replace(0, 1)
        return self.df

    def add_time_of_day(self):
        """
        Adds a categorical feature 'TimeOfDay' (Morning, Afternoon, Evening, Night) based on the hour of the Timestamp.

        Also adds the helper column 'Hour'. Timestamps are coerced to datetime
        first, so the method also works on a frame freshly read from CSV.
        """
        def get_time_of_day(hour):
            if 5 <= hour < 12:
                return 'Morning'
            elif 12 <= hour < 17:
                return 'Afternoon'
            elif 17 <= hour < 21:
                return 'Evening'
            else:
                # Also reached for a NaN hour (NaT timestamp): all comparisons are False
                return 'Night'
        # Same coercion as the other time-based steps; without it .dt fails on strings
        self._ensure_datetime()
        self.df['Hour'] = self.df['Timestamp'].dt.hour
        self.df['TimeOfDay'] = self.df['Hour'].apply(get_time_of_day)
        return self.df

    def add_sensor_type(self):
        """
        Adds a feature 'SensorType' based on the prefix of SensorID.

        Non-string or unrecognised sensor IDs are labelled 'unknown'.
        """
        def sensor_type_from_id(sensor_id):
            if isinstance(sensor_id, str):
                if sensor_id.startswith('M'):
                    return 'motion'
                elif sensor_id.startswith('I'):
                    return 'item_use'
                elif sensor_id.startswith('D'):
                    return 'door'
                elif sensor_id.startswith('AD1-A') or sensor_id.startswith('AD1-B'):
                    return 'water'
                elif sensor_id.startswith('AD1-C'):
                    return 'burner'
                elif '*' in sensor_id:
                    return 'phone_use'
            return 'unknown'
        self.df['SensorType'] = self.df['SensorID'].apply(sensor_type_from_id)
        return self.df

    def create_differentiated_sensor_values(self):
        """
        Creates three separate sensor value columns based on the type of sensor.
        - sensor_value_binary: For ON/OFF, OPEN/CLOSE states.
        - sensor_value_continuous: For numeric values from sensors like water and burner.
        - sensor_value_phone: For events occurring during a phone call (between 'start' and 'end').

        Requires 'SensorType'; it is derived on the fly when the column is missing.
        """
        print("\nCreating differentiated sensor value columns...")

        # The masks below depend on SensorType; derive it if the caller skipped that step
        if 'SensorType' not in self.df.columns:
            self.add_sensor_type()

        # Initialize columns with default 0
        self.df['sensor_value_binary'] = 0
        self.df['sensor_value_continuous'] = 0.0
        self.df['sensor_value_phone'] = 0

        # 1. Binary values (ON/OFF, OPEN/CLOSE)
        binary_map = {'ON': 1, 'OFF': 0, 'OPEN': 1, 'CLOSE': 0}
        binary_mask = self.df['SensorValue'].isin(binary_map.keys())
        self.df.loc[binary_mask, 'sensor_value_binary'] = self.df.loc[binary_mask, 'SensorValue'].map(binary_map)
        print("- Created 'sensor_value_binary' for ON/OFF, OPEN/CLOSE states.")

        # 2. Continuous values (water, burner); non-numeric readings become 0
        continuous_mask = self.df['SensorType'].isin(['water', 'burner'])
        self.df.loc[continuous_mask, 'sensor_value_continuous'] = pd.to_numeric(self.df.loc[continuous_mask, 'SensorValue'], errors='coerce').fillna(0)
        print("- Created 'sensor_value_continuous' for water and burner sensors.")

        # 3. Phone values (events between 'start' and 'end' inclusive) - Robust implementation
        print("- Creating 'sensor_value_phone' for events between phone call start/end.")
        group_cols = ['participant_id', 'task_id']

        # Ensure SensorValue is string and handle case/whitespace
        sensor_values_lower = self.df['SensorValue'].astype(str).str.lower().str.strip()
        phone_mask = self.df['SensorType'] == 'phone_use'

        # Use temporary columns for clarity and robustness
        self.df['__start_marker'] = (phone_mask & (sensor_values_lower == 'start')).astype(int)
        self.df['__end_marker'] = (phone_mask & (sensor_values_lower == 'end')).astype(int)

        # Calculate cumulative starts and ends within each activity group
        self.df['__starts_cumsum'] = self.df.groupby(group_cols)['__start_marker'].cumsum()
        self.df['__ends_cumsum'] = self.df.groupby(group_cols)['__end_marker'].cumsum()

        # To include the 'end' event in the active state, we check if the call was active *before* this event.
        # We do this by looking at the cumulative end count from the *previous* event.
        self.df['__ends_cumsum_shifted'] = self.df.groupby(group_cols)['__ends_cumsum'].shift(1).fillna(0)

        # A phone call is active if the number of starts is greater than the number of previously completed calls.
        is_in_call = (self.df['__starts_cumsum'] > self.df['__ends_cumsum_shifted'])

        # Assign the final boolean series (as int) to the target column
        self.df['sensor_value_phone'] = is_in_call.astype(int)

        # Drop temporary columns
        self.df = self.df.drop(columns=['__start_marker', '__end_marker', '__starts_cumsum', '__ends_cumsum', '__ends_cumsum_shifted'])

        return self.df

    def encode_time_of_day(self):
        """
        Encodes 'TimeOfDay' into numerical format.

        Labels outside the known four categories are encoded as -1.
        """
        time_of_day_map = {'Morning': 0, 'Afternoon': 1, 'Evening': 2, 'Night': 3}
        self.df['TimeOfDay_enc'] = self.df['TimeOfDay'].map(time_of_day_map).fillna(-1).astype(int)
        return self.df

    def encode_sensor_type(self):
        """
        Encodes 'SensorType' into numerical format.

        Codes follow the sorted order of the sensor types present in the data.
        """
        unique_types = sorted([t for t in self.df['SensorType'].unique() if t is not None])
        sensor_type_map = {stype: idx for idx, stype in enumerate(unique_types)}
        self.df['SensorType_enc'] = self.df['SensorType'].map(sensor_type_map).fillna(-1).astype(int)
        return self.df

    def normalize_features(self):
        """
        Normalizes continuous numerical features using Min-Max scaling within each participant-task group.

        The listed columns are overwritten in place with their scaled values.
        """
        cols_to_normalize = [
            'TimeDelta',
            'event_count',
            'sensor_value_continuous',
        ]
        print("\nNormalizing the following columns using Min-Max scaling (0-1 range) per activity:")
        for col in cols_to_normalize:
            print(f"- {col}")

        group_cols = ['participant_id', 'task_id']

        for col in cols_to_normalize:
            grouped = self.df.groupby(group_cols)[col]
            min_val = grouped.transform('min')
            max_val = grouped.transform('max')
            range_val = max_val - min_val

            # Apply normalization, replacing original column
            # (a zero range is replaced by 1 so constant groups scale to 0 instead of NaN)
            self.df[col] = (self.df[col] - min_val) / range_val.replace(0, 1)
            self.df[col] = self.df[col].fillna(0)  # For single-event groups where range is 0

        return self.df

    def process(self):
        """
        Runs all feature engineering steps and returns the processed DataFrame.

        Returns:
            pandas.DataFrame: The selected encoded/engineered columns only;
            intermediate columns remain available on self.df.
        """
        self.encode_sensor_id()
        self.calculate_time_deltas()
        self.add_time_features()
        self.add_time_of_day()
        self.add_sensor_type()
        self.create_differentiated_sensor_values()
        self.encode_time_of_day()
        self.encode_sensor_type()
        self.normalize_features()

        print("\nRemoving redundant features: 'time_since_start', 'total_duration'")
        # Only keep the encoded and relevant columns
        processed = self.df[['Timestamp', 'SensorID_enc', 'TimeDelta',
                             'event_count', 'progress_ratio',
                             'participant_id', 'task_id', 'TimeOfDay_enc', 'SensorType_enc',
                             'sensor_value_binary', 'sensor_value_continuous','sensor_value_phone']]
        return processed


class LabelMapper:
    """
    Translates the 0-based task_id of each row into its activity name.
    """
    def __init__(self):
        # List position doubles as the 0-based task_id.
        activity_names = [
            'Make a phone call',
            'Wash hands',
            'Cook',
            'Eat',
            'Clean',
        ]
        self.activity_map = dict(enumerate(activity_names))

    def map_labels(self, df):
        """
        Returns a copy of df with an extra string column 'ActivityLabel';
        the input frame itself is left unchanged.
        """
        labels = df['task_id'].map(self.activity_map)
        return df.assign(ActivityLabel=labels)

In [9]:
# Reload the consolidated events, engineer the features, attach activity labels
# and persist the result for the modelling cells.
df = pd.read_csv('combined_dataset.csv')
if df is not None:
    processor = DataProcessor(df)
    processed_df = processor.process()

    label_mapper = LabelMapper()
    labeled_df = label_mapper.map_labels(processed_df)

    # Save inside the success branch: labeled_df only exists when processing ran,
    # so writing it after the if/else would raise NameError on the failure path.
    labeled_df.to_csv('combined_dataset_II.csv',index=False)
else:
    print("Data loading failed. Cannot proceed.")


Creating differentiated sensor value columns...
- Created 'sensor_value_binary' for ON/OFF, OPEN/CLOSE states.
- Created 'sensor_value_continuous' for water and burner sensors.
- Creating 'sensor_value_phone' for events between phone call start/end.

Normalizing the following columns using Min-Max scaling (0-1 range) per activity:
- TimeDelta
- event_count
- sensor_value_continuous

Removing redundant features: 'time_since_start', 'total_duration'


In [11]:
df2 = pd.read_csv('combined_dataset_II.csv')
df2.head()

Unnamed: 0,Timestamp,SensorID_enc,TimeDelta,event_count,progress_ratio,participant_id,task_id,TimeOfDay_enc,SensorType_enc,sensor_value_binary,sensor_value_continuous,sensor_value_phone,ActivityLabel
0,2008-02-27 12:43:27.416392,15,0.0,0.0,0.0,1,0,1,3,1,0.0,0,Make a phone call
1,2008-02-27 12:43:27.848100,14,0.01596,0.014085,0.001211,1,0,1,3,1,0.0,0,Make a phone call
2,2008-02-27 12:43:28.487061,16,0.023622,0.028169,0.003003,1,0,1,3,1,0.0,0,Make a phone call
3,2008-02-27 12:43:29.222889,18,0.027203,0.042254,0.005067,1,0,1,3,1,0.0,0,Make a phone call
4,2008-02-27 12:43:29.499828,23,0.010238,0.056338,0.005844,1,0,1,3,0,0.0,0,Make a phone call


# Cascade Ensemble Bi-LSTM with Attention Mechanism

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Concatenate, Layer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

# ----------------------- Attention Layer -----------------------
class Attention(Layer):
    """
    Attention pooling layer that collapses axis 1 of its input.

    A score is computed per position along axis 1, the scores are
    softmax-normalised over that axis, and the input is summed along it using
    those weights. The bias is sized from input_shape[1], so the layer is tied
    to the length of axis 1 it was built with.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Creates the projection weight (last_dim, 1) and the per-position bias (axis1_len, 1)."""
        feature_dim = input_shape[-1]
        n_positions = input_shape[1]
        self.W = self.add_weight(
            name='att_weight',
            shape=(feature_dim, 1),
            initializer='random_normal',
            trainable=True,
        )
        self.b = self.add_weight(
            name='att_bias',
            shape=(n_positions, 1),
            initializer='zeros',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        """Returns the attention-weighted sum of x over axis 1."""
        K = tf.keras.backend
        scores = K.tanh(K.dot(x, self.W) + self.b)
        weights = K.softmax(scores, axis=1)
        weighted = x * weights
        return K.sum(weighted, axis=1)

# ---------------------- Load and Sequence Data ----------------------

# Same relative path the feature-engineering cell writes to, so the notebook
# does not depend on an absolute, environment-specific directory.
PROCESSED_CSV = 'combined_dataset_II.csv'
SEQUENCE_LENGTH = 50
STEP = 5

df = pd.read_csv(PROCESSED_CSV)
# NOTE(review): SequenceGenerator must be defined before this cell runs; its
# definition is not visible in this part of the notebook — confirm it exists.
seq_gen = SequenceGenerator(df, sequence_length=SEQUENCE_LENGTH, step=STEP)
X, y = seq_gen.generate_sequences()

# Five activities, matching the five entries of LabelMapper.activity_map
num_classes = 5
y_cat = to_categorical(y, num_classes)

# ---------------------- Build Model ----------------------
def build_cascade_ensemble_lstm_attention(input_shape, num_classes, n_cascade=3, lstm_units=64, dense_units=32, dropout_rate=0.3):
    """
    Builds the cascaded LSTM classifier with attention pooling.

    Each stage applies an LSTM (returning the full sequence) followed by
    dropout. Between stages, the last time step of the stage output is repeated
    input_shape[0] times and concatenated with the original inputs to form the
    next stage's input. The final stage output is pooled by Attention and passed
    through a ReLU dense layer, a fixed 0.2 dropout and a softmax output layer.

    Args:
        input_shape (tuple): Per-sample shape; input_shape[0] is used as the repeat count.
        num_classes (int): Number of units in the softmax output layer.
        n_cascade (int): Number of LSTM stages.
        lstm_units (int): Units per LSTM layer.
        dense_units (int): Units of the dense layer after attention.
        dropout_rate (float): Dropout rate applied after every LSTM stage.

    Returns:
        tensorflow.keras.Model: The uncompiled model.
    """
    timesteps = input_shape[0]
    inputs = Input(shape=input_shape)

    features = inputs
    for stage in range(1, n_cascade + 1):
        features = LSTM(lstm_units, return_sequences=True, name=f'lstm_{stage}')(features)
        features = Dropout(dropout_rate, name=f'dropout_{stage}')(features)
        if stage < n_cascade:
            # Feed the stage summary (last time step) back alongside the raw inputs
            tiled_summary = tf.keras.layers.RepeatVector(timesteps)(features[:, -1, :])
            features = Concatenate()([inputs, tiled_summary])

    pooled = Attention()(features)
    hidden = Dense(dense_units, activation='relu')(pooled)
    hidden = Dropout(0.2)(hidden)
    probabilities = Dense(num_classes, activation='softmax')(hidden)
    return Model(inputs=inputs, outputs=probabilities)

# ---------------------- K-Fold Training ----------------------
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Early stopping is off by default (every fold trains the full 50 epochs);
# flip the flag to enable it instead of editing the fit() call.
USE_EARLY_STOPPING = False

# NOTE(review): STEP < SEQUENCE_LENGTH suggests overlapping windows; with shuffled
# folds, overlapping windows could land in both train and test — confirm this is acceptable.
fold_accuracies = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    print(f"\n===== Fold {fold} =====")

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y_cat[train_idx], y_cat[test_idx]
    y_test_labels = y[test_idx]

    # A fresh model per fold so no weights leak between folds
    model = build_cascade_ensemble_lstm_attention((SEQUENCE_LENGTH, X.shape[2]), num_classes)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    callbacks = []
    if USE_EARLY_STOPPING:
        callbacks.append(EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True))

    model.fit(
        X_train, y_train,
        validation_split=0.2,
        epochs=50,
        batch_size=32,
        callbacks=callbacks,
        verbose=2
    )

    # Evaluation
    loss, acc = model.evaluate(X_test, y_test, verbose=0)
    fold_accuracies.append(acc)
    print(f"Test Accuracy: {acc:.3f}")

    y_pred = np.argmax(model.predict(X_test), axis=1)
    print("\nClassification Report:")
    print(classification_report(y_test_labels, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test_labels, y_pred))

# Cross-validation summary over all folds
print(f"\nMean Test Accuracy over {len(fold_accuracies)} folds: "
      f"{np.mean(fold_accuracies):.3f} (std {np.std(fold_accuracies):.3f})")



===== Fold 1 =====
Epoch 1/50
7/7 - 4s - 604ms/step - accuracy: 0.4923 - loss: 1.2460 - val_accuracy: 0.6531 - val_loss: 0.8418
Epoch 2/50
7/7 - 0s - 33ms/step - accuracy: 0.5795 - loss: 1.0240 - val_accuracy: 0.8163 - val_loss: 0.7108
Epoch 3/50
7/7 - 0s - 33ms/step - accuracy: 0.6923 - loss: 0.9204 - val_accuracy: 0.7143 - val_loss: 0.6899
Epoch 4/50
7/7 - 0s - 34ms/step - accuracy: 0.6667 - loss: 0.9087 - val_accuracy: 0.6939 - val_loss: 0.6514
Epoch 5/50
7/7 - 0s - 37ms/step - accuracy: 0.6821 - loss: 0.8757 - val_accuracy: 0.6531 - val_loss: 0.7172
Epoch 6/50
7/7 - 0s - 39ms/step - accuracy: 0.7333 - loss: 0.7520 - val_accuracy: 0.6531 - val_loss: 0.6900
Epoch 7/50
7/7 - 0s - 39ms/step - accuracy: 0.7641 - loss: 0.7310 - val_accuracy: 0.6735 - val_loss: 0.7095
Epoch 8/50
7/7 - 0s - 26ms/step - accuracy: 0.7897 - loss: 0.6324 - val_accuracy: 0.7755 - val_loss: 0.5017
Epoch 9/50
7/7 - 0s - 26ms/step - accuracy: 0.7795 - loss: 0.6054 - val_accuracy: 0.7143 - val_loss: 0.7187
Epoch 1

# Transformer Architecture