In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
# Step 1: Load and Parse Activity Labels
print("Loading and parsing activity labels...")
# Step 1: Load and Parse Activity Labels
print("Loading and parsing activity labels...")
activities_df = pd.read_csv(r'C:\projs\tmc\TrainingDataPD25\TrainingDataPD25\TrainActivities.csv')

Loading and parsing activity labels...
Loading and parsing activity labels...


### dateTime-transform

In [2]:
date_columns = ['Started', 'Finished', 'Updated']

# Convert while keeping NaNs intact
activities_df[date_columns] = activities_df[date_columns].apply(
    pd.to_datetime, format='%Y/%m/%d %H:%M', errors='coerce'
)

# Check result
print(activities_df[date_columns].dtypes)
activities_df.head()


Started     datetime64[ns]
Finished    datetime64[ns]
Updated     datetime64[ns]
dtype: object


Unnamed: 0,ID,Activity Type ID,Activity Type,Started,Finished,Updated,Subject
0,1130251,2806,1 (FACING camera) Sit and stand,2024-09-02 06:16:00,2024-09-02 06:16:00,2024-09-02 06:16:00,U22
1,1130254,2807,2 (FACING camera) both hands SHAKING (sitting ...,2024-09-02 06:17:00,2024-09-02 06:17:00,2024-09-02 06:17:00,U22
2,1130257,2807,2 (FACING camera) both hands SHAKING (sitting ...,2024-09-02 06:18:00,2024-09-02 06:18:00,2024-09-02 06:18:00,U22
3,1130261,2806,1 (FACING camera) Sit and stand,2024-09-02 06:20:00,2024-09-02 06:20:00,2024-09-02 06:20:00,U22
4,1130292,2806,1 (FACING camera) Sit and stand,2024-09-02 06:42:00,2024-09-02 06:42:00,2024-09-02 06:42:00,U2


### Checking for NaNs

In [3]:
# List the columns you want to check
columns_to_check = activities_df.columns  # change this list as needed

# Check for NaN/NaT counts in each of the specified columns
nan_counts = activities_df[columns_to_check].isna().sum()

# Display the result
print("NaN/NaT counts per column:")
print(nan_counts)


NaN/NaT counts per column:
ID                   0
Activity Type ID     0
Activity Type        0
Started             66
Finished            67
Updated              0
Subject              0
dtype: int64


### Making activity classes

In [5]:
activities_df['activity_class'] = activities_df['Activity Type'].str.extract(r'(\d+)').astype(int)

print(f"Loaded {len(activities_df)} activities for subjects: {activities_df['Subject'].unique()}")

Loaded 342 activities for subjects: ['U22' 'U2' 'U1' 'U21' 'U4' 'U5' 'U3' 'U6' 'U7']


In [11]:
import os
import glob
sensor_files = glob.glob(r'C:/projs/tmc/TrainingDataPD25/TrainingDataPD25/users_timeXYZ/users/*/*.csv', recursive=True)
print(f"Found {len(sensor_files)} sensor data files.")

Found 3530 sensor data files.


In [13]:
# Create metadata for sensor files (min/max timestamps)
metadata = []
for file in sensor_files:
    # Load only the timestamp column (index 1)
    df = pd.read_csv(file, header=None, parse_dates=[1])
    min_time = df[1].min()
    max_time = df[1].max()
    subdir = os.path.basename(os.path.dirname(file))
    metadata.append({'file': file, 'subdir': subdir, 'min_time': min_time, 'max_time': max_time})

metadata_df = pd.DataFrame(metadata)
print(f"Created metadata for {len(metadata_df)} sensor files across {metadata_df['subdir'].nunique()} subdirectories.")

  df = pd.read_csv(file, header=None, parse_dates=[1])
  df = pd.read_csv(file, header=None, parse_dates=[1])
  df = pd.read_csv(file, header=None, parse_dates=[1])


Created metadata for 3530 sensor files across 18 subdirectories.


In [None]:
metadata_df

Unnamed: 0,ID,Activity Type ID,Activity Type,Started,Finished,Updated,Subject,activity_class
0,1130251,2806,1 (FACING camera) Sit and stand,2024-09-02 06:16:00,2024-09-02 06:16:00,2024-09-02 06:16:00,U22,1
1,1130254,2807,2 (FACING camera) both hands SHAKING (sitting ...,2024-09-02 06:17:00,2024-09-02 06:17:00,2024-09-02 06:17:00,U22,2
2,1130257,2807,2 (FACING camera) both hands SHAKING (sitting ...,2024-09-02 06:18:00,2024-09-02 06:18:00,2024-09-02 06:18:00,U22,2
3,1130261,2806,1 (FACING camera) Sit and stand,2024-09-02 06:20:00,2024-09-02 06:20:00,2024-09-02 06:20:00,U22,1
4,1130292,2806,1 (FACING camera) Sit and stand,2024-09-02 06:42:00,2024-09-02 06:42:00,2024-09-02 06:42:00,U2,1
...,...,...,...,...,...,...,...,...
337,1164181,2815,"10 Slow walk (SHAKING hands/body, tiny step, h...",NaT,NaT,2024-09-11 04:55:00,U7,10
338,1164187,2806,1 (FACING camera) Sit and stand,2024-09-11 04:58:00,2024-09-11 04:58:00,2024-09-11 04:58:00,U7,1
339,1164217,2815,"10 Slow walk (SHAKING hands/body, tiny step, h...",2024-09-11 05:09:00,2024-09-11 05:10:00,2024-09-11 05:10:00,U7,10
340,1164218,2815,"10 Slow walk (SHAKING hands/body, tiny step, h...",2024-09-11 05:10:00,2024-09-11 05:10:00,2024-09-11 05:10:00,U7,10


In [17]:
subject_times = activities_df.groupby('Subject').agg({'Started': 'min', 'Finished': 'max'})
subdir_times = metadata_df.groupby('subdir').agg({'min_time': 'min', 'max_time': 'max'})

subdir_to_subject = {}
for subdir, row in subdir_times.iterrows():
    subdir_min = row['min_time']
    subdir_max = row['max_time']
    overlapping_subjects = subject_times[
        (subject_times['Started'] <= subdir_max) & 
        (subject_times['Finished'] >= subdir_min)
    ].index
    if len(overlapping_subjects) == 1:
        subdir_to_subject[subdir] = overlapping_subjects[0]
    else:
        print(f"Warning: Ambiguity for subdir {subdir} with subjects {overlapping_subjects}")

TypeError: Invalid comparison between dtype=datetime64[ns] and Timestamp

In [None]:
import os
import glob
import pandas as pd

# Root directory where the Activity Type ID folders are stored
root_dir = r'C:\projs\tmc\TrainingDataPD25\TrainingDataPD25\users_timeXYZ\users'

# List to hold all dataframes
all_csv_data = []

# Loop through each folder in the root directory
for activity_folder in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, activity_folder)
    
    if os.path.isdir(folder_path):
        # Find all CSV files inside the activity folder
        csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
        
        for file in csv_files:
            try:
                df = pd.read_csv(file)
                df['Activity_Type_ID'] = activity_folder  # Tag with folder name
                df['Source_File'] = os.path.basename(file)  # Optionally track file name
                all_csv_data.append(df)
            except Exception as e:
                print(f"❌ Error reading {file}: {e}")

# Combine all into a single DataFrame
combined_df = pd.concat(all_csv_data, ignore_index=True)

# Preview
print(combined_df.head())


In [11]:
def get_time_range(subdir):
    """Get the min and max timestamps from all CSV files in a subdirectory."""
    csv_files = [f for f in os.listdir(os.path.join(data_dir, subdir)) if f.endswith('.csv')]
    
    min_ts, max_ts = None, None
    for csv_file in csv_files:
        file_path = os.path.join(data_dir, subdir, csv_file)
        df = pd.read_csv(file_path, usecols=[1], header=None)
        df.columns = ['Timestamp']
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='mixed', dayfirst=True)

        # Convert to UTC
        df['Timestamp'] = df['Timestamp'].dt.tz_localize('UTC') if df['Timestamp'].dt.tz is None else df['Timestamp'].dt.tz_convert('UTC')

        current_min = df['Timestamp'].min()
        current_max = df['Timestamp'].max()
        min_ts = current_min if min_ts is None or current_min < min_ts else min_ts
        max_ts = current_max if max_ts is None or current_max > max_ts else max_ts
    return min_ts, max_ts

# Compute time ranges for each subdirectory
subdir_time_ranges = {subdir: get_time_range(subdir) for subdir in subdirs}

def intervals_overlap(subdir_min, subdir_max, intervals):
    """Check if subdirectory time range overlaps with any activity interval."""
    
    # Localize subdir timestamps if needed
    if subdir_min.tzinfo is None:
        subdir_min = subdir_min.tz_localize('UTC')
    else:
        subdir_min = subdir_min.tz_convert('UTC')

    if subdir_max.tzinfo is None:
        subdir_max = subdir_max.tz_localize('UTC')
    else:
        subdir_max = subdir_max.tz_convert('UTC')

    for start, end in intervals:
        # Localize each interval if needed
        if start.tzinfo is None:
            start = start.tz_localize('UTC')
        else:
            start = start.tz_convert('UTC')

        if end.tzinfo is None:
            end = end.tz_localize('UTC')
        else:
            end = end.tz_convert('UTC')

        if subdir_min <= end and subdir_max >= start:
            return True

    return False


# Map subdirectories to subjects based on time range overlap
subdir_to_subject = {}
for subdir, (subdir_min, subdir_max) in subdir_time_ranges.items():
    matching_subjects = [
        subject for subject, group in subject_activities
        if intervals_overlap(subdir_min, subdir_max, list(zip(group['Started'], group['Finished'])))
    ]
    if len(matching_subjects) == 1:
        subdir_to_subject[subdir] = matching_subjects[0]
    else:
        print(f"Warning: Subdirectory {subdir} matches {len(matching_subjects)} subjects.")

print(f"Mapped {len(subdir_to_subject)} subdirectories to subjects.")

# Step 3: Load and Label Accelerometer Data
all_labeled_data = []
for subdir, subject in subdir_to_subject.items():
    csv_files = [f for f in os.listdir(os.path.join(data_dir, subdir)) if f.endswith('.csv')]
    data_list = []
    for csv_file in csv_files:
        file_path = os.path.join(data_dir, subdir, csv_file)
        # Assuming columns: random (0), timestamp (1), x (2), y (3), z (4)
        df = pd.read_csv(file_path, usecols=[1, 2, 3, 4], header=None)
        df.columns = ['Timestamp', 'x', 'y', 'z']
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        data_list.append(df)
    subdir_data = pd.concat(data_list, ignore_index=True)
    subdir_data['Subject'] = subject
    
    # Label data points with activity labels
    activities = subject_activities.get_group(subject)
    for _, activity in activities.iterrows():
        start = activity['Started']
        end = activity['Finished']
        activity_label = activity['Activity_Label']
        mask = (subdir_data['Timestamp'] >= start) & (subdir_data['Timestamp'] <= end)
        subdir_data.loc[mask, 'Activity'] = activity_label
    
    # Keep only labeled data
    labeled_data = subdir_data.dropna(subset=['Activity'])
    all_labeled_data.append(labeled_data)
    print(f"Processed data for subject {subject} from subdirectory {subdir}.")

all_data = pd.concat(all_labeled_data, ignore_index=True)
print(f"Total labeled data points: {len(all_data)}")

Mapped 1 subdirectories to subjects.
Processed data for subject U7 from subdirectory 2812.
Total labeled data points: 0


In [15]:
subdir_to_subject

{'2812': 'U7'}

In [13]:
# Step 4: Segment Data into Windows
def segment_into_windows(df, window_size, stride):
    """Segment data into fixed-size windows with specified stride."""
    df = df.sort_values(by=['Subject', 'Timestamp'])
    df['activity_change'] = df['Activity'].ne(df['Activity'].shift()).cumsum()
    grouped = df.groupby(['Subject', 'activity_change'])
    windows = []
    labels = []
    for _, group in grouped:
        group = group.sort_values('Timestamp')
        num_samples = len(group)
        for start in range(0, num_samples - window_size + 1, stride):
            end = start + window_size
            window = group.iloc[start:end]
            if len(window) == window_size:
                windows.append(window[['x', 'y', 'z']].values)
                labels.append(window['Activity'].iloc[0] - 1)  # Adjust to 0-based indexing
    return np.array(windows), np.array(labels)

# Define training and validation subjects
train_subjects = ['U1', 'U2', 'U3', 'U4', 'U5', 'U6', 'U7']
val_subjects = ['U21', 'U22']

train_data = all_data[all_data['Subject'].isin(train_subjects)]
val_data = all_data[all_data['Subject'].isin(val_subjects)]

window_size = 100  # e.g., 2 seconds at 50Hz
stride = 50       # e.g., 1-second overlap
print("Segmenting data into windows...")
train_windows, train_labels = segment_into_windows(train_data, window_size, stride)
val_windows, val_labels = segment_into_windows(val_data, window_size, stride)
print(f"Training windows: {len(train_windows)}, Validation windows: {len(val_windows)}")

# Step 5: Build and Train the CNN Model
model = Sequential([
    Conv1D(32, kernel_size=5, activation='relu', input_shape=(window_size, 3)),
    MaxPooling1D(pool_size=2),
    Conv1D(64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(10, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

print("Training the model...")
model.fit(
    train_windows, train_labels,
    epochs=50,
    batch_size=32,
    validation_data=(val_windows, val_labels),
    verbose=1
)

print("Model training completed.")

Segmenting data into windows...
Training windows: 0, Validation windows: 0


NameError: name 'Sequential' is not defined