In [21]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

# Step 1: Load and Parse Activity Labels
print("Loading and parsing activity labels...")
activities_df = pd.read_csv(r'C:\projs\tmc\TrainingDataPD25\TrainingDataPD25\TrainActivities.csv')

# Both interval endpoints are parsed with the same day-first, mixed-format rule.
for time_col in ('Started', 'Finished'):
    activities_df[time_col] = pd.to_datetime(
        activities_df[time_col], format='mixed', dayfirst=True
    )

# The numeric class id is the run of digits at the start of 'Activity Type'.
activities_df['Activity_Label'] = (
    activities_df['Activity Type']
    .str.extract(r'^(\d+)')
    .astype(int)
)

# Group activities by subject
subject_activities = activities_df.groupby('Subject')
print(f"Found activities for {len(subject_activities)} subjects.")

# Step 2: Map Subdirectories to Subjects
data_dir = r'C:\projs\tmc\TrainingDataPD25\TrainingDataPD25\users_timeXYZ\users'
# Keep only directory entries; loose files next to the per-recording folders are ignored.
subdirs = [
    entry
    for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry))
]
print(f"Found {len(subdirs)} subdirectories with accelerometer data.")

Loading and parsing activity labels...
Found activities for 9 subjects.
Found 18 subdirectories with accelerometer data.


In [None]:
def get_time_range(subdir, base_dir=None):
    """Return the earliest and latest timestamp across a subdirectory's CSV files.

    Parameters
    ----------
    subdir : str
        Name of the subdirectory to scan.
    base_dir : str, optional
        Directory that contains ``subdir``. When omitted, the notebook-level
        ``data_dir`` is used, which matches the original behaviour.

    Returns
    -------
    tuple
        ``(min_ts, max_ts)`` as pandas Timestamps, or ``(None, None)`` when the
        subdirectory holds no CSV file with at least one timestamp.
    """
    root = data_dir if base_dir is None else base_dir
    folder = os.path.join(root, subdir)

    min_ts, max_ts = None, None
    for csv_file in os.listdir(folder):
        if not csv_file.endswith('.csv'):
            continue
        try:
            # Assuming timestamp is the second column (index 1)
            raw = pd.read_csv(os.path.join(folder, csv_file), usecols=[1], header=None)
        except pd.errors.EmptyDataError:
            # A zero-byte file has nothing to contribute to the range.
            continue
        raw.columns = ['Timestamp']

        # Drop blank cells before parsing so a file without timestamps is skipped
        # instead of yielding NaT.
        stamps = raw['Timestamp'].dropna()
        if stamps.empty:
            continue
        stamps = pd.to_datetime(stamps, format='mixed', dayfirst=True)

        current_min, current_max = stamps.min(), stamps.max()
        if pd.isna(current_min) or pd.isna(current_max):
            # Comparisons against NaT are always False, so letting NaT into the
            # running range would freeze it there for every later file.
            continue
        if min_ts is None or current_min < min_ts:
            min_ts = current_min
        if max_ts is None or current_max > max_ts:
            max_ts = current_max
    return min_ts, max_ts

# Compute time ranges for each subdirectory
# Maps subdirectory name -> (earliest timestamp, latest timestamp).
subdir_time_ranges = {}
for subdir in subdirs:
    subdir_time_ranges[subdir] = get_time_range(subdir)

def intervals_overlap(subdir_min, subdir_max, intervals):
    """Tell whether [subdir_min, subdir_max] intersects at least one interval.

    ``intervals`` is an iterable of ``(start, end)`` pairs. Both ends are
    treated as inclusive, so ranges that merely touch count as overlapping.
    Returns False when ``intervals`` is empty.
    """
    return any(
        subdir_min <= end and subdir_max >= start
        for start, end in intervals
    )

# Map subdirectories to subjects based on time range overlap
# Build every subject's (start, end) interval list once instead of once per subdirectory.
subject_intervals = {
    subject: list(zip(group['Started'], group['Finished']))
    for subject, group in subject_activities
}

subdir_to_subject = {}
for subdir, (subdir_min, subdir_max) in subdir_time_ranges.items():
    # A subdirectory without usable timestamps has a None/NaT range; comparing that
    # against activity intervals would raise (None) or silently match nothing (NaT).
    if pd.isna(subdir_min) or pd.isna(subdir_max):
        print(f"Warning: Subdirectory {subdir} has no usable timestamps; skipping.")
        continue

    matching_subjects = [
        subject for subject, intervals in subject_intervals.items()
        if intervals_overlap(subdir_min, subdir_max, intervals)
    ]
    # Only an unambiguous match is accepted; zero or several candidates are reported.
    if len(matching_subjects) == 1:
        subdir_to_subject[subdir] = matching_subjects[0]
    else:
        print(f"Warning: Subdirectory {subdir} matches {len(matching_subjects)} subjects.")
print(f"Mapped {len(subdir_to_subject)} subdirectories to subjects.")

# Step 3: Load and Label Accelerometer Data
all_labeled_data = []
for subdir, subject in subdir_to_subject.items():
    subdir_path = os.path.join(data_dir, subdir)
    csv_files = [f for f in os.listdir(subdir_path) if f.endswith('.csv')]
    data_list = []
    for csv_file in csv_files:
        file_path = os.path.join(subdir_path, csv_file)
        # Assuming columns: random (0), timestamp (1), x (2), y (3), z (4)
        df = pd.read_csv(file_path, usecols=[1, 2, 3, 4], header=None)
        df.columns = ['Timestamp', 'x', 'y', 'z']
        # Parse with the same rule used for the activity intervals and in
        # get_time_range; a bare to_datetime can read day-first dates as
        # month-first (or reject mixed formats), which misaligns the labels.
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='mixed', dayfirst=True)
        data_list.append(df)
    subdir_data = pd.concat(data_list, ignore_index=True)
    subdir_data['Subject'] = subject
    # Start with every sample unlabeled so the column exists before any interval is applied.
    subdir_data['Activity'] = np.nan

    # Label data points with activity labels
    activities = subject_activities.get_group(subject)
    for start, end, activity_label in zip(
        activities['Started'], activities['Finished'], activities['Activity_Label']
    ):
        # Both interval ends are inclusive.
        mask = (subdir_data['Timestamp'] >= start) & (subdir_data['Timestamp'] <= end)
        subdir_data.loc[mask, 'Activity'] = activity_label

    # Keep only labeled data
    labeled_data = subdir_data.dropna(subset=['Activity'])
    all_labeled_data.append(labeled_data)
    print(f"Processed data for subject {subject} from subdirectory {subdir}.")

if not all_labeled_data:
    # pd.concat would raise a less helpful "No objects to concatenate" here.
    raise ValueError("No subdirectory was mapped to a subject; there is no data to label.")

all_data = pd.concat(all_labeled_data, ignore_index=True)
print(f"Total labeled data points: {len(all_data)}")



In [None]:
# Step 4: Segment Data into Windows
def segment_into_windows(df, window_size, stride):
    """Cut the accelerometer stream into fixed-size windows that never mix labels.

    Rows are ordered by subject and timestamp, then split into runs of
    consecutive samples that share a subject and an activity label. Each run
    is sliced into windows of ``window_size`` samples, advancing ``stride``
    samples at a time; runs shorter than ``window_size`` yield nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Subject', 'Timestamp', 'x', 'y', 'z' and
        'Activity'. The frame is not modified.
    window_size : int
        Number of samples per window; must be positive.
    stride : int
        Number of samples between the starts of successive windows; must be positive.

    Returns
    -------
    tuple of numpy.ndarray
        ``(windows, labels)``: ``windows`` has shape
        ``(n_windows, window_size, 3)`` and ``labels`` has shape
        ``(n_windows,)`` with integer, 0-based class indices. Both keep these
        shapes when no window could be produced.

    Raises
    ------
    ValueError
        If ``window_size`` or ``stride`` is not positive.
    """
    if window_size <= 0 or stride <= 0:
        raise ValueError("window_size and stride must be positive integers")

    # NOTE(review): runs are split on label changes only; two intervals of the same
    # activity separated by a time gap end up in one run — confirm this is acceptable.
    ordered = df.sort_values(by=['Subject', 'Timestamp'])
    ordered = ordered.assign(
        activity_change=ordered['Activity'].ne(ordered['Activity'].shift()).cumsum()
    )

    windows = []
    labels = []
    for _, segment in ordered.groupby(['Subject', 'activity_change']):
        first_label = segment['Activity'].iloc[0]
        # Too-short runs and unlabeled (NaN) runs cannot produce a valid training window.
        if len(segment) < window_size or pd.isna(first_label):
            continue
        samples = segment[['x', 'y', 'z']].to_numpy()
        label = int(first_label) - 1  # Adjust to 0-based indexing
        for start in range(0, len(samples) - window_size + 1, stride):
            windows.append(samples[start:start + window_size])
            labels.append(label)

    if not windows:
        # Keep the trailing dimensions so downstream code sees a consistent shape.
        return np.empty((0, window_size, 3)), np.empty((0,), dtype=np.int64)
    return np.array(windows), np.array(labels, dtype=np.int64)

# Define training and validation subjects
train_subjects = ['U1', 'U2', 'U3', 'U4', 'U5', 'U6', 'U7']
val_subjects = ['U21', 'U22']

# Split the labeled samples by subject so no subject appears in both sets.
train_data = all_data.loc[all_data['Subject'].isin(train_subjects)]
val_data = all_data.loc[all_data['Subject'].isin(val_subjects)]

# Windowing parameters, expressed in samples.
window_size = 100  # e.g., 2 seconds at 50Hz
stride = 50        # e.g., 1-second overlap

print("Segmenting data into windows...")
train_windows, train_labels = segment_into_windows(
    train_data, window_size, stride
)
val_windows, val_labels = segment_into_windows(
    val_data, window_size, stride
)
print(f"Training windows: {len(train_windows)}, Validation windows: {len(val_windows)}")

# Step 5: Build and Train the CNN Model
# Two Conv1D/MaxPooling1D stages feed a dense classifier with a 10-way softmax output.
model = Sequential()
model.add(Conv1D(32, kernel_size=5, activation='relu', input_shape=(window_size, 3)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(64, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(10, activation='softmax'))

# Labels are class indices rather than one-hot vectors, hence the sparse loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

print("Training the model...")
model.fit(
    train_windows,
    train_labels,
    validation_data=(val_windows, val_labels),
    epochs=50,
    batch_size=32,
    verbose=1,
)

print("Model training completed.")