In [14]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

def load_activities(file_path):
    """Load and parse TrainActivities.csv."""
    try:
        # Read CSV with specified columns
        df = pd.read_csv(file_path, parse_dates=['Started', 'Finished', 'Updated'])
        return df
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return None

def analyze_activities(df):
    """Analyze activity distribution and durations."""
    if df is None:
        print("No data to analyze.")
        return
    
    # Activity distribution by type
    activity_counts = df['Activity Type'].value_counts()
    print("\nActivity Distribution:")
    print(activity_counts)
    
    # Activity distribution by subject
    subject_activity_counts = df.groupby('Subject')['Activity Type'].value_counts().unstack().fillna(0)
    print("\nActivity Counts by Subject:")
    print(subject_activity_counts)
    
    # Calculate duration of each activity
    df['Duration'] = (df['Finished'] - df['Started']).dt.total_seconds()
    duration_stats = df.groupby('Activity Type')['Duration'].agg(['mean', 'min', 'max'])
    print("\nActivity Duration Stats (seconds):")
    print(duration_stats)
    
    # Plot activity distribution
    plt.figure(figsize=(10, 6))
    activity_counts.plot(kind='bar')
    plt.title('Activity Type Distribution')
    plt.xlabel('Activity Type')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('activity_distribution.png')
    plt.close()


    # File path for TrainActivities.csv
file_path = r'TrainingDataPD25\TrainingDataPD25\TrainActivities.csv'

# Load and analyze data
df1 = load_activities(file_path)
analyze_activities(activities_df)



Activity Distribution:
Activity Type
1 (FACING camera) Sit and stand                                     48
2 (FACING camera) both hands SHAKING (sitting position)             43
3 Stand up from chair - both hands with SHAKING                     40
4 (Sideway) Sit & stand                                             36
5 (Sideway) both hands SHAKING (sitting)                            36
6 (Sideway) STAND up with - both hands SHAKING                      34
10 Slow walk (SHAKING hands/body, tiny step, head forward)          34
8 Walk (LEFT --> Right --> Left)                                    30
9 Walk & STOP/frozen, full body shaking, rotate then return back    30
7 Cool down - sitting/relax                                         11
Name: count, dtype: int64

Activity Counts by Subject:
Activity Type  1 (FACING camera) Sit and stand  \
Subject                                          
U1                                           5   
U2                                           4 

In [7]:
import os
import pandas as pd

try:
    activities_df = pd.read_csv(r'TrainingDataPD25\TrainingDataPD25\TrainActivities.csv')

    # Parse as JST (UTC+09:00)
    activities_df['Started'] = pd.to_datetime(
        activities_df['Started'],
        format='mixed',
        dayfirst=False
    ).dt.tz_localize('Asia/Tokyo')

    activities_df['Finished'] = pd.to_datetime(
        activities_df['Finished'],
        format='mixed',
        dayfirst=False
    ).dt.tz_localize('Asia/Tokyo')

    activities_df['Updated'] = pd.to_datetime(
        activities_df['Updated'],
        format='mixed',
        dayfirst=False
    ).dt.tz_localize('Asia/Tokyo')

    # Convert to UTC for consistent comparison
    for col in ['Started', 'Finished', 'Updated']:
        activities_df[col] = activities_df[col].dt.tz_convert('UTC')

    # ——— Your additional logic starts here ———

    # 1) Drop rows where Started is missing:
    activities_df = activities_df.dropna(subset=['Started'])

    # 2) If Started exists but Finished is missing, replace Finished with Updated:
    mask_no_finished = activities_df['Finished'].isna()
    activities_df.loc[mask_no_finished, 'Finished'] = activities_df.loc[mask_no_finished, 'Updated']

    # 3) Compute duration and drop rows where duration < 1 second:
    durations = activities_df['Finished'] - activities_df['Started']
    activities_df = activities_df[durations >= pd.Timedelta(seconds=1)]

    # ——— End of additional logic ———

    print("First few rows of activities_df (in UTC):")
    print(activities_df[['Started', 'Finished', 'Updated']].head())

except Exception as e:
    print(f"Error loading TrainActivities.csv: {e}")
    exit()


def get_file_time_range(file_path):
    """Load sensor data and calculate time range with consistent timezone."""
    try:
        df = pd.read_csv(file_path, names=['user_id', 'timestamp', 'x', 'y', 'z'])
        # Parse timestamps, preserving timezone (e.g., +0100), and convert to UTC
        df['timestamp'] = pd.to_datetime(df['timestamp'], format='mixed', dayfirst=False, errors='coerce').dt.tz_convert('UTC')
        df = df.dropna(subset=['timestamp'])
        min_time = df['timestamp'].min()
        max_time = df['timestamp'].max()
        return min_time, max_time, df
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None, None, None

# Base directory containing user subdirectories
base_dir = r'C:\projs\tmc\TrainingDataPD25\TrainingDataPD25\users_timeXYZ\users'

# Process each directory and file
labeled_data = []
for directory in os.listdir(base_dir):
    dir_path = os.path.join(base_dir, directory)
    if os.path.isdir(dir_path):
        for file in os.listdir(dir_path):
            if file.endswith('.csv'):
                file_path = os.path.join(dir_path, file)
                min_time, max_time, sensor_df = get_file_time_range(file_path)
                
                # Find overlapping activities
                overlapping_activities = activities_df[
                    (activities_df['Started'] <= max_time) & 
                    (activities_df['Finished'] >= min_time)
                ]
                
                # If there's at least one overlapping activity
                if not overlapping_activities.empty:
                    # Process each timestamp in the sensor data
                    for _, row in sensor_df.iterrows():
                        timestamp = row['timestamp']
                        # Find the specific activity this timestamp belongs to
                        matching_activity = overlapping_activities[
                            (overlapping_activities['Started'] <= timestamp) & 
                            (overlapping_activities['Finished'] >= timestamp)
                        ]
                        if not matching_activity.empty:
                            # Take the first matching activity (assuming no overlap within subject)
                            activity = matching_activity.iloc[0]
                            labeled_data.append({
                                'directory': directory,
                                'subject': activity['Subject'],
                                'timestamp': timestamp,
                                'x': row['x'],
                                'y': row['y'],
                                'z': row['z'],
                                'activity_type': activity['Activity Type']
                            })

# Convert to DataFrame
labeled_df = pd.DataFrame(labeled_data)

# Save or display the labeled data
labeled_df.to_csv('labeled_sensor_data.csv', index=False)

First few rows of activities_df (in UTC):
                     Started                  Finished  \
9  2024-09-03 10:56:00+00:00 2024-09-03 10:57:00+00:00   
14 2024-09-03 11:00:00+00:00 2024-09-03 11:01:00+00:00   
15 2024-09-03 11:01:00+00:00 2024-09-03 11:02:00+00:00   
19 2024-09-03 11:02:00+00:00 2024-09-03 11:03:00+00:00   
21 2024-09-03 11:03:00+00:00 2024-09-03 11:04:00+00:00   

                     Updated  
9  2024-09-03 10:57:00+00:00  
14 2024-09-03 11:01:00+00:00  
15 2024-09-03 11:02:00+00:00  
19 2024-09-03 11:03:00+00:00  
21 2024-09-03 11:04:00+00:00  


In [9]:
labeled_df

Unnamed: 0,directory,subject,timestamp,x,y,z,activity_type
0,2804,U22,2024-09-03 10:56:00.067000+00:00,-0.646,4.516,8.696,2 (FACING camera) both hands SHAKING (sitting ...
1,2804,U22,2024-09-03 10:56:00.202000+00:00,-0.682,4.463,8.680,2 (FACING camera) both hands SHAKING (sitting ...
2,2804,U22,2024-09-03 10:56:00.335000+00:00,-0.730,4.458,8.703,2 (FACING camera) both hands SHAKING (sitting ...
3,2804,U22,2024-09-03 10:56:00.470000+00:00,-0.696,4.457,8.655,2 (FACING camera) both hands SHAKING (sitting ...
4,2804,U22,2024-09-03 10:56:00.599000+00:00,-0.680,4.463,8.792,2 (FACING camera) both hands SHAKING (sitting ...
...,...,...,...,...,...,...,...
102927,2830,U1,2024-09-03 12:31:59.410000+00:00,4.654,-8.499,1.128,"10 Slow walk (SHAKING hands/body, tiny step, h..."
102928,2830,U1,2024-09-03 12:31:59.539000+00:00,5.971,-9.194,1.704,"10 Slow walk (SHAKING hands/body, tiny step, h..."
102929,2830,U1,2024-09-03 12:31:59.673000+00:00,4.621,-8.630,1.372,"10 Slow walk (SHAKING hands/body, tiny step, h..."
102930,2830,U1,2024-09-03 12:31:59.806000+00:00,3.639,-7.472,2.469,"10 Slow walk (SHAKING hands/body, tiny step, h..."


### Not Useful Files (around 500 CSVs) 
### {38, 1716, 2803, 2804, 2805, 2816, 2819, 2830}
