In [14]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

def load_activities(file_path):
    """Load and parse TrainActivities.csv."""
    try:
        # Read CSV with specified columns
        df = pd.read_csv(file_path, parse_dates=['Started', 'Finished', 'Updated'])
        return df
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return None

def analyze_activities(df):
    """Analyze activity distribution and durations."""
    if df is None:
        print("No data to analyze.")
        return
    
    # Activity distribution by type
    activity_counts = df['Activity Type'].value_counts()
    print("\nActivity Distribution:")
    print(activity_counts)
    
    # Activity distribution by subject
    subject_activity_counts = df.groupby('Subject')['Activity Type'].value_counts().unstack().fillna(0)
    print("\nActivity Counts by Subject:")
    print(subject_activity_counts)
    
    # Calculate duration of each activity
    df['Duration'] = (df['Finished'] - df['Started']).dt.total_seconds()
    duration_stats = df.groupby('Activity Type')['Duration'].agg(['mean', 'min', 'max'])
    print("\nActivity Duration Stats (seconds):")
    print(duration_stats)
    
    # Plot activity distribution
    plt.figure(figsize=(10, 6))
    activity_counts.plot(kind='bar')
    plt.title('Activity Type Distribution')
    plt.xlabel('Activity Type')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('activity_distribution.png')
    plt.close()


    # File path for TrainActivities.csv
file_path = r'TrainingDataPD25\TrainingDataPD25\TrainActivities.csv'

# Load and analyze data
df1 = load_activities(file_path)
analyze_activities(activities_df)



Activity Distribution:
Activity Type
1 (FACING camera) Sit and stand                                     48
2 (FACING camera) both hands SHAKING (sitting position)             43
3 Stand up from chair - both hands with SHAKING                     40
4 (Sideway) Sit & stand                                             36
5 (Sideway) both hands SHAKING (sitting)                            36
6 (Sideway) STAND up with - both hands SHAKING                      34
10 Slow walk (SHAKING hands/body, tiny step, head forward)          34
8 Walk (LEFT --> Right --> Left)                                    30
9 Walk & STOP/frozen, full body shaking, rotate then return back    30
7 Cool down - sitting/relax                                         11
Name: count, dtype: int64

Activity Counts by Subject:
Activity Type  1 (FACING camera) Sit and stand  \
Subject                                          
U1                                           5   
U2                                           4 

In [36]:
import os
import pandas as pd

# Load and parse TrainActivities.csv
activities_df = pd.read_csv(r'TrainingDataPD25\TrainingDataPD25\TrainActivities.csv')
    # Use format='mixed' to handle varying date formats
activities_df['Started'] = pd.to_datetime(activities_df['Started'], format='mixed', dayfirst=False).dt.tz_localize('Europe/London')
activities_df['Finished'] = pd.to_datetime(activities_df['Finished'], format='mixed', dayfirst=False).dt.tz_localize('Europe/London')
activities_df['Updated'] = pd.to_datetime(activities_df['Updated'], format='mixed', dayfirst=False).dt.tz_localize('Europe/London')

# Verify the parsing
print("First few rows of activities_df:")
print(activities_df[['Started', 'Finished', 'Updated']].head())
# Function to get time range of a sensor file
def get_file_time_range(file_path):
    df = pd.read_csv(file_path, names=['user_id', 'timestamp', 'x', 'y', 'z'])
    # Explicitly convert timestamp to datetime, preserving timezone
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='mixed', dayfirst=False, errors='coerce').dt.tz_convert('Europe/London')
    
    # Drop any rows with invalid timestamps
    # df = df.dropna(subset=['timestamp'])
    
    # Calculate min and max times
    min_time = df['timestamp'].min()
    max_time = df['timestamp'].max()
    
    return min_time, max_time, df

# Base directory containing user subdirectories
base_dir = r'C:\projs\tmc\TrainingDataPD25\TrainingDataPD25\users_timeXYZ\users'

# Process each directory and file
labeled_data = []
for directory in os.listdir(base_dir):
    dir_path = os.path.join(base_dir, directory)
    if os.path.isdir(dir_path):
        for file in os.listdir(dir_path):
            if file.endswith('.csv'):
                file_path = os.path.join(dir_path, file)
                min_time, max_time, sensor_df = get_file_time_range(file_path)
                
                # Find overlapping activities
                overlapping_activities = activities_df[
                    (activities_df['Started'] <= max_time) & 
                    (activities_df['Finished'] >= min_time)
                ]
                
                # If there's at least one overlapping activity
                if not overlapping_activities.empty:
                    # Process each timestamp in the sensor data
                    for _, row in sensor_df.iterrows():
                        timestamp = row['timestamp']
                        # Find the specific activity this timestamp belongs to
                        matching_activity = overlapping_activities[
                            (overlapping_activities['Started'] <= timestamp) & 
                            (overlapping_activities['Finished'] >= timestamp)
                        ]
                        if not matching_activity.empty:
                            # Take the first matching activity (assuming no overlap within subject)
                            activity = matching_activity.iloc[0]
                            labeled_data.append({
                                'directory': directory,
                                'subject': activity['Subject'],
                                'timestamp': timestamp,
                                'x': row['x'],
                                'y': row['y'],
                                'z': row['z'],
                                'activity_type': activity['Activity Type']
                            })

# Convert to DataFrame
labeled_df = pd.DataFrame(labeled_data)

# Save or display the labeled data
print(labeled_df.head())
labeled_df.to_csv('labeled_sensor_data.csv', index=False)

First few rows of activities_df:
                    Started                  Finished  \
0 2024-09-02 06:16:00+01:00 2024-09-02 06:16:00+01:00   
1 2024-09-02 06:17:00+01:00 2024-09-02 06:17:00+01:00   
2 2024-09-02 06:18:00+01:00 2024-09-02 06:18:00+01:00   
3 2024-09-02 06:20:00+01:00 2024-09-02 06:20:00+01:00   
4 2024-09-02 06:42:00+01:00 2024-09-02 06:42:00+01:00   

                    Updated  
0 2024-09-02 06:16:00+01:00  
1 2024-09-02 06:17:00+01:00  
2 2024-09-02 06:18:00+01:00  
3 2024-09-02 06:20:00+01:00  
4 2024-09-02 06:42:00+01:00  
  directory subject                        timestamp      x      y      z  \
0      2807      U3 2024-09-05 21:25:00.002000+01:00  1.896 -0.558  9.938   
1      2807      U3 2024-09-05 21:25:00.162000+01:00  1.903 -0.536  9.938   
2      2807      U3 2024-09-05 21:25:00.325000+01:00  1.884 -0.575  9.936   
3      2807      U3 2024-09-05 21:25:00.481000+01:00  1.894 -0.551  9.924   
4      2807      U3 2024-09-05 21:25:00.642000+01:00  1.877

In [25]:
d1=pd.read_csv(r'C:\projs\tmc\TrainingDataPD25\TrainingDataPD25\users_timeXYZ\users\38\user-acc_38_2024-09-08T23_31_01.510+0100_97016.csv',header=None)

In [None]:
d1[1][1]

str

In [32]:
activities_df

Unnamed: 0,ID,Activity Type ID,Activity Type,Started,Finished,Updated,Subject,Duration
0,1130251,2806,1 (FACING camera) Sit and stand,2024-09-02 06:16:00+01:00,2024-09-02 06:16:00+01:00,2024-09-02 06:16:00+01:00,U22,0.0
1,1130254,2807,2 (FACING camera) both hands SHAKING (sitting ...,2024-09-02 06:17:00+01:00,2024-09-02 06:17:00+01:00,2024-09-02 06:17:00+01:00,U22,0.0
2,1130257,2807,2 (FACING camera) both hands SHAKING (sitting ...,2024-09-02 06:18:00+01:00,2024-09-02 06:18:00+01:00,2024-09-02 06:18:00+01:00,U22,0.0
3,1130261,2806,1 (FACING camera) Sit and stand,2024-09-02 06:20:00+01:00,2024-09-02 06:20:00+01:00,2024-09-02 06:20:00+01:00,U22,0.0
4,1130292,2806,1 (FACING camera) Sit and stand,2024-09-02 06:42:00+01:00,2024-09-02 06:42:00+01:00,2024-09-02 06:42:00+01:00,U2,0.0
...,...,...,...,...,...,...,...,...
337,1164181,2815,"10 Slow walk (SHAKING hands/body, tiny step, h...",NaT,NaT,2024-09-11 04:55:00+01:00,U7,
338,1164187,2806,1 (FACING camera) Sit and stand,2024-09-11 04:58:00+01:00,2024-09-11 04:58:00+01:00,2024-09-11 04:58:00+01:00,U7,0.0
339,1164217,2815,"10 Slow walk (SHAKING hands/body, tiny step, h...",2024-09-11 05:09:00+01:00,2024-09-11 05:10:00+01:00,2024-09-11 05:10:00+01:00,U7,60.0
340,1164218,2815,"10 Slow walk (SHAKING hands/body, tiny step, h...",2024-09-11 05:10:00+01:00,2024-09-11 05:10:00+01:00,2024-09-11 05:10:00+01:00,U7,0.0


In [38]:
labeled_df

Unnamed: 0,directory,subject,timestamp,x,y,z,activity_type
0,2807,U3,2024-09-05 21:25:00.002000+01:00,1.896,-0.558,9.938,2 (FACING camera) both hands SHAKING (sitting ...
1,2807,U3,2024-09-05 21:25:00.162000+01:00,1.903,-0.536,9.938,2 (FACING camera) both hands SHAKING (sitting ...
2,2807,U3,2024-09-05 21:25:00.325000+01:00,1.884,-0.575,9.936,2 (FACING camera) both hands SHAKING (sitting ...
3,2807,U3,2024-09-05 21:25:00.481000+01:00,1.894,-0.551,9.924,2 (FACING camera) both hands SHAKING (sitting ...
4,2807,U3,2024-09-05 21:25:00.642000+01:00,1.877,-0.577,9.922,2 (FACING camera) both hands SHAKING (sitting ...
...,...,...,...,...,...,...,...
745,2807,U3,2024-09-05 21:26:59.202000+01:00,1.887,-0.582,9.941,5 (Sideway) both hands SHAKING (sitting)
746,2807,U3,2024-09-05 21:26:59.365000+01:00,1.887,-0.572,9.946,5 (Sideway) both hands SHAKING (sitting)
747,2807,U3,2024-09-05 21:26:59.521000+01:00,1.896,-0.575,9.912,5 (Sideway) both hands SHAKING (sitting)
748,2807,U3,2024-09-05 21:26:59.684000+01:00,1.894,-0.584,9.922,5 (Sideway) both hands SHAKING (sitting)


In [39]:
import os
import pandas as pd

# Load and parse TrainActivities.csv
try:
    activities_df = pd.read_csv(r'TrainingDataPD25\TrainingDataPD25\TrainActivities.csv')
    activities_df['Started'] = pd.to_datetime(activities_df['Started'], format='mixed', dayfirst=False).dt.tz_localize('Europe/London')
    activities_df['Finished'] = pd.to_datetime(activities_df['Finished'], format='mixed', dayfirst=False).dt.tz_localize('Europe/London')
    activities_df['Updated'] = pd.to_datetime(activities_df['Updated'], format='mixed', dayfirst=False).dt.tz_localize('Europe/London')
    print("First few rows of activities_df:")
    print(activities_df[['Started', 'Finished', 'Updated']].head())
except Exception as e:
    print(f"Error loading TrainActivities.csv: {e}")
    exit()

def get_file_time_range(file_path):
    """Load sensor data and calculate time range with consistent timezone."""
    try:
        df = pd.read_csv(file_path, names=['user_id', 'timestamp', 'x', 'y', 'z'])
        df['timestamp'] = pd.to_datetime(df['timestamp'], format='mixed', dayfirst=False, errors='coerce').dt.tz_convert('Europe/London')
        min_time = df['timestamp'].min()
        max_time = df['timestamp'].max()
        return min_time, max_time, df
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None, None, None

# Base directory containing user subdirectories
base_dir = r'C:\projs\tmc\TrainingDataPD25\TrainingDataPD25\users_timeXYZ\users'

# Lists to store matched and unmatched data
labeled_data = []
unmatched_data = []

for directory in os.listdir(base_dir):
    dir_path = os.path.join(base_dir, directory)
    if os.path.isdir(dir_path):
        for file in os.listdir(dir_path):
            if file.endswith('.csv'):
                file_path = os.path.join(dir_path, file)
                min_time, max_time, sensor_df = get_file_time_range(file_path)
                
                if min_time is None or max_time is None:
                    # If file couldn't be parsed, mark all entries as unmatched
                    for _, row in sensor_df.iterrows():
                        unmatched_data.append({
                            'directory': directory,
                            'timestamp': row['timestamp'],
                            'x': row['x'],
                            'y': row['y'],
                            'z': row['z'],
                            'reason': 'Invalid timestamp or file parsing error'
                        })
                    continue
                
                # Find overlapping activities
                overlapping_activities = activities_df[
                    (activities_df['Started'] <= max_time) & 
                    (activities_df['Finished'] >= min_time)
                ]
                
                # Process each timestamp in the sensor data
                for _, row in sensor_df.iterrows():
                    timestamp = row['timestamp']
                    if pd.isna(timestamp):
                        # Handle invalid timestamps
                        unmatched_data.append({
                            'directory': directory,
                            'timestamp': timestamp,
                            'x': row['x'],
                            'y': row['y'],
                            'z': row['z'],
                            'reason': 'Invalid timestamp'
                        })
                        continue
                    
                    # Find the specific activity this timestamp belongs to
                    matching_activity = overlapping_activities[
                        (overlapping_activities['Started'] <= timestamp) & 
                        (overlapping_activities['Finished'] >= timestamp)
                    ]
                    
                    if not matching_activity.empty:
                        # Matched: add to labeled_data
                        activity = matching_activity.iloc[0]
                        labeled_data.append({
                            'directory': directory,
                            'subject': activity['Subject'],
                            'timestamp': timestamp,
                            'x': row['x'],
                            'y': row['y'],
                            'z': row['z'],
                            'activity_type': activity['Activity Type']
                        })
                    else:
                        # Unmatched: add to unmatched_data
                        unmatched_data.append({
                            'directory': directory,
                            'timestamp': timestamp,
                            'x': row['x'],
                            'y': row['y'],
                            'z': row['z'],
                            'reason': 'No matching activity'
                        })

# Convert to DataFrames
labeled_df = pd.DataFrame(labeled_data)
unmatched_df = pd.DataFrame(unmatched_data)

# Save and display results
print("Matched Data (first few rows):")
print(labeled_df.head())
labeled_df.to_csv('labeled_sensor_data.csv', index=False)

print("\nUnmatched Data (first few rows):")
print(unmatched_df.head())
unmatched_df.to_csv('unmatched_sensor_data.csv', index=False)

First few rows of activities_df:
                    Started                  Finished  \
0 2024-09-02 06:16:00+01:00 2024-09-02 06:16:00+01:00   
1 2024-09-02 06:17:00+01:00 2024-09-02 06:17:00+01:00   
2 2024-09-02 06:18:00+01:00 2024-09-02 06:18:00+01:00   
3 2024-09-02 06:20:00+01:00 2024-09-02 06:20:00+01:00   
4 2024-09-02 06:42:00+01:00 2024-09-02 06:42:00+01:00   

                    Updated  
0 2024-09-02 06:16:00+01:00  
1 2024-09-02 06:17:00+01:00  
2 2024-09-02 06:18:00+01:00  
3 2024-09-02 06:20:00+01:00  
4 2024-09-02 06:42:00+01:00  
Matched Data (first few rows):
  directory subject                        timestamp      x      y      z  \
0      2807      U3 2024-09-05 21:25:00.002000+01:00  1.896 -0.558  9.938   
1      2807      U3 2024-09-05 21:25:00.162000+01:00  1.903 -0.536  9.938   
2      2807      U3 2024-09-05 21:25:00.325000+01:00  1.884 -0.575  9.936   
3      2807      U3 2024-09-05 21:25:00.481000+01:00  1.894 -0.551  9.924   
4      2807      U3 2024-09-

In [60]:
activities_df[activities_df['Activity Type ID']==1716]

Unnamed: 0,ID,Activity Type ID,Activity Type,Started,Finished,Updated,Subject


Unnamed: 0,ID,Activity Type ID,Activity Type,Started,Finished,Updated,Subject
0,1130251,2806,1 (FACING camera) Sit and stand,2024-09-02 06:16:00+01:00,2024-09-02 06:16:00+01:00,2024-09-02 06:16:00+01:00,U22
1,1130254,2807,2 (FACING camera) both hands SHAKING (sitting ...,2024-09-02 06:17:00+01:00,2024-09-02 06:17:00+01:00,2024-09-02 06:17:00+01:00,U22
2,1130257,2807,2 (FACING camera) both hands SHAKING (sitting ...,2024-09-02 06:18:00+01:00,2024-09-02 06:18:00+01:00,2024-09-02 06:18:00+01:00,U22
3,1130261,2806,1 (FACING camera) Sit and stand,2024-09-02 06:20:00+01:00,2024-09-02 06:20:00+01:00,2024-09-02 06:20:00+01:00,U22
4,1130292,2806,1 (FACING camera) Sit and stand,2024-09-02 06:42:00+01:00,2024-09-02 06:42:00+01:00,2024-09-02 06:42:00+01:00,U2
...,...,...,...,...,...,...,...
337,1164181,2815,"10 Slow walk (SHAKING hands/body, tiny step, h...",NaT,NaT,2024-09-11 04:55:00+01:00,U7
338,1164187,2806,1 (FACING camera) Sit and stand,2024-09-11 04:58:00+01:00,2024-09-11 04:58:00+01:00,2024-09-11 04:58:00+01:00,U7
339,1164217,2815,"10 Slow walk (SHAKING hands/body, tiny step, h...",2024-09-11 05:09:00+01:00,2024-09-11 05:10:00+01:00,2024-09-11 05:10:00+01:00,U7
340,1164218,2815,"10 Slow walk (SHAKING hands/body, tiny step, h...",2024-09-11 05:10:00+01:00,2024-09-11 05:10:00+01:00,2024-09-11 05:10:00+01:00,U7


### Not Useful Files

In [78]:
set(unmatched_df['directory'].astype(int))-set(activities_df['Activity Type ID'])

{38, 1716, 2803, 2804, 2805, 2816, 2819, 2830}