In [25]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from datetime import datetime

In [None]:
# check readable data
# This cell validates the dataset on disk: it confirms the base folder exists,
# locates the per-user "user_*" directories, checks that each expected CSV is
# present and readable with pandas, prints a summary, and finally spot-checks
# the column layout of sleep/rr/actigraph for one fully readable user.

# set the dataset path
dataset_path = r"C:/Users/mihir/OneDrive/Desktop/EPI233/multilevel-monitoring-of-activity-and-sleep-in-healthy-people-1.0.0/data"

print(f"checking dataset at path: {dataset_path}")

# check if main directory exists
if not os.path.exists(dataset_path):
    print(f"error: dataset path '{dataset_path}' does not exist.")
else:
    print(f"success: dataset base directory found.")

    # list all items in the directory to understand structure
    all_items = os.listdir(dataset_path)
    print(f"\nfound {len(all_items)} items in the base directory:")

    # count directories and files
    dirs = [item for item in all_items if os.path.isdir(os.path.join(dataset_path, item))]
    files = [item for item in all_items if os.path.isfile(os.path.join(dataset_path, item))]

    # only the first five names of each kind are printed, followed by "..." when there are more
    print(f"- {len(dirs)} directories: {', '.join(dirs[:5])}" + ("..." if len(dirs) > 5 else ""))
    print(f"- {len(files)} files: {', '.join(files[:5])}" + ("..." if len(files) > 5 else ""))

    # find user directories (entries whose name matches the lowercase "user_*" pattern)
    user_dirs = glob(os.path.join(dataset_path, "user_*"))

    if not user_dirs:
        # no "user_*" entries here: look one level down, inside any sub-directory
        # whose name contains "data"
        potential_data_dirs = [d for d in dirs if "data" in d.lower()]
        if potential_data_dirs:
            print(f"\nno user directories found, but found potential data directories: {potential_data_dirs}")
            for potential_dir in potential_data_dirs:
                full_path = os.path.join(dataset_path, potential_dir)
                sub_user_dirs = glob(os.path.join(full_path, "user_*"))
                if sub_user_dirs:
                    print(f"found {len(sub_user_dirs)} user directories in '{potential_dir}'")
                    # adopt the first sub-directory that has user folders as the new dataset root
                    user_dirs = sub_user_dirs
                    dataset_path = full_path
                    break

    if not user_dirs:
        print("\nwarning: no user directories found with pattern 'user_*'")
        print("searching for any directories that might contain the data...")

        # walk the whole tree and report every place one of the expected files shows up
        # (names are compared case-insensitively)
        expected_files = ["sleep.csv", "rr.csv", "user_info.csv"]
        # note: this loop rebinds the names `dirs` and `files` defined above
        for root, dirs, files in os.walk(dataset_path):
            for filename in files:
                if filename.lower() in [f.lower() for f in expected_files]:
                    print(f"found expected file {filename} in {root}")
    else:
        print(f"\nfound {len(user_dirs)} user directories.")

        # define expected files; names are tried exactly as written first, then in the
        # str.capitalize() form, then with an upper-case ".CSV" extension
        expected_files = [
            "user_info.csv",
            "sleep.csv", 
            "RR.csv",
            "questionnaire.csv",
            "Activity.csv", 
            "Actigraph.csv",
            "saliva.csv"
        ]

        # create a report dictionary:
        #   file_report[file] counts, across users, how often the file was present / readable
        #   user_report[user] counts, per user, how many expected files were present / readable
        file_report = {file: {"present": 0, "readable": 0} for file in expected_files}
        user_report = {}

        print("\nvalidating individual user directories:")

        # the first three users get verbose per-file output; the rest are checked quietly below
        sample_users = user_dirs[:3] if len(user_dirs) > 3 else user_dirs

        for user_dir in sample_users:
            user_id = os.path.basename(user_dir)
            print(f"\n  checking {user_id}...")
            user_report[user_id] = {"files_present": 0, "files_readable": 0}

            for expected_file in expected_files:
                file_path = os.path.join(user_dir, expected_file)

                # check if file exists under the exact expected name
                if os.path.exists(file_path):
                    file_report[expected_file]["present"] += 1
                    user_report[user_id]["files_present"] += 1
                    print(f"    ✓ found {expected_file}")

                    # try reading the file
                    try:
                        if expected_file.endswith('.csv'):
                            # read just a few rows to validate
                            df = pd.read_csv(file_path, nrows=5)
                            row_count = len(df)
                            col_count = len(df.columns)
                            file_report[expected_file]["readable"] += 1
                            user_report[user_id]["files_readable"] += 1
                            print(f"      ✓ successfully read {expected_file}: {row_count} rows × {col_count} columns")
                            # show at most the first three column names
                            print(f"      ✓ columns: {', '.join(df.columns[:3])}..." if col_count > 3 else f"      ✓ columns: {', '.join(df.columns)}")
                        else:
                            # for non-CSV files (every name in expected_files above ends in
                            # ".csv", so this branch is not reached with the current list)
                            with open(file_path, 'r') as f:
                                lines = f.readlines(100)  # size hint of 100: read just a few lines
                            file_report[expected_file]["readable"] += 1
                            user_report[user_id]["files_readable"] += 1
                            print(f"      ✓ successfully read {expected_file}: {len(lines)} lines")
                    except Exception as e:
                        # an unreadable file still counts as present, just not as readable
                        print(f"      ✗ error reading {expected_file}: {str(e)}")
                else:
                    # try the str.capitalize() form of the name as well
                    # (first character upper-cased, the rest lower-cased)
                    cap_file_path = os.path.join(user_dir, expected_file.capitalize())
                    if os.path.exists(cap_file_path):
                        file_report[expected_file]["present"] += 1
                        user_report[user_id]["files_present"] += 1
                        print(f"    ✓ found {expected_file} (capitalized)")

                        try:
                            if cap_file_path.endswith('.csv'):
                                df = pd.read_csv(cap_file_path, nrows=5)
                                row_count = len(df)
                                col_count = len(df.columns)
                                file_report[expected_file]["readable"] += 1
                                user_report[user_id]["files_readable"] += 1
                                print(f"      ✓ successfully read {expected_file}: {row_count} rows × {col_count} columns")
                                print(f"      ✓ columns: {', '.join(df.columns[:3])}..." if col_count > 3 else f"      ✓ columns: {', '.join(df.columns)}")
                        except Exception as e:
                            print(f"      ✗ error reading {expected_file}: {str(e)}")
                    else:
                        # check for file with .CSV extension instead of .csv
                        # (expected_file[:-4] strips the four-character ".csv" suffix)
                        upper_ext_file_path = os.path.join(user_dir, expected_file[:-4] + ".CSV")
                        if os.path.exists(upper_ext_file_path):
                            file_report[expected_file]["present"] += 1
                            user_report[user_id]["files_present"] += 1
                            print(f"    ✓ found {expected_file} (with .CSV extension)")

                            try:
                                df = pd.read_csv(upper_ext_file_path, nrows=5)
                                row_count = len(df)
                                col_count = len(df.columns)
                                file_report[expected_file]["readable"] += 1
                                user_report[user_id]["files_readable"] += 1
                                print(f"      ✓ successfully read {expected_file}: {row_count} rows × {col_count} columns")
                                print(f"      ✓ columns: {', '.join(df.columns[:3])}..." if col_count > 3 else f"      ✓ columns: {', '.join(df.columns)}")
                            except Exception as e:
                                print(f"      ✗ error reading {expected_file}: {str(e)}")
                        else:
                            print(f"    ✗ missing {expected_file}")

        # now check all remaining users without detailed output
        if len(user_dirs) > 3:
            print(f"\nchecking remaining {len(user_dirs) - len(sample_users)} users...")

            for user_dir in user_dirs[3:]:
                user_id = os.path.basename(user_dir)
                user_report[user_id] = {"files_present": 0, "files_readable": 0}

                for expected_file in expected_files:
                    # same three name variants as in the verbose pass above
                    file_path = os.path.join(user_dir, expected_file)
                    cap_file_path = os.path.join(user_dir, expected_file.capitalize())
                    upper_ext_file_path = os.path.join(user_dir, expected_file[:-4] + ".CSV")

                    # check all possible file path variants; the first one that exists is used
                    for path in [file_path, cap_file_path, upper_ext_file_path]:
                        if os.path.exists(path):
                            file_report[expected_file]["present"] += 1
                            user_report[user_id]["files_present"] += 1

                            # try reading the file
                            try:
                                if path.lower().endswith('.csv'):
                                    df = pd.read_csv(path, nrows=5)
                                    file_report[expected_file]["readable"] += 1
                                    user_report[user_id]["files_readable"] += 1
                            except Exception:
                                # quiet pass: a read failure only shows up as a lower "readable" count
                                pass

                            break  # file found, move to next expected file

        # generate summary report
        print("\n" + "="*60)
        print("dataset validation summary")
        print("="*60)

        print(f"\nfile availability (across {len(user_dirs)} users):")
        for file, stats in file_report.items():
            # percentages are relative to the number of user directories found
            present_pct = (stats["present"] / len(user_dirs)) * 100
            readable_pct = (stats["readable"] / len(user_dirs)) * 100
            print(f"- {file:<15}: present: {stats['present']}/{len(user_dirs)} ({present_pct:.1f}%), readable: {stats['readable']}/{len(user_dirs)} ({readable_pct:.1f}%)")

        # check which users have all files (present, and readable, respectively)
        complete_users = [user for user, stats in user_report.items() if stats["files_present"] == len(expected_files)]
        readable_users = [user for user, stats in user_report.items() if stats["files_readable"] == len(expected_files)]

        print(f"\nuser completeness:")
        print(f"- users with all files present: {len(complete_users)}/{len(user_dirs)} ({(len(complete_users)/len(user_dirs))*100:.1f}%)")
        print(f"- users with all files readable: {len(readable_users)}/{len(user_dirs)} ({(len(readable_users)/len(user_dirs))*100:.1f}%)")

        if len(readable_users) < len(user_dirs):
            print("\nusers with incomplete/unreadable data:")
            incomplete_users = [user for user, stats in user_report.items() if stats["files_readable"] < len(expected_files)]
            for user in incomplete_users[:5]:  # show first 5 incomplete users
                print(f"- {user}: {user_report[user]['files_readable']}/{len(expected_files)} readable files")
            if len(incomplete_users) > 5:
                print(f"  ... and {len(incomplete_users) - 5} more")

        # overall validation result: passes as soon as at least one user has every file readable
        if len(readable_users) > 0:
            print("\nvalidation result: passed ✓")
            print(f"dataset contains {len(readable_users)} fully usable user directories.")
        else:
            print("\nvalidation result: failed ✗")
            print("no user directories with complete readable data found.")

        # try reading specific files to ensure data format is as expected
        print("\nperforming targeted file content validation...")

        if readable_users:
            # use the first fully readable user for validation
            sample_user_dir = os.path.join(dataset_path, readable_users[0])

            # function to find file with case-insensitive matching
            def find_file(directory, filename):
                """return the path of the first entry in directory whose name equals filename ignoring case, else None"""
                for f in os.listdir(directory):
                    if f.lower() == filename.lower():
                        return os.path.join(directory, f)
                return None

            # 1. validate sleep.csv structure
            sleep_path = find_file(sample_user_dir, "sleep.csv")
            if sleep_path:
                try:
                    sleep_df = pd.read_csv(sleep_path)
                    # convert column names to lowercase for comparison
                    cols_lower = [col.lower() for col in sleep_df.columns]
                    expected_sleep_cols = ['in bed date', 'in bed time', 'out bed date', 'out bed time']
                    missing_cols = [col for col in expected_sleep_cols if col not in cols_lower]

                    if not missing_cols:
                        print("✓ sleep.csv has the expected column structure")
                    else:
                        print(f"✗ sleep.csv is missing expected columns: {missing_cols}")
                        print(f"  available columns: {sleep_df.columns.tolist()}")
                except Exception as e:
                    print(f"✗ error validating sleep.csv: {str(e)}")
            else:
                print("✗ could not find sleep.csv for validation")

            # 2. validate rr.csv structure
            rr_path = find_file(sample_user_dir, "rr.csv")
            if rr_path:
                try:
                    rr_df = pd.read_csv(rr_path)
                    cols_lower = [col.lower() for col in rr_df.columns]
                    expected_rr_cols = ['ibi_s', 'day', 'time']
                    missing_cols = [col for col in expected_rr_cols if col not in cols_lower]

                    if not missing_cols:
                        print("✓ rr.csv has the expected column structure")
                    else:
                        print(f"✗ rr.csv is missing expected columns: {missing_cols}")
                        print(f"  available columns: {rr_df.columns.tolist()}")
                except Exception as e:
                    print(f"✗ error validating rr.csv: {str(e)}")
            else:
                print("✗ could not find rr.csv for validation")

            # 3. validate actigraph.csv (only the first 10 rows are read, due to potential size)
            actigraph_path = find_file(sample_user_dir, "actigraph.csv")
            if actigraph_path:
                try:
                    actigraph_df = pd.read_csv(actigraph_path, nrows=10)
                    cols_lower = [col.lower() for col in actigraph_df.columns]
                    expected_actigraph_cols = ['axis1', 'axis2', 'axis3', 'steps']
                    missing_cols = [col for col in expected_actigraph_cols if col not in cols_lower]

                    if not missing_cols:
                        print("✓ actigraph.csv has the expected column structure")
                    else:
                        print(f"✗ actigraph.csv is missing expected columns: {missing_cols}")
                        print(f"  available columns: {actigraph_df.columns.tolist()}")
                except Exception as e:
                    print(f"✗ error validating actigraph.csv: {str(e)}")
            else:
                print("✗ could not find actigraph.csv for validation")

# printed on every path, including when the dataset folder is missing
print("\nvalidation process complete.")

In [None]:
# plotting defaults: list the installed matplotlib styles, then apply the
# pastel seaborn style, a 12x8 default figure size and enlarged notebook fonts
print(plt.style.available)
plt.style.use('seaborn-v0_8-pastel')
plt.rcParams.update({'figure.figsize': (12, 8)})
sns.set_context(context="notebook", font_scale=1.5)

['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'petroff10', 'seaborn-v0_8', 'seaborn-v0_8-bright', 'seaborn-v0_8-colorblind', 'seaborn-v0_8-dark', 'seaborn-v0_8-dark-palette', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8-deep', 'seaborn-v0_8-muted', 'seaborn-v0_8-notebook', 'seaborn-v0_8-paper', 'seaborn-v0_8-pastel', 'seaborn-v0_8-poster', 'seaborn-v0_8-talk', 'seaborn-v0_8-ticks', 'seaborn-v0_8-white', 'seaborn-v0_8-whitegrid', 'tableau-colorblind10']


In [None]:
# Data Preprocessing

# root folder of the dataset; get_all_user_ids() globs "user_*" entries directly
# under it and load_user_data() joins a user folder name onto it
# NOTE(review): absolute path to one machine's OneDrive folder — adjust before running elsewhere
base_path = "C:/Users/mihir/OneDrive/Desktop/EPI233/multilevel-monitoring-of-activity-and-sleep-in-healthy-people-1.0.0/data"

# function to find file with case-insensitive matching
def find_file(directory, filename):
    """locate an entry of a directory by name, ignoring upper/lower case

    entries are examined in the order os.listdir yields them; the full path of
    the first one whose lower-cased name equals the lower-cased filename is
    returned, or None when no entry matches.
    """
    hit = next(
        (entry for entry in os.listdir(directory) if entry.lower() == filename.lower()),
        None,
    )
    if hit is None:
        return None
    return os.path.join(directory, hit)

# function to flexibly parse time strings in various formats
def parse_time_flexibly(day, time_str):
    """
    combines a day value and a clock-time string into a pandas timestamp.

    three strategies are tried in order and the first one that succeeds wins:
      1. let pandas parse the text "<day> <time>" directly
      2. for "H:MM" style strings, zero-pad the hour and let pandas parse again
      3. treat day as a day-of-month number and build the timestamp by hand,
         using January 2000 as a placeholder year/month

    parameters:
    -----------
    day : any
        date text or day number; converted with str() for strategies 1-2
        and with int() for strategy 3
    time_str : str
        time of day such as "9:05" or "09:05:30"

    returns:
    --------
    pandas.Timestamp or None
        None when either input is missing (pd.isna) or no strategy succeeds
    """
    if pd.isna(day) or pd.isna(time_str):
        return None

    # convert day to string if it's not already
    day_str = str(day)

    # each fallback below is deliberately best-effort, so ordinary errors are
    # swallowed; `except Exception` (rather than a bare `except`) still lets
    # KeyboardInterrupt / SystemExit through, which keeps the long row-by-row
    # parsing loops that call this function interruptible

    # strategy 1: direct parsing with day + time
    try:
        return pd.to_datetime(f"{day_str} {time_str}")
    except Exception:
        pass

    # strategy 2: hours and minutes only (0:00 or 00:00) -> zero-pad the hour first
    try:
        pieces = time_str.split(':')
        if len(pieces) == 2:
            hours, minutes = pieces
            time_str = f"{int(hours):02d}:{minutes}"
            return pd.to_datetime(f"{day_str} {time_str}")
    except Exception:
        pass

    # strategy 3: extract the components manually
    try:
        time_parts = time_str.split(':')
        if len(time_parts) >= 2:
            hour = int(time_parts[0])
            minute = int(time_parts[1])
            # seconds are optional
            second = int(time_parts[2]) if len(time_parts) >= 3 else 0

            from datetime import datetime
            return pd.to_datetime(datetime(2000, 1, int(day), hour, minute, second))
    except Exception:
        pass

    return None

# function to load data for a single user
def load_user_data(user_id):
    """
    loads every known csv file of one user into dataframes.

    parameters:
    -----------
    user_id : int or str
        user number (7, "7") or directory name ("user_7"); any value whose
        str() form is the number or the directory name is accepted, so numpy
        integers taken from a dataframe work as well

    returns:
    --------
    dict
        maps 'user_info', 'sleep', 'rr', 'questionnaire', 'activity',
        'actigraph' and 'saliva' to dataframes for the files that were found
        and readable; 'actigraph_full_path' holds the path of the actigraph
        file when it was loaded. empty when the user directory does not exist.
    """
    # format user_id according to directory naming in dataset; going through
    # str() first means non-builtin integer types no longer fail on .startswith
    user_id = str(user_id)
    if not user_id.startswith("user_"):
        user_id = f"user_{user_id}"

    user_path = os.path.join(base_path, user_id)

    if not os.path.exists(user_path):
        print(f"warning: user directory '{user_path}' not found")
        return {}

    data = {}

    # files to load; the actual name on disk is matched case-insensitively
    files_to_load = {
        'user_info': 'user_info.csv',
        'sleep': 'sleep.csv',
        'rr': 'RR.csv',
        'questionnaire': 'questionnaire.csv',
        'activity': 'Activity.csv',
        'actigraph': 'Actigraph.csv',
        'saliva': 'saliva.csv'
    }

    # load each file
    for key, filename in files_to_load.items():
        # find the file path regardless of case
        file_path = find_file(user_path, filename)

        if not file_path:
            print(f"  warning: {filename} not found for {user_id}")
            continue

        try:
            if key == 'actigraph':
                # only the first 10000 rows are read due to potential size;
                # the path is kept so the full file can be read later
                data[key] = pd.read_csv(file_path, nrows=10000)
                data['actigraph_full_path'] = file_path
            else:
                data[key] = pd.read_csv(file_path)
            print(f"  loaded {os.path.basename(file_path)} for {user_id}")
        except Exception as e:
            # one unreadable file must not prevent loading the others
            print(f"  error loading {filename} for {user_id}: {str(e)}")

    return data

# function to discover all available users in the dataset
def get_all_user_ids():
    """returns the names of all "user_*" entries under base_path, sorted by full path"""
    pattern = os.path.join(base_path, "user_*")
    return [os.path.basename(path) for path in sorted(glob(pattern))]

# load data for all users
def load_all_users():
    """
    loads the data of every user and also stacks it per file type.

    returns:
    --------
    dict
        'individual' : list with the dict returned by load_user_data for each user
        'combined'   : dict mapping each file type to one dataframe holding the
                       rows of all users (an empty dataframe when no user had it)
    """
    user_ids = get_all_user_ids()
    print(f"found {len(user_ids)} users in the dataset")

    # one bucket of per-user frames for every file type that gets stacked
    table_names = ('user_info', 'sleep', 'questionnaire', 'saliva', 'rr', 'activity', 'actigraph')
    frames_by_table = {name: [] for name in table_names}
    per_user = []

    for user_id in user_ids:
        print(f"loading data for {user_id}...")
        user_data = load_user_data(user_id)

        # tag every dataframe with its owner unless it already has a user_id column
        for value in user_data.values():
            if isinstance(value, pd.DataFrame) and 'user_id' not in value.columns:
                value['user_id'] = user_id

        # collect the non-empty frames for stacking
        for name, bucket in frames_by_table.items():
            frame = user_data.get(name)
            if isinstance(frame, pd.DataFrame) and not frame.empty:
                bucket.append(frame)

        per_user.append(user_data)

    # stack each bucket into a single frame
    combined = {
        name: pd.concat(bucket, ignore_index=True) if bucket else pd.DataFrame()
        for name, bucket in frames_by_table.items()
    }

    return {'combined': combined, 'individual': per_user}

# function to check for missing values
def check_missing_values(data):
    """
    summarises missing values for every dataframe in a dict.

    parameters:
    -----------
    data : dict
        values that are dataframes are summarised; all other values are skipped

    returns:
    --------
    dict
        same keys (dataframes only), each mapped to a dataframe with one row per
        column and the columns 'column', 'missing values', 'percent missing'
    """
    def summarise(frame):
        null_counts = frame.isnull().sum()
        table = pd.DataFrame({
            'missing values': null_counts,
            'percent missing': (null_counts / len(frame)) * 100
        })
        # move the column names out of the index into a regular column
        return table.reset_index().rename(columns={'index': 'column'})

    return {
        name: summarise(frame)
        for name, frame in data.items()
        if isinstance(frame, pd.DataFrame)
    }

# function to extract hrv features from rr intervals
def extract_hrv_features(rr_data, window_size=300):
    """
    extracts heart rate variability features from rr interval data.

    the recording is cut into consecutive, non-overlapping windows starting at
    the earliest timestamp; a trailing partial window is not evaluated and
    windows holding fewer than two intervals are skipped. the input dataframe
    is never modified.

    parameters:
    -----------
    rr_data : pandas.dataframe
        dataframe containing rr interval data. needs an 'ibi_s' column (interval
        length in seconds) and a time base, which is one of:
          - 'day' plus a non-datetime 'time' column (parsed with parse_time_flexibly)
          - a 'time' column that already has a datetime dtype
          - an existing 'datetime' column
    window_size : int, optional
        window size in seconds for feature extraction (default: 300 seconds = 5 minutes)

    returns:
    --------
    pandas.dataframe
        one row per window with the columns 'timestamp' (window start),
        'mean_rr', 'sdnn', 'rmssd' (all in milliseconds) and 'pnn50' (percent);
        an empty dataframe without columns when the input is empty or no usable
        timestamps are available

    raises:
    -------
    ValueError
        if window_size is not positive (the window loop could never finish)
    """
    if rr_data.empty:
        return pd.DataFrame()

    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")

    if 'time' in rr_data.columns:
        if not pd.api.types.is_datetime64_any_dtype(rr_data['time']):
            # build the timestamps with the flexible parser; assign() returns a
            # new frame, so the caller's dataframe keeps its original columns
            parsed = [
                parse_time_flexibly(day, time)
                for day, time in zip(rr_data['day'], rr_data['time'])
            ]
            rr_data = rr_data.assign(datetime=parsed)

            # remove rows where datetime conversion failed
            invalid = rr_data['datetime'].isna()
            if invalid.any():
                print(f"warning: {invalid.sum()} rows with invalid datetime were removed")
                rr_data = rr_data.dropna(subset=['datetime'])

            if rr_data.empty:
                print("error: all datetime conversions failed")
                return pd.DataFrame()
        elif 'datetime' not in rr_data.columns:
            # 'time' is already a datetime column: use it as the time base
            rr_data = rr_data.assign(datetime=rr_data['time'])

    # sort by datetime (also reports a missing 'datetime' column)
    try:
        rr_data = rr_data.sort_values('datetime')
    except Exception as e:
        print(f"error sorting by datetime: {str(e)}")
        return pd.DataFrame()

    # initialize lists to store features
    windows = []
    mean_rr = []
    sdnn = []
    rmssd = []
    pnn50 = []

    timestamps = rr_data['datetime']
    last_time = timestamps.max()  # loop invariant, computed once
    step = pd.Timedelta(seconds=window_size)

    start_time = timestamps.min()
    end_time = start_time + step

    # loop through windows [start_time, end_time)
    while end_time <= last_time:
        in_window = (timestamps >= start_time) & (timestamps < end_time)

        if in_window.sum() > 1:  # need at least 2 intervals for calculations
            rr_intervals = rr_data.loc[in_window, 'ibi_s'].values * 1000  # convert to milliseconds
            diffs = np.diff(rr_intervals)  # successive differences

            windows.append(start_time)
            mean_rr.append(np.mean(rr_intervals))
            sdnn.append(np.std(rr_intervals))
            rmssd.append(np.sqrt(np.mean(diffs**2)))
            # share of successive differences larger than 50 ms
            pnn50.append(np.sum(np.abs(diffs) > 50) / len(diffs) * 100)

        # move to the next window
        start_time = end_time
        end_time = start_time + step

    # create dataframe with features
    hrv_features = pd.DataFrame({
        'timestamp': windows,
        'mean_rr': mean_rr,
        'sdnn': sdnn,
        'rmssd': rmssd,
        'pnn50': pnn50
    })

    return hrv_features

# function to preprocess sleep data
def preprocess_sleep_data(sleep_df):
    """
    adds datetime columns, sleep duration and sleep efficiency to sleep data.

    parameters:
    -----------
    sleep_df : pandas.dataframe
        dataframe containing sleep data; columns are recognised by
        case-insensitive substring match on their names

    returns:
    --------
    pandas.dataframe
        a processed copy of the input (the input itself when it is empty).
        depending on which source columns exist it gains '<label>_datetime'
        columns for 'in bed time', 'out bed time' and 'onset time',
        'sleep_duration_hours' and 'sleep_efficiency' (percent)
    """
    if sleep_df.empty:
        return sleep_df

    # work on a copy so the caller's frame stays untouched
    df = sleep_df.copy()

    time_cols = ['in bed time', 'out bed time', 'onset time']
    date_cols = ['in bed date', 'out bed date', 'onset date']

    # expected label -> actual column name; when several columns contain the
    # same label the last one in column order wins
    col_map = {}
    for col in df.columns:
        lowered = col.lower()
        col_map.update({label: col for label in time_cols + date_cols if label in lowered})

    # build one datetime column per (date, time) pair that is fully available
    for time_label, date_label in zip(time_cols, date_cols):
        if time_label not in col_map or date_label not in col_map:
            continue
        date_src = col_map[date_label]
        time_src = col_map[time_label]
        df[f'{time_label}_datetime'] = [
            parse_time_flexibly(row[date_src], row[time_src])
            for _, row in df.iterrows()
        ]

    # sleep duration in hours: from sleep onset until getting out of bed
    onset_key = 'onset time_datetime'
    out_bed_key = 'out bed time_datetime'
    if onset_key in df.columns and out_bed_key in df.columns:
        both_known = df[onset_key].notna() & df[out_bed_key].notna()
        if both_known.any():
            elapsed = df.loc[both_known, out_bed_key] - df.loc[both_known, onset_key]
            df.loc[both_known, 'sleep_duration_hours'] = elapsed.dt.total_seconds() / 3600

    def last_column_where(predicate):
        """name of the last column whose lower-cased name satisfies predicate, else None"""
        found = None
        for col in df.columns:
            if predicate(col.lower()):
                found = col
        return found

    # sleep efficiency as percentage needs total sleep time and minutes in bed
    tst_col = last_column_where(lambda name: 'sleep time' in name or 'tst' in name)
    minutes_col = last_column_where(lambda name: 'minutes in bed' in name)

    if tst_col and minutes_col:
        # only rows with both values known and a positive time in bed
        usable = df[tst_col].notna() & df[minutes_col].notna() & (df[minutes_col] > 0)
        if usable.any():
            ratio = df.loc[usable, tst_col] / df.loc[usable, minutes_col]
            df.loc[usable, 'sleep_efficiency'] = ratio * 100

    return df

# function to preprocess activity data
def preprocess_activity_data(activity_df):
    """
    adds start/end datetimes, durations and readable activity names to activity data.

    parameters:
    -----------
    activity_df : pandas.dataframe
        dataframe containing activity data; the start, end, day and activity
        columns are recognised by case-insensitive substring match

    returns:
    --------
    pandas.dataframe
        a processed copy of the input (the input itself when it is empty).
        depending on which source columns exist it gains 'start_datetime',
        'end_datetime', 'duration_minutes' and 'activity_name'
    """
    if activity_df.empty:
        return activity_df

    # work on a copy so the caller's frame stays untouched
    df = activity_df.copy()

    # keyword -> column name; a column is claimed by the first keyword (in this
    # order) found in its name, and a later column replaces an earlier one
    roles = {'start': None, 'end': None, 'day': None, 'activity': None}
    for col in df.columns:
        lowered = col.lower()
        keyword = next((key for key in roles if key in lowered), None)
        if keyword is not None:
            roles[keyword] = col

    day_col = roles['day']
    activity_col = roles['activity']

    # convert start and end times to datetime using the flexible parser
    for keyword, target in (('start', 'start_datetime'), ('end', 'end_datetime')):
        source = roles[keyword]
        if source and day_col:
            df[target] = [
                parse_time_flexibly(row[day_col], row[source])
                for _, row in df.iterrows()
            ]

    # activity duration in minutes, for rows where both ends are known
    if 'start_datetime' in df.columns and 'end_datetime' in df.columns:
        both_known = df['start_datetime'].notna() & df['end_datetime'].notna()
        if both_known.any():
            elapsed = df.loc[both_known, 'end_datetime'] - df.loc[both_known, 'start_datetime']
            df.loc[both_known, 'duration_minutes'] = elapsed.dt.total_seconds() / 60

    # numeric activity codes and their descriptive names
    activity_mapping = {
        1: 'sleeping',
        2: 'laying_down',
        3: 'sitting',
        4: 'light_movement',
        5: 'medium_movement',
        6: 'heavy_movement',
        7: 'eating',
        8: 'small_screen_usage',
        9: 'large_screen_usage',
        10: 'caffeinated_drink',
        11: 'smoking',
        12: 'alcohol'
    }

    # codes without an entry in the mapping become missing values
    if activity_col:
        df['activity_name'] = df[activity_col].map(activity_mapping)

    return df

# function to preprocess actigraph data
def preprocess_actigraph_data(actigraph_df):
    """
    adds timestamps, acceleration magnitude and a position label to actigraph data.

    parameters:
    -----------
    actigraph_df : pandas.dataframe
        dataframe containing actigraph data; time, day, axis and inclinometer
        columns are recognised by case-insensitive substring match

    returns:
    --------
    pandas.dataframe
        a processed copy of the input (the input itself when it is empty).
        depending on which source columns exist it gains 'datetime',
        'accel_magnitude' and 'position'
    """
    if actigraph_df.empty:
        return actigraph_df

    # work on a copy so the caller's frame stays untouched
    df = actigraph_df.copy()

    # classify the columns by name; each column falls into at most one group
    time_col = None
    day_col = None
    axis_cols = []
    position_cols = []

    for col in df.columns:
        lowered = col.lower()
        if 'time' in lowered and 'timestamp' not in lowered:
            time_col = col
            continue
        if 'day' in lowered:
            day_col = col
            continue
        if 'axis' in lowered:
            axis_cols.append(col)
            continue
        if 'inclinometer' in lowered:
            position_cols.append(col)

    # convert time to datetime using the flexible parser
    if time_col and day_col:
        # rows are handled in batches so that progress can be reported
        chunk_size = 10000
        total_rows = len(df)

        print(f"  converting {total_rows} timestamps to datetime (this may take a while)...")

        datetime_values = []
        for chunk_start in range(0, total_rows, chunk_size):
            chunk_end = min(chunk_start + chunk_size, total_rows)
            datetime_values.extend(
                parse_time_flexibly(row[day_col], row[time_col])
                for _, row in df.iloc[chunk_start:chunk_end].iterrows()
            )

            # report progress after every batch
            print(f"    processed {chunk_end}/{total_rows} rows ({(chunk_end/total_rows)*100:.1f}%)")

        df['datetime'] = datetime_values
        print(f"  datetime conversion complete")

    # magnitude of acceleration from the first three axis columns
    if len(axis_cols) >= 3:
        try:
            first, second, third = (df[name] for name in axis_cols[:3])
            df['accel_magnitude'] = np.sqrt(first**2 + second**2 + third**2)
        except Exception as e:
            print(f"  error calculating acceleration magnitude: {str(e)}")

    # position label from the inclinometer flags; when several flags are set
    # on a row, the column that comes last decides
    if position_cols:
        df['position'] = 'unknown'
        for col in position_cols:
            if ' ' in col:
                # last word of the column name
                position_name = col.split(' ')[-1].lower()
            else:
                position_name = 'position_' + col.lower()
            try:
                df.loc[df[col] == 1, 'position'] = position_name
            except Exception as e:
                print(f"  error setting position from {col}: {str(e)}")

    return df

# main preprocessing function
def preprocess_data(data):
    """Build the preprocessed counterpart of one loaded data mapping.

    An entry of ``data`` is handled only when its key is present and its
    value is a non-empty ``pandas.DataFrame``; every other entry is skipped.

    - ``user_info``, ``questionnaire`` and ``saliva`` are copied unchanged.
    - ``sleep``, ``activity`` and ``actigraph`` are passed through
      ``preprocess_sleep_data``, ``preprocess_activity_data`` and
      ``preprocess_actigraph_data`` respectively.
    - ``rr`` is copied, and the result of ``extract_hrv_features`` applied
      to the original (uncopied) frame is stored under ``hrv_features``.

    Args:
        data: mapping from dataset name to the loaded DataFrame.

    Returns:
        dict holding only the entries that were actually processed, in the
        order listed above.
    """
    def has_frame(name):
        # usable means: key exists and the value is a non-empty DataFrame
        if name not in data:
            return False
        candidate = data[name]
        return isinstance(candidate, pd.DataFrame) and not candidate.empty

    result = {}

    # user_info is carried over as an independent copy
    if has_frame('user_info'):
        result['user_info'] = data['user_info'].copy()
        print("  preprocessed user_info data")

    # sleep has its own preprocessing routine
    if has_frame('sleep'):
        result['sleep'] = preprocess_sleep_data(data['sleep'])
        print("  preprocessed sleep data")

    # rr is kept as a copy and additionally summarised into hrv features
    if has_frame('rr'):
        rr_frame = data['rr']
        result['rr'] = rr_frame.copy()
        print("  preprocessing RR data and extracting HRV features...")
        result['hrv_features'] = extract_hrv_features(rr_frame)
        print("  completed HRV feature extraction")

    # questionnaire is carried over as an independent copy
    if has_frame('questionnaire'):
        result['questionnaire'] = data['questionnaire'].copy()
        print("  preprocessed questionnaire data")

    # activity has its own preprocessing routine
    if has_frame('activity'):
        result['activity'] = preprocess_activity_data(data['activity'])
        print("  preprocessed activity data")

    # actigraph has its own preprocessing routine
    if has_frame('actigraph'):
        result['actigraph'] = preprocess_actigraph_data(data['actigraph'])
        print("  preprocessed actigraph data")

    # saliva is carried over as an independent copy
    if has_frame('saliva'):
        result['saliva'] = data['saliva'].copy()
        print("  preprocessed saliva data")

    return result

# execute the data loading and preprocessing
print("starting data preprocessing...")

# load all user data; the 'individual' and 'combined' entries are used below
all_data = load_all_users()

# check for missing values in combined data
missing_values = check_missing_values(all_data['combined'])

# preprocess each user's data in turn, reporting progress as "k/total"
individual_inputs = all_data['individual']
user_count = len(individual_inputs)
preprocessed_individual = []
for position, user_data in enumerate(individual_inputs, start=1):
    print(f"preprocessing data for user {position}/{user_count}...")
    preprocessed_individual.append(preprocess_data(user_data))

# preprocess combined data
preprocessed_combined = preprocess_data(all_data['combined'])

# bundle every result into one dictionary
preprocessed_data = {
    'combined': preprocessed_combined,
    'individual': preprocessed_individual,
    'missing_values': missing_values,
}

print("data preprocessing complete!")

# display summary of the dataset: one "rows × columns" line per DataFrame
print("\ndataset summary:")
for name, frame in preprocessed_combined.items():
    # entries that are not DataFrames have no shape to report
    if not isinstance(frame, pd.DataFrame):
        continue
    n_rows, n_cols = frame.shape
    print(f"{name}: {n_rows} rows × {n_cols} columns")

# example visualization: distribution of sleep quality metrics
if 'sleep' in preprocessed_combined and not preprocessed_combined['sleep'].empty:
    print("\ngenerating sleep quality visualizations...")
    plt.figure(figsize=(15, 10))

    sleep_frame = preprocessed_combined['sleep']

    def _is_sleep_quality_metric(column_name):
        # case-insensitive substring match on the column name
        lowered = column_name.lower()
        return (
            'sleep time' in lowered
            or 'tst' in lowered
            or 'fragmentation index' in lowered
            or 'sleep_efficiency' in lowered
            or ('awakenings' in lowered and 'number' in lowered)
        )

    # keep at most 4 matching columns so they fit the 2x2 grid
    sleep_metrics = [
        column for column in sleep_frame.columns
        if _is_sleep_quality_metric(column)
    ][:4]

    # one histogram (with kde overlay) per metric, panels numbered from 1
    for panel, metric in enumerate(sleep_metrics, start=1):
        plt.subplot(2, 2, panel)
        sns.histplot(sleep_frame[metric].dropna(), kde=True)
        plt.title(f'distribution of {metric}')
        plt.xlabel(metric)
        plt.ylabel('count')

    plt.tight_layout()
    plt.savefig('sleep_quality_distributions.png')
    print("sleep quality visualizations saved to 'sleep_quality_distributions.png'")
    plt.close()

# save preprocessed data for future use
# NOTE(review): pickle files should only ever be re-loaded from a trusted
# location, since unpickling can execute arbitrary code
import pickle

# binary write mode is required for pickle output
with open('mmash_preprocessed_data.pkl', 'wb') as pickle_file:
    pickle.dump(preprocessed_data, pickle_file)

print("preprocessed data saved to 'mmash_preprocessed_data.pkl'")
