In [6]:
import pandas as pd
import numpy as np 
import matplotlib as plt
import datetime

## Imports

As a first step, we load the Samsung Health export files (weight, heart rate, daily step summary and sleep) and parse their timestamp columns as datetimes.

In [7]:
# -------------------------------------------------------
#                       Imports 
# ------------------------------------------------------

# Load necessary data
def convert_dates(df, date_cols, date_format):
    """Parse the given columns of ``df`` as datetimes using one shared format.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to parse. It is modified in place.
    date_cols : iterable of str
        Names of the columns to run through ``pd.to_datetime``.
    date_format : str
        ``strftime``-style format passed to ``pd.to_datetime`` for every column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, returned for convenience so
        the call can be used on the right-hand side of an assignment.
    """
    for name in date_cols:
        parsed = pd.to_datetime(df[name], format=date_format)
        df[name] = parsed
    return df

# All tables come from one Samsung Health export folder. The folder name, the
# timestamp embedded in every file name and the timestamp format are kept in one
# place so that switching to a newer export means editing a single spot.
EXPORT_DIR = 'Data/samsunghealth_nevzad.nuhiu_202103172116'
EXPORT_STAMP = '202103172116'
EXPORT_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'


def load_export_table(table, date_cols):
    """Read one CSV of the export and parse its timestamp columns.

    Parameters
    ----------
    table : str
        File name without the trailing ``.<EXPORT_STAMP>.csv`` part,
        e.g. ``'com.samsung.health.weight'``.
    date_cols : list of str
        Columns to parse with ``EXPORT_DATE_FORMAT`` via ``convert_dates``.

    Returns
    -------
    pandas.DataFrame
        The loaded table with ``date_cols`` converted to datetimes.
    """
    frame = pd.read_csv(f'{EXPORT_DIR}/{table}.{EXPORT_STAMP}.csv',
                        # The first line of each file is skipped (presumably a
                        # metadata line above the real header - confirm against
                        # the export).
                        skiprows=[0],
                        # Keep the first column as data rather than as the index.
                        index_col=False)
    return convert_dates(frame, date_cols, EXPORT_DATE_FORMAT)


# -------------------- User weight and height ------------------------ #
weight = load_export_table('com.samsung.health.weight',
                           ['start_time', 'create_time', 'update_time'])

# --------------------- Heart rate ----------------------------------- #
heart_date_cols = ['com.samsung.health.heart_rate.start_time',
                   'com.samsung.health.heart_rate.update_time',
                   'com.samsung.health.heart_rate.create_time',
                   'com.samsung.health.heart_rate.end_time']
heart_rate = load_export_table('com.samsung.shealth.tracker.heart_rate', heart_date_cols)

# --------------------- Step daily trend: speed, distance, calories ---------------------------- #
steps_date_cols = ['update_time', 'create_time']
steps = load_export_table('com.samsung.shealth.tracker.pedometer_day_summary', steps_date_cols)

# -------------------------- Sleep data ----------------------------- #
sleep_date_cols = ['com.samsung.health.sleep.start_time',
                   'com.samsung.health.sleep.create_time',
                   'com.samsung.health.sleep.end_time',
                   'com.samsung.health.sleep.update_time']
sleep = load_export_table('com.samsung.shealth.sleep', sleep_date_cols)

# Quick schema / null-count overview of the sleep table.
sleep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 35 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   original_efficiency                   0 non-null      float64       
 1   mental_recovery                       286 non-null    float64       
 2   factor_01                             286 non-null    float64       
 3   factor_02                             286 non-null    float64       
 4   factor_03                             286 non-null    float64       
 5   factor_04                             286 non-null    float64       
 6   factor_05                             286 non-null    float64       
 7   factor_06                             286 non-null    float64       
 8   factor_07                             286 non-null    float64       
 9   factor_08                             286 non-null    float64       
 10  fa

In [8]:
# -------------------------------------------------------------------- #
#                           Data Validation
# -------------------------------------------------------------------- #

# Temporal validation: reduce every table to one row per calendar day (sorted by
# day), then inspect the covered date range and the gaps between readings.

HEART_TIME = 'com.samsung.health.heart_rate.start_time'
SLEEP_TIME = 'com.samsung.health.sleep.start_time'


def daily_mean(frame, time_col, value_cols):
    """Average ``value_cols`` per calendar day of ``time_col``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with a datetime64 column ``time_col``.
    time_col : str
        Name of the timestamp column that defines the day of each row.
    value_cols : list of str
        Columns to average. They are selected explicitly so that the timestamp
        column itself and any non-numeric column never reach ``mean()``.

    Returns
    -------
    pandas.DataFrame
        One row per day, sorted by day (``groupby`` sorts its keys), with the
        day stored in a datetime64 column named ``time_col`` followed by the
        averaged ``value_cols``.
    """
    # normalize() truncates to midnight but keeps the datetime64 dtype, so the
    # result needs no second date-parsing pass.
    day = frame[time_col].dt.normalize()
    return frame.groupby(day)[value_cols].mean().reset_index()


# --------------------------- heart rate -------------------------------------- #
heart_rate = daily_mean(heart_rate, HEART_TIME, ['com.samsung.health.heart_rate.heart_rate'])

# ------------------------- daily trend: speed, distance, calories ------------- #
# 'day_time' holds epoch milliseconds. Converting with unit='ms' is vectorised and,
# unlike datetime.fromtimestamp, does not depend on the time zone of the machine
# running the notebook.
# NOTE(review): assumes 'day_time' marks midnight UTC of the summarised day - confirm.
steps['date_time'] = pd.to_datetime(steps['day_time'], unit='ms')
steps = daily_mean(steps, 'date_time',
                   ['run_step_count', 'walk_step_count', 'speed', 'distance', 'calorie'])

# -------------------------- sleep data ---------------------------------------- #
# Only 'efficiency' is averaged. 'extra_data' is left out on purpose: it is
# presumably non-numeric, and averaging a non-numeric column raises on pandas >= 2.0.
sleep = daily_mean(sleep, SLEEP_TIME, ['efficiency'])

daily_tables = [(heart_rate, HEART_TIME), (steps, 'date_time'), (sleep, SLEEP_TIME)]

# get min and max dates (first and last day of each table)
for table, time_col in daily_tables:
    print(table[time_col].iloc[0], table[time_col].iloc[-1])

# The analysis window spans every table: earliest first day to latest last day.
# Derived from the data instead of being copied by hand from the printout above;
# kept as 'YYYY-MM-DD' strings.
min_date = min(table[time_col].iloc[0] for table, time_col in daily_tables).strftime('%Y-%m-%d')
max_date = max(table[time_col].iloc[-1] for table, time_col in daily_tables).strftime('%Y-%m-%d')

# get time intervals for the different datasets: gap between consecutive days
# with data (anything other than "1 days" is a hole in the recording).
for table, time_col in daily_tables:
    table['interval'] = table[time_col].diff()
    print(table['interval'].value_counts())

2020-03-24 00:00:00 2021-03-14 00:00:00
2020-03-24 00:00:00 2021-03-17 00:00:00
2020-03-24 00:00:00 2021-03-17 00:00:00
1 days     285
2 days       2
3 days       1
6 days       1
26 days      1
10 days      1
16 days      1
5 days       1
Name: interval, dtype: int64
1 days    354
4 days      1
Name: interval, dtype: int64
1 days     305
2 days       6
3 days       1
26 days      1
12 days      1
Name: interval, dtype: int64


In [9]:
# Daily sampling is the common resolution of all three tables, so build one row
# per calendar day over the whole window and attach each table with a left join.
# Days without a reading stay in the frame as NaN. validate='one_to_one' makes
# the merge fail loudly if any table has more than one row for a day.
date_range = pd.DataFrame({'date': pd.date_range(start=min_date, end=max_date)})

full = date_range
for daily, day_key in (
    (heart_rate, 'com.samsung.health.heart_rate.start_time'),
    (steps, 'date_time'),
    (sleep, 'com.samsung.health.sleep.start_time'),
):
    full = full.merge(daily, how='left', left_on='date', right_on=day_key, validate='one_to_one')

print(full.columns)

# Keep the calendar day plus the measurements; the per-table key and interval
# columns were only needed for joining and validation.
kept_columns = [
    'date',
    'com.samsung.health.heart_rate.heart_rate',
    'run_step_count',
    'walk_step_count',
    'speed',
    'distance',
    'calorie',
    'efficiency',
]
full = full[kept_columns]

Index(['date', 'com.samsung.health.heart_rate.start_time',
       'com.samsung.health.heart_rate.heart_rate', 'interval_x', 'date_time',
       'run_step_count', 'walk_step_count', 'speed', 'distance', 'calorie',
       'interval_y', 'com.samsung.health.sleep.start_time', 'efficiency',
       'interval'],
      dtype='object')


In [10]:
# Fill missing values: every column is passed through Series.interpolate() with
# pandas' default settings, one column at a time.
for name in list(full):
    filled = full[name].interpolate()
    full[name] = filled

# Persist the gap-filled daily dataset without the row index.
full.to_csv('Data/samsung_dataset.csv', index=False)