## Explore AppleHealthKit data

In [1]:
import numpy as np
import pandas as pd
#import modin.pandas as pd

In [2]:
# Load the raw export and discard the '#' column in one chained step.
df = pd.read_csv('datasample.csv').drop(columns=['#'])
# Preview the first two rows.
df.head(2)

Unnamed: 0,_id,owner,source,sourceid,sourcename,type,startdate,enddate,value,originaldata,createdat,updatedat,timezone,mood,tags,_fivetran_deleted,_fivetran_synced,content
0,013dbe8e-4388-571a-a475-46cdccb95124,f9946d46-0b26-444c-bf9f-d6322c0c0ce7,healthkit:com.apple.health.7B5BE2EC-D560-46B8-...,92C32624-8A45-4D5B-B32E-254BD3E06E48,Kathleenâs AppleÂ Watch,health-heart-rate,2019-05-03 14:53:31.783000,2019-05-03 14:53:31.783000,100.0,,2019-08-19 18:48:16.277721,2019-08-19 18:48:16.277721,America/New_York,,,,2019-09-17 01:39:26.000000,
1,01cfa004-e070-59a9-9044-a33df3d0e5ed,f9946d46-0b26-444c-bf9f-d6322c0c0ce7,healthkit:com.apple.health.7B5BE2EC-D560-46B8-...,6254D17D-5E03-4409-958F-213E45FFAE5B,Kathleenâs AppleÂ Watch,health-step-distance,2019-06-28 16:59:51.148000,2019-06-28 16:59:53.706000,2.549228,,2019-08-19 21:51:59.376037,2019-08-19 21:51:59.376037,America/New_York,,,,2019-09-17 02:15:55.634000,


### Primary objective: clean up the raw dataset (about 23.6 million rows)

In [3]:
# Report the dimensions of the raw frame.
n_rows, n_cols = df.shape
print(f'No of rows: {n_rows}, No of columns: {n_cols}')

No of rows: 23602366, No of columns: 18


In [4]:
#2# The owner values are hard to read, so label every owner with a small integer
#   id, numbered in order of first appearance in the data.
owner_to_easy_id = {owner: idx for idx, owner in enumerate(df['owner'].unique())}
df['easy_id'] = df['owner'].map(owner_to_easy_id)

#3# Source needs to be cleaned: I am interested to know where the data is coming from
n_sources_before = df['source'].nunique()

# Keep only the token that follows the 'healthkit:com.' prefix, up to the next '.'.
# str.extract works row by row and keeps the original index, so a row without the
# prefix (or with a missing source) becomes NaN *in place*. The previous loop
# skipped such rows, which made the result shorter than df and shifted every later
# value onto the wrong row when it was assigned back.
df['source'] = df['source'].str.extract(r'healthkit:com\.([^.]*)', expand=False)

n_sources_after = df['source'].nunique()
print('No of unique source before cleaning: ',n_sources_before)
print('No of unique source after cleaning: ',n_sources_after)

No of unique source before cleaning:  583
No of unique source after cleaning:  75


In [5]:
#4# sourceid: very similar to _id, so it can be removed
#5# sourcename: user-friendly name for the source of the data (e.g. Fitbit);
#   the cleaned source column already provides that information
#6# type: will be important for user segmentation
df.loc[:, 'type'].value_counts()

health-heart-rate              8957081
health-step-distance           8543295
health-step-count              5821398
health-flights-climbed          219242
health-sleep                     52353
health-weight                     3891
health-mindfulness-duration       1811
health-body-fat                   1673
health-bmi                        1622
Name: type, dtype: int64

In [6]:
#7#8# startdate, enddate would be used as a datetime column to aggregate activity /month/week etc
#9# value column will be important to understand how active is the user
#10# originaldata-remove no information
#11#12# createdat,updatedat: will be remove- information will come from startdate and enddate anyways

#13# Timezone is important to understand user demographics.
# Split 'Continent/City' once and reuse the parts. `.str.get(i)` yields NaN when
# the timezone is missing or has no i-th part (e.g. a zone without '/'), where
# indexing inside apply(lambda ...) would raise and abort the whole cell.
# NOTE(review): for zones with three parts, 'city' receives the middle part,
# exactly as before - confirm whether the last part is wanted instead.
timezone_parts = df['timezone'].str.split('/')
df['continent'] = timezone_parts.str.get(0)
df['city'] = timezone_parts.str.get(1)

#14#15#16#18 # Only NaN values in the column
df[['mood','tags','_fivetran_deleted','content']].nunique()

mood                 0
tags                 0
_fivetran_deleted    0
content              0
dtype: int64

In [7]:
# Columns judged redundant or empty in the notes of the previous cells.
cols_to_drop = ['sourceid', 'sourcename', 'createdat','updatedat', 'originaldata', 'timezone', 'mood','tags',
                '_fivetran_deleted','_fivetran_synced','content']
# arrange columns in a meaningful way
cols = ['_id', 'owner','easy_id','source', 'type', 'startdate', 'enddate', 'value', 'continent', 'city']
# .copy() makes df_clean an independent frame, so the assignment below can never
# be a write into a view of df (no SettingWithCopyWarning).
df_clean = df.drop(columns=cols_to_drop)[cols].copy()

# Label missing values in the source column as 'Missing'.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: that chained in-place form works on a temporary object and
# does not update df_clean under pandas Copy-on-Write.
df_clean['source'] = df_clean['source'].fillna('Missing')
# Total number of missing values left in the cleaned frame.
df_clean.isna().sum().sum()

0

In [8]:
# Compare the dimensions of the working frame and the cleaned frame.
rows_before, cols_before = df.shape
rows_after, cols_after = df_clean.shape
print(f'Before cleaning: No of rows: {rows_before}, No of columns: {cols_before}')
print(f'After cleaning: No of rows:  {rows_after}, No of columns: {cols_after}')

Before cleaning: No of rows: 23602366, No of columns: 21
After cleaning: No of rows:  23602366, No of columns: 10


In [9]:
# Write the cleaned frame to disk, then read it back with the two timestamp
# columns parsed as datetimes, and preview the first two rows.
clean_csv_path = "df_clean.csv"
df_clean.to_csv(clean_csv_path, index=False)
df_clean_ = pd.read_csv(clean_csv_path, parse_dates=["startdate", "enddate"])
df_clean_.head(2)

Unnamed: 0,_id,owner,easy_id,source,type,startdate,enddate,value,continent,city
0,013dbe8e-4388-571a-a475-46cdccb95124,f9946d46-0b26-444c-bf9f-d6322c0c0ce7,0,apple,health-heart-rate,2019-05-03 14:53:31.783,2019-05-03 14:53:31.783,100.0,America,New_York
1,01cfa004-e070-59a9-9044-a33df3d0e5ed,f9946d46-0b26-444c-bf9f-d6322c0c0ce7,0,apple,health-step-distance,2019-06-28 16:59:51.148,2019-06-28 16:59:53.706,2.549228,America,New_York


In [10]:
######### Next: 2.Aggregation_AppleHealthKitData_G : aggregate AppleHealthKit Data ###########