In [40]:
import altair as alt
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from random import randint
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans

In [41]:
#Source data: https://www.kaggle.com/arashnic/fitbit

#These two files are roll-ups of many of the smaller files in the dataset.
#The goal is to see how the columns relate and whether the amount of sleep impacts performance.
activityDF = pd.read_csv("Fitabase Data 4.12.16-5.12.16/dailyActivity_merged.csv")
sleepDF = pd.read_csv("Fitabase Data 4.12.16-5.12.16/sleepDay_merged.csv")

#The time part of SleepDay is always 12:00:00 AM, so keep only the date portion.
#That leaves SleepDay in the same text format as ActivityDate, which the joins below rely on.
print(f"Before:\n{sleepDF['SleepDay'].iloc[:5]}\n")
sleepDF["SleepDay"] = [stamp.split(" ")[0] for stamp in sleepDF["SleepDay"]]
print(f"After:\n{sleepDF['SleepDay'].iloc[:5]}")

Before:
0    4/12/2016 12:00:00 AM
1    4/13/2016 12:00:00 AM
2    4/15/2016 12:00:00 AM
3    4/16/2016 12:00:00 AM
4    4/17/2016 12:00:00 AM
Name: SleepDay, dtype: object

After:
0    4/12/2016
1    4/13/2016
2    4/15/2016
3    4/16/2016
4    4/17/2016
Name: SleepDay, dtype: object


In [42]:
#Start by checking the column types of both frames (blank line between the two listings)
print(f"{activityDF.dtypes}\n")
print(f"{sleepDF.dtypes}")

Id                            int64
ActivityDate                 object
TotalSteps                    int64
TotalDistance               float64
TrackerDistance             float64
LoggedActivitiesDistance    float64
VeryActiveDistance          float64
ModeratelyActiveDistance    float64
LightActiveDistance         float64
SedentaryActiveDistance     float64
VeryActiveMinutes             int64
FairlyActiveMinutes           int64
LightlyActiveMinutes          int64
SedentaryMinutes              int64
Calories                      int64
dtype: object

Id                     int64
SleepDay              object
TotalSleepRecords      int64
TotalMinutesAsleep     int64
TotalTimeInBed         int64
dtype: object


In [43]:
#We can see that we are mostly working with integers and floats with the exception of the two strings representing the date.

#now I can do some simple visualizations to show what different people are doing over different days



In [44]:
#To keep things from getting too messy, let's sample a few Ids and only plot those.
#Sample from the distinct Ids rather than the raw column: sampling rows can draw the
#same participant more than once, which would leave fewer than five lines on the charts.
#min() keeps the cell working even if fewer than five participants have sleep data.
distinctIds = sleepDF["Id"].drop_duplicates()
sampledIds = distinctIds.sample(n=min(5, len(distinctIds))).tolist()

#Minutes asleep per day, one line per sampled participant
sleepChart = alt.Chart(sleepDF[sleepDF["Id"].isin(sampledIds)]).mark_line().encode(
        x='SleepDay:T',
        y='TotalMinutesAsleep:Q',
        color='Id:N',
    ).interactive()
display(sleepChart)

#Total steps per day for the same participants
stepsChart = alt.Chart(activityDF[activityDF["Id"].isin(sampledIds)]).mark_line().encode(
        x='ActivityDate:T',
        y='TotalSteps:Q',
        color='Id:N',
    ).interactive()
display(stepsChart)


#you can interact, pan, and zoom with the plots below.

In [45]:
#you can rerun this box to get different id's
#.sample() returns a one-row Series, so pull the scalar out with .iloc[0];
#calling int() directly on a Series is deprecated in recent pandas releases.
sampledId = int(sleepDF["Id"].sample().iloc[0])

#red line: minutes of sleep for the sampled participant
chart1 = alt.Chart(sleepDF.loc[sleepDF["Id"] == sampledId]).mark_line(color='red').encode(
        x='SleepDay:T',
        y=alt.Y('TotalMinutesAsleep:Q', axis=alt.Axis(title="Minutes  of  Sleep")),
    ).interactive()
#blue line: total steps for the same participant
chart2 = alt.Chart(activityDF.loc[activityDF["Id"] == sampledId]).mark_line(color='blue').encode(
        x='ActivityDate:T',
        y=alt.Y('TotalSteps:Q', axis=alt.Axis(title="Total  Steps")),
    ).interactive()


#Layer the two lines with independent y scales so each series gets its own axis,
#and colour each axis title to match its line.
layered = (
    alt.layer(chart1, chart2)
    .resolve_scale(y='independent')
    .configure_axisLeft(titleColor='red')
    .configure_axisRight(titleColor='blue')
)
display(layered)

#this results in a direct comparison of sleep to steps achieved.

In [46]:
#In this plot the minutes-of-sleep line sometimes cuts off partway through the date range
#this demonstrates that we need to join our dataframes so that we only keep the rows where we have both sleep and activity data

In [47]:
#Inner join on participant and date: only rows present in BOTH frames survive,
#which drops the days that have activity data but no sleep record (and vice versa).
df = pd.merge(
    activityDF,
    sleepDF,
    how="inner",
    left_on=["Id", "ActivityDate"],
    right_on=["Id", "SleepDay"],
)
df.head(15)

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories,SleepDay,TotalSleepRecords,TotalMinutesAsleep,TotalTimeInBed
0,1503960366,4/12/2016,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,25,13,328,728,1985,4/12/2016,1,327,346
1,1503960366,4/13/2016,10735,6.97,6.97,0.0,1.57,0.69,4.71,0.0,21,19,217,776,1797,4/13/2016,2,384,407
2,1503960366,4/15/2016,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,29,34,209,726,1745,4/15/2016,1,412,442
3,1503960366,4/16/2016,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,36,10,221,773,1863,4/16/2016,2,340,367
4,1503960366,4/17/2016,9705,6.48,6.48,0.0,3.19,0.78,2.51,0.0,38,20,164,539,1728,4/17/2016,1,700,712
5,1503960366,4/19/2016,15506,9.88,9.88,0.0,3.53,1.32,5.03,0.0,50,31,264,775,2035,4/19/2016,1,304,320
6,1503960366,4/20/2016,10544,6.68,6.68,0.0,1.96,0.48,4.24,0.0,28,12,205,818,1786,4/20/2016,1,360,377
7,1503960366,4/21/2016,9819,6.34,6.34,0.0,1.34,0.35,4.65,0.0,19,8,211,838,1775,4/21/2016,1,325,364
8,1503960366,4/23/2016,14371,9.04,9.04,0.0,2.81,0.87,5.36,0.0,41,21,262,732,1949,4/23/2016,1,361,384
9,1503960366,4/24/2016,10039,6.41,6.41,0.0,2.92,0.21,3.28,0.0,39,5,238,709,1788,4/24/2016,1,430,449


In [48]:
#Try to predict total steps from minutes asleep.
#Both arrays are kept 2-D (one column each), which is the shape passed to sklearn here.
X = df[["TotalMinutesAsleep"]].to_numpy()
Y = df[["TotalSteps"]].to_numpy()

#No column of ones is appended by hand: sklearn handles the bias term for us.
reg = LinearRegression()
reg.fit(X, Y)
pred = reg.predict(X)

#Pair every fitted value with the sleep amount it was predicted from
predDF = pd.DataFrame({
    "prediction": pred[:, 0],
    "Minutes of Sleep": X[:, 0],
})
predDF

Unnamed: 0,prediction,Minutes of Sleep
0,9148.077058,327
1,8773.940657,384
2,8590.154355,412
3,9062.747703,340
4,6699.780960,700
...,...,...
408,9043.056314,343
409,7992.848872,503
410,8570.462965,415
411,7907.519518,516


In [49]:
#Scatter of the observed data with the fitted regression line drawn on top
chart1 = (
    alt.Chart(df[["TotalSteps", "TotalMinutesAsleep"]])
    .mark_point()
    .encode(x=alt.X('TotalMinutesAsleep'), y=alt.Y('TotalSteps'))
    .interactive()
)

chart2 = (
    alt.Chart(predDF.reset_index())
    .mark_line(color='red')
    .encode(x=alt.X('Minutes of Sleep'), y=alt.Y('prediction'))
    .interactive()
)

#adding two charts layers them, same as alt.layer(chart1, chart2)
display(chart1 + chart2)

In [50]:
#Sleep turns out not to be a very good indicator of how many steps a person walks.
#That raises the question of whether the recorded sleep belongs to the night before or to that night.
#If it is the night before, the dates need shifting before the two frames are joined,
#so let's see whether a shifted join gives a better prediction.

#SleepDay2 holds each SleepDay moved back by one calendar day, written without zero padding
#(e.g. 4/11/2016) so it matches the text format of ActivityDate.
#NOTE(review): joining ActivityDate to SleepDay2 pairs each activity day with the sleep record
#dated the FOLLOWING day - confirm this is the direction the hypothesis needs.
print(f"Before:\n{sleepDF['SleepDay'].iloc[:5]}\n")
oneDay = timedelta(days=1)
previousDays = [datetime.strptime(day, "%m/%d/%Y") - oneDay for day in sleepDF["SleepDay"]]
sleepDF["SleepDay2"] = [f"{dt.month}/{dt.day}/{dt.year}" for dt in previousDays]
print(f"After:\n{sleepDF['SleepDay2'].iloc[:5]}")

Before:
0    4/12/2016
1    4/13/2016
2    4/15/2016
3    4/16/2016
4    4/17/2016
Name: SleepDay, dtype: object

After:
0    4/11/2016
1    4/12/2016
2    4/14/2016
3    4/15/2016
4    4/16/2016
Name: SleepDay2, dtype: object


In [51]:
#Repeat the earlier steps, this time joining activity to the shifted sleep date
df2 = pd.merge(
    activityDF,
    sleepDF,
    how="inner",
    left_on=["Id", "ActivityDate"],
    right_on=["Id", "SleepDay2"],
)
df2.head()

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories,SleepDay,TotalSleepRecords,TotalMinutesAsleep,TotalTimeInBed,SleepDay2
0,1503960366,4/12/2016,13162,8.5,8.5,0.0,1.88,0.55,6.06,0.0,25,13,328,728,1985,4/13/2016,2,384,407,4/12/2016
1,1503960366,4/14/2016,10460,6.74,6.74,0.0,2.44,0.4,3.91,0.0,30,11,181,1218,1776,4/15/2016,1,412,442,4/14/2016
2,1503960366,4/15/2016,9762,6.28,6.28,0.0,2.14,1.26,2.83,0.0,29,34,209,726,1745,4/16/2016,2,340,367,4/15/2016
3,1503960366,4/16/2016,12669,8.16,8.16,0.0,2.71,0.41,5.04,0.0,36,10,221,773,1863,4/17/2016,1,700,712,4/16/2016
4,1503960366,4/18/2016,13019,8.59,8.59,0.0,3.25,0.64,4.71,0.0,42,16,233,1149,1921,4/19/2016,1,304,320,4/18/2016


In [52]:
#Same steps-from-sleep regression, fitted on the shifted join (df2)
X = df2[["TotalMinutesAsleep"]].to_numpy()
Y = df2[["TotalSteps"]].to_numpy()

reg = LinearRegression()
reg.fit(X, Y)
pred = reg.predict(X)

#fitted value alongside the sleep amount it came from
predDF = pd.DataFrame({
    "prediction": pred[:, 0],
    "Minutes of Sleep": X[:, 0],
})


In [53]:
#Scatter of the shifted-join data with its fitted regression line
chart1 = (
    alt.Chart(df2[["TotalSteps", "TotalMinutesAsleep"]])
    .mark_point()
    .encode(x=alt.X('TotalMinutesAsleep'), y=alt.Y('TotalSteps'))
    .interactive()
)

chart2 = (
    alt.Chart(predDF.reset_index())
    .mark_line(color='red')
    .encode(x=alt.X('Minutes of Sleep'), y=alt.Y('prediction'))
    .interactive()
)

display(chart1 + chart2)

In [54]:
#This has an even worse correlation, leading me to believe that my initial assumption, that SleepDay is how much sleep you got the night before, was correct.

In [55]:
#Let's try a few more comparisons. Next we will see whether sleep can predict calories burned and very active minutes

In [56]:
#Predict calories burned from minutes asleep (same-day join)
X = df[["TotalMinutesAsleep"]].to_numpy()
Y = df[["Calories"]].to_numpy()

reg = LinearRegression()
reg.fit(X, Y)
pred = reg.predict(X)

#fitted value alongside the sleep amount it came from
predDF = pd.DataFrame({
    "prediction": pred[:, 0],
    "Minutes of Sleep": X[:, 0],
})

In [57]:
#Scatter of calories against sleep with the fitted regression line
chart1 = (
    alt.Chart(df[["Calories", "TotalMinutesAsleep"]])
    .mark_point()
    .encode(x=alt.X('TotalMinutesAsleep'), y=alt.Y('Calories'))
    .interactive()
)

chart2 = (
    alt.Chart(predDF.reset_index())
    .mark_line(color='red')
    .encode(x=alt.X('Minutes of Sleep'), y=alt.Y('prediction'))
    .interactive()
)

display(chart1 + chart2)
#very little correlation

In [58]:
#Predict very active minutes from minutes asleep (same-day join)
X = df[["TotalMinutesAsleep"]].to_numpy()
Y = df[["VeryActiveMinutes"]].to_numpy()

reg = LinearRegression()
reg.fit(X, Y)
pred = reg.predict(X)

#fitted value alongside the sleep amount it came from
predDF = pd.DataFrame({
    "prediction": pred[:, 0],
    "Minutes of Sleep": X[:, 0],
})

In [59]:
#Scatter of very active minutes against sleep with the fitted regression line
chart1 = (
    alt.Chart(df[["VeryActiveMinutes", "TotalMinutesAsleep"]])
    .mark_point()
    .encode(x=alt.X('TotalMinutesAsleep'), y=alt.Y('VeryActiveMinutes'))
    .interactive()
)

chart2 = (
    alt.Chart(predDF.reset_index())
    .mark_line(color='red')
    .encode(x=alt.X('Minutes of Sleep'), y=alt.Y('prediction'))
    .interactive()
)

display(chart1 + chart2)

In [21]:
#This graph is a lot more interesting. At first glance it might seem that anyone who gets anywhere from 300-600 minutes of sleep is more likely to have zero very active minutes.
#I believe that those entries are simply the most common, leading to the large cluster of data points in that range, and the equally high number at zero.

In [60]:
#One last regression in a different direction: does time spent sedentary go up as sleep goes down?
X = df[["TotalMinutesAsleep"]].to_numpy()
Y = df[["SedentaryMinutes"]].to_numpy()

reg = LinearRegression()
reg.fit(X, Y)
pred = reg.predict(X)

#fitted value alongside the sleep amount it came from
predDF = pd.DataFrame({
    "prediction": pred[:, 0],
    "Minutes of Sleep": X[:, 0],
})

#observed points
chart1 = (
    alt.Chart(df[["SedentaryMinutes", "TotalMinutesAsleep"]])
    .mark_point()
    .encode(x=alt.X('TotalMinutesAsleep'), y=alt.Y('SedentaryMinutes'))
    .interactive()
)

#fitted line
chart2 = (
    alt.Chart(predDF.reset_index())
    .mark_line(color='red')
    .encode(x=alt.X('Minutes of Sleep'), y=alt.Y('prediction'))
    .interactive()
)

display(chart1 + chart2)
#There does seem to be a good correlation here: sedentary minutes trend downwards as more sleep is accrued.
#This is the first real correlation, though there are plenty of outliers.

In [61]:
#I did also want to try to link BMI to lower levels of sleep, but the BMI dataset is very lacking so I decided not to.

In [62]:
#To wrap up this analysis I will perform k-means clustering trying to predict amount of sleep using sedentary minutes and very active minutes

#To do this I have to bucket the amounts of sleep.


In [63]:
#Quartile labels for the sleep buckets; these double as the target classes
bucketLabels = list(range(4))

#qcut splits TotalMinutesAsleep into as many quantile bins as there are labels
df["sleepBucket"] = pd.qcut(
    df["TotalMinutesAsleep"],
    q=len(bucketLabels),
    labels=bucketLabels,
)

In [64]:
#Now that I have my target column I can do my clustering on my two selected variables
X = df[["SedentaryMinutes", "VeryActiveMinutes"]].values

#Four clusters, to line up with the four sleep buckets.
#random_state pins the centroid initialisation so the cluster labels (and the plots and
#commentary built on them) are the same on every run; n_init is set explicitly because
#its default differs between scikit-learn releases.
#NOTE(review): the two features are clustered on their raw minute scales - consider
#standardising them if one feature is expected to dominate the distance.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X)
df["label"] = kmeans.labels_

In [71]:
#Clustering results: each point is coloured by the cluster k-means assigned it to
plt = (
    alt.Chart(df[["SedentaryMinutes", "VeryActiveMinutes", "label"]])
    .mark_point()
    .encode(
        x=alt.X('SedentaryMinutes'),
        y=alt.Y('VeryActiveMinutes'),
        color=alt.Color('label:N'),
    )
    .interactive()
)

display(plt)

In [73]:
#Same axes as the cluster plot, but coloured by the sleep bucket for comparison
plt = (
    alt.Chart(df[["SedentaryMinutes", "VeryActiveMinutes", "label", "sleepBucket"]])
    .mark_point()
    .encode(
        x=alt.X('SedentaryMinutes'),
        y=alt.Y('VeryActiveMinutes'),
        color=alt.Color('sleepBucket:N'),
    )
    .interactive()
)
display(plt)

In [None]:
#So yeah, this didn't work out very well, but it did reveal a trend in the data: the people who had high (650-850 minutes)
#amounts of sedentary time also had the most very active time to make up for the sedentary time
#This was a fun project to end a fun class. Thank you for a wonderful semester and all of your hard work :)