# Fitbit Time Series independent project

## Deliverables

1. This notebook containing your analysis, summary, and conclusions


2. A tidied data set. The source data is a little messy: it was edited in Google Sheets and then downloaded as a csv for upload here.


3. A summarization of the data.
    - What do you make of the data?
    - What can you say about the individual who was wearing this fitness tracker?
  
  
4. Predictions for the missing two weeks' worth of data in a separate csv file.


## Environment Setup

In [1]:
import numpy as np
import pandas as pd

from datetime import datetime
from sklearn.metrics import mean_squared_error
from math import sqrt

import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import register_matplotlib_converters

import statsmodels.api as sm
from statsmodels.tsa.api import Holt

import warnings
warnings.filterwarnings("ignore")

In [2]:
# plotting defaults
plt.rc('figure', figsize=(13, 7))
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever one is installed
whitegrid_styles = [style for style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
                    if style in plt.style.available]
if whitegrid_styles:
    plt.style.use(whitegrid_styles[0])
plt.rc('font', size=16)

# Acquire

In [3]:
df = pd.read_csv('fitbit_all_activity.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'fitbit_all_activity.csv'

In [None]:
df.head()

# Prepare

In [None]:
# parse the date column and use it as a chronologically sorted index;
# df itself is left untouched
df1 = (
    df.assign(date=pd.to_datetime(df.date))
      .set_index('date')
      .sort_index()
)
df1.head()

In [None]:
df1.isna().sum()

In [None]:
df1.info()
# need to convert all columns to numbers, fixed

In [None]:
# view histograms - ok before split b/c all independent
df1.hist(figsize=(9, 9))

## split and validate split

In [None]:
# split data chronologically: first 50% train, next 30% validate, remainder test
train_size = int(len(df1) * .5)
validate_size = int(len(df1) * .3)
test_size = len(df1) - train_size - validate_size
validate_end_index = train_size + validate_size

# split into train, validation, test
# .copy() makes each split an independent frame: later cells add and delete
# columns on the splits, which on a bare slice of df1 is a chained assignment
train = df1.iloc[:train_size].copy()
validate = df1.iloc[train_size:validate_end_index].copy()
test = df1.iloc[validate_end_index:].copy()

In [None]:
# Does the length of each df equate to the length of the original df?
print(len(train) + len(validate) + len(test) == len(df))

In [None]:
# Does the first row of original df equate to the first row of train?
print(df1.head(1) == train.head(1))

In [None]:
# Is the last row of train the day before the first row of validate? And the same for validate to test?
# A notebook cell only renders its final expression, so both boundaries are
# shown in one labelled frame rather than two separate concats.
pd.concat([train.tail(1), validate.head(1), validate.tail(1), test.head(1)],
          keys=['train_end', 'validate_start', 'validate_end', 'test_start'])

In [None]:
# Is the last row of test the same as the last row of our original dataframe?
pd.concat([test.tail(1), df1.tail(1)])

In [None]:
# one figure per column, drawing the three splits one after another
for col in train.columns:
    fig, ax = plt.subplots(figsize=(12, 4))
    for split in (train, validate, test):
        ax.plot(split[col])
    ax.set_ylabel(col)
    ax.set_title(col)
    plt.show()

# Explore

In [None]:
cb = train.cal_burned
cb.head()

In [None]:
steps = train.steps
steps.head()

In [None]:
cb.plot.hist()

In [None]:
steps.plot.hist()

In [None]:
type(train.index)

In [None]:
train['month'] = train.index.month
train.groupby('month').cal_burned.mean().plot.bar()

In [None]:
train.groupby('month').steps.mean().plot.bar()

In [None]:
train['weekday'] = train.index.day_name()
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sns.boxplot(data=train, y='cal_burned', x='weekday', order=order)

In [None]:
sns.boxplot(data=train, y='steps', x='weekday', order=order)

In [None]:
sns.boxplot(data=train, y='min_sed', x='weekday', order=order)

In [None]:
sns.boxplot(data=train, y='min_light', x='weekday', order=order)

In [None]:
sns.boxplot(data=train, y='min_fair', x='weekday', order=order)

In [None]:
sns.boxplot(data=train, y='min_very', x='weekday', order=order)

In [None]:
cb.plot()

In [None]:
cb.resample('3D').mean().plot(title='3 Day average')

In [None]:
cb.resample('W').mean().plot(title='week average')

In [None]:
steps = train.steps
steps.head()

In [None]:
steps.plot()

In [None]:
steps.resample('3D').mean().plot(title='3 Day average')

In [None]:
steps.resample('W').mean().plot(title='week average')

In [None]:
plt.scatter(cb, cb.shift(-1))
plt.xlabel('$cb$')
plt.ylabel('$cb_{t + 1}$')
plt.title('Lag plot with lag=1')

In [None]:
# del train['cb(t + 1)']
# del train['month']
train.head()

In [None]:
train[['min_sed', 'min_light', 'min_fair', 'min_very']].resample('M').sum().plot.bar()

In [None]:
# weekly means of the numeric columns only: the 'weekday' label column added
# to train earlier in this section is text and cannot be averaged
weekly = train.select_dtypes('number').resample('W').mean()
weekly['the_next_week'] = weekly.cal_burned.shift(-1)
weekly = weekly.rename(columns={'cal_burned': 'this_week'})
weekly.plot.scatter(x='this_week', y='the_next_week')
weekly

**Takeaways**     
- there is not a cyclical/seasonal pattern to this data
- slight linear upward trend
- dataset is too small for cycle or seasonal trend to show

# Model

## Functions to help with modeling

In [None]:
# evaluation function to compute rmse
def evaluate(target_var):
    """Return the RMSE of yhat_df[target_var] against validate[target_var].

    Reads the notebook-level ``validate`` and ``yhat_df`` frames at call time
    and rounds the result to 0 decimal places.
    """
    mse = mean_squared_error(validate[target_var], yhat_df[target_var])
    return round(sqrt(mse), 0)

# plot and evaluate 
def plot_and_eval(target_var):
    """Plot train, validate and the current predictions for one column and print its RMSE.

    Reads the notebook-level ``train``, ``validate`` and ``yhat_df`` frames;
    the RMSE comes from ``evaluate(target_var)``.
    """
    plt.figure(figsize = (12,4))
    plt.plot(train[target_var], label = 'Train', linewidth = 1)
    plt.plot(validate[target_var], label = 'Validate', linewidth = 1)
    plt.plot(yhat_df[target_var], label = 'Prediction')
    plt.title(target_var)
    # the labels above are only rendered once a legend is requested
    plt.legend()
    rmse = evaluate(target_var)
    print(target_var, '-- RMSE: {:.0f}'.format(rmse))
    plt.show()
    
# Create the empty dataframe
eval_df = pd.DataFrame(columns=['model_type', 'target_var', 'rmse'])

# function to store rmse for comparison purposes
def append_eval_df(model_type, target_var):
    """Return eval_df with one extra row scoring the current predictions.

    model_type : label stored in the 'model_type' column
    target_var : column name passed to evaluate() and stored in 'target_var'

    The notebook-level eval_df is not modified; callers rebind it to the
    returned frame.
    """
    rmse = evaluate(target_var)
    row = pd.DataFrame({'model_type': [model_type], 'target_var': [target_var], 'rmse': [rmse]})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
    return pd.concat([eval_df, row], ignore_index = True)

## Forecasting

### Last Observed Value

In [None]:
# columns not needed for modeling; 'month' and 'weekday' were only added to
# train during exploration
unused_cols = ['distance', 'floors', 'activity_cal', 'month', 'weekday']

# drop(..., errors='ignore') keeps this cell re-runnable: `del` raises
# KeyError the second time because the column is already gone
train = train.drop(columns = unused_cols, errors = 'ignore')
validate = validate.drop(columns = unused_cols, errors = 'ignore')
test = test.drop(columns = unused_cols, errors = 'ignore')

train.head()

In [None]:
# get last observed value and set that as prediction for all in validate
# .iloc[-1] is explicit positional access; `[-1:][0]` relies on integer keys
# falling back to positions on a date-indexed Series, which pandas has deprecated
cal_burned = train['cal_burned'].iloc[-1]
steps = train['steps'].iloc[-1]
min_sed = train['min_sed'].iloc[-1]
min_light = train['min_light'].iloc[-1]
min_fair = train['min_fair'].iloc[-1]
min_very = train['min_very'].iloc[-1]

yhat_df = pd.DataFrame({'cal_burned': [cal_burned], 'steps': [steps], 'min_sed': [min_sed], 
                        'min_light': [min_light], 'min_fair': [min_fair], 'min_very': [min_very]}, 
                       index = validate.index)

yhat_df.head(2)

In [None]:
train.tail(1)

In [None]:
# plot predicted values
for col in train.columns:
    plot_and_eval(col)

In [None]:
for col in train.columns:
    eval_df = append_eval_df(model_type = 'last_observed_value', 
                             target_var = col)

In [None]:
eval_df

### Simple Average

In [None]:
# the train-set mean of each target becomes the prediction for all of validate
cal_burned, steps, min_sed, min_light, min_fair, min_very = (
    train[name].mean()
    for name in ('cal_burned', 'steps', 'min_sed', 'min_light', 'min_fair', 'min_very')
)

# create function for later repetition
def make_predictions():
    """Build a prediction frame on validate's index from the notebook-level
    cal_burned, steps, min_sed, min_light, min_fair and min_very values
    as they are at call time."""
    constants = {
        'cal_burned': [cal_burned],
        'steps': [steps],
        'min_sed': [min_sed],
        'min_light': [min_light],
        'min_fair': [min_fair],
        'min_very': [min_very],
    }
    return pd.DataFrame(constants, index = validate.index)

yhat_df = make_predictions()

In [None]:
yhat_df.head(2)

In [None]:
for col in train.columns:
    plot_and_eval(col)

In [None]:
for col in train.columns:
    eval_df = append_eval_df(model_type='simple_average', 
                             target_var = col)

In [None]:
eval_df

### 30 day rolling average

In [None]:
# forecast each target with the final value of its 30 period rolling mean
# over train, rounded to one decimal place
period = 30

cal_burned, steps, min_sed, min_light, min_fair, min_very = (
    round(train[name].rolling(period).mean().iloc[-1], 1)
    for name in ('cal_burned', 'steps', 'min_sed', 'min_light', 'min_fair', 'min_very')
)

yhat_df = make_predictions()
yhat_df.head(3)

In [None]:
for col in train.columns:
    plot_and_eval(col)

In [None]:
for col in train.columns:
    eval_df = append_eval_df(model_type='30d moving average', 
                             target_var = col)

In [None]:
eval_df

#### additional rolling average periods

In [None]:
periods = [1, 4, 12, 26, 52, 104]

for p in periods:
    # a window longer than train has no complete rolling mean (NaN), which
    # cannot be scored, so such periods are skipped
    if p > len(train):
        continue
    # the window must be the loop value p; using the leftover global `period`
    # would score the same forecast under every label
    cal_burned = round(train['cal_burned'].rolling(p).mean().iloc[-1], 1)
    steps = round(train['steps'].rolling(p).mean().iloc[-1], 1)
    min_sed = round(train['min_sed'].rolling(p).mean().iloc[-1], 1)
    min_light = round(train['min_light'].rolling(p).mean().iloc[-1], 1)
    min_fair = round(train['min_fair'].rolling(p).mean().iloc[-1], 1)
    min_very = round(train['min_very'].rolling(p).mean().iloc[-1], 1)
    yhat_df = make_predictions()
    model_type = str(p) + 'd moving average'
    for col in train.columns:
        eval_df = append_eval_df(model_type = model_type, 
                                 target_var = col)

In [None]:
eval_df

### Best Model so far

In [None]:
# get the min rmse for each variable
# NOTE: the positional per-variable filter below is disabled (kept for
# reference only); the groupby on the last line of this cell is what runs.

# min_rmse_cb = eval_df.groupby('target_var')['rmse'].min()[0]
# min_rmse_min_fair = eval_df.groupby('target_var')['rmse'].min()[1]
# min_rmse_min_light = eval_df.groupby('target_var')['rmse'].min()[2]
# min_rmse_min_sed = eval_df.groupby('target_var')['rmse'].min()[3]
# min_rmse_min_very = eval_df.groupby('target_var')['rmse'].min()[-2]
# min_rmse_steps = eval_df.groupby('target_var')['rmse'].min()[-1]

# # filter only the rows that match those rmse to find out 
# # which models are best thus far
# eval_df[((eval_df.rmse == min_rmse_cb) | 
#          (eval_df.rmse == min_rmse_min_fair) | (eval_df.rmse == min_rmse_min_light) | 
#          (eval_df.rmse == min_rmse_min_sed) | (eval_df.rmse == min_rmse_min_very) | 
#          (eval_df.rmse == min_rmse_steps)
#         )]

# lowest rmse recorded for each (target_var, model_type) pair
eval_df.groupby(['target_var', 'model_type'])[['rmse']].min()

- Best so far = any of rolling average

### Holts Linear Trend

In [None]:
for col in train.columns:
    print(col,'\n')
    _ = sm.tsa.seasonal_decompose(train[col].resample('D').mean()).plot()
    plt.show()

In [None]:
# start from an empty frame on validate's index so this cell does not depend
# on whichever yhat_df the previously executed cell happened to leave behind
yhat_df = pd.DataFrame(index = validate.index)

# NOTE(review): newer statsmodels releases rename `damped` -> `damped_trend`
# and `smoothing_slope` -> `smoothing_trend`; confirm against the installed version
for col in train.columns:
    model = Holt(train[col], exponential = False, damped=True)
    model = model.fit(smoothing_level = .1, 
                      smoothing_slope = .1, 
                      optimized = True)
    # forecast across the validate date range
    yhat_items = model.predict(start = validate.index[0], 
                               end = validate.index[-1])
    yhat_df[col] = round(yhat_items, 2)

In [None]:
yhat_df.head()

In [None]:
for col in train.columns:
    plot_and_eval(target_var = col)

In [None]:
for col in train.columns:
    eval_df = append_eval_df(model_type = 'Holts', 
                             target_var = col)

In [None]:
eval_df

### Predict on previous cycle

Not going to use this model because the data does not show a seasonal or cyclical pattern.

## Best model evaluation and test

- best performing models are the rolling average, no difference between # of days
- will use 7 days and try on test data

In [None]:
eval_df.groupby(['target_var', 'model_type'])[['rmse']].min()

In [None]:
# validate-set scores of one rolling-average model; the test-set scores are
# appended to this frame further down
# NOTE(review): this selects the '4d' rows while the final model below uses a
# 7 day window - confirm which baseline is intended
test_eval = pd.DataFrame(eval_df[eval_df.model_type == '4d moving average'])
test_eval

In [None]:
# concat train with validate to create predictions for test
# pd.concat takes a list of frames as its first argument; passing the frames
# as two positional arguments raises a TypeError
train_val = pd.concat([train, validate])

In [None]:
# compute a 7 day rolling average over train + validate,
# i.e. the most recent data available before the test period starts
period = 7

# train alone ends where validate begins, so its final week is stale by the
# whole validate span; forecast test from the combined history instead
history = pd.concat([train, validate])

cal_burned = round(history['cal_burned'].rolling(period).mean().iloc[-1], 1)
steps = round(history['steps'].rolling(period).mean().iloc[-1], 1)
min_sed = round(history['min_sed'].rolling(period).mean().iloc[-1], 1)
min_light = round(history['min_light'].rolling(period).mean().iloc[-1], 1)
min_fair = round(history['min_fair'].rolling(period).mean().iloc[-1], 1)
min_very = round(history['min_very'].rolling(period).mean().iloc[-1], 1)

yhat_df = pd.DataFrame({'cal_burned': [cal_burned], 'steps': [steps], 'min_sed': [min_sed], 
                        'min_light': [min_light], 'min_fair': [min_fair], 'min_very': [min_very]}, 
                           index = test.index)
yhat_df.head(3)

In [None]:
def final_plot(target_var):
    """Plot train, validate, test and the current yhat_df forecast for one column.

    Reads the notebook-level ``train``, ``validate``, ``test`` and ``yhat_df``
    frames at call time.
    """
    plt.figure(figsize=(12,4))
    plt.plot(train[target_var], label='train')
    plt.plot(validate[target_var], label = 'validate')
    plt.plot(test[target_var], label = 'test')
    plt.plot(yhat_df[target_var], alpha=.5, label = 'forecast')
    plt.title(target_var)
    # the labels above are only rendered once a legend is requested
    plt.legend()
    plt.show()

In [None]:
for col in train.columns:
    final_plot(col)

In [None]:
# unrounded test-set RMSE for every target, in the same order as the names
(rmse_cal_burned, rmse_steps, rmse_min_sed,
 rmse_min_light, rmse_min_fair, rmse_min_very) = (
    sqrt(mean_squared_error(test[name], yhat_df[name]))
    for name in ('cal_burned', 'steps', 'min_sed', 'min_light', 'min_fair', 'min_very')
)

print('rmse_cal_burned=', rmse_cal_burned)
print('rmse_steps=', rmse_steps)
print('rmse_min_sed=', rmse_min_sed)
print('rmse_min_light=', rmse_min_light)
print('rmse_min_fair=', rmse_min_fair)
print('rmse_min_very=', rmse_min_very)

In [None]:
def final_evaluate(target_var):
    """Return the RMSE of yhat_df[target_var] against test[target_var].

    Reads the notebook-level ``test`` and ``yhat_df`` frames at call time and
    rounds the result to 0 decimal places.
    """
    mse = mean_squared_error(test[target_var], yhat_df[target_var])
    return round(sqrt(mse), 0)

In [None]:
# function to store rmse for comparison purposes
def final_append_eval_df(model_type, target_var):
    """Return test_eval with one extra row holding the test-set RMSE.

    model_type : label stored in the 'model_type' column
    target_var : column name passed to final_evaluate() and stored in 'target_var'

    The notebook-level test_eval is not modified; callers rebind it to the
    returned frame.
    """
    rmse = final_evaluate(target_var)
    row = pd.DataFrame({'model_type': [model_type], 'target_var': [target_var], 'rmse': [rmse]})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
    return pd.concat([test_eval, row], ignore_index = True)

In [None]:
# add one test-set row per target to the comparison table
# (model label spelling corrected from 'everage')
for col in train.columns:
    test_eval = final_append_eval_df(model_type = '7d rolling average', 
                             target_var = col)

In [None]:
test_eval

In [None]:
# to predict 2018

# yhat_df = test + train.diff(365).mean()
# yhat_df.index = test.index + pd.Timedelta('1Y')

In [None]:
# compute a 7 day rolling average using test, 
# predict out 2 weeks based on this average for next 2 weeks predictions
period = 7

cal_burned = round(test['cal_burned'].rolling(period).mean().iloc[-1], 1)
steps = round(test['steps'].rolling(period).mean().iloc[-1], 1)
min_sed = round(test['min_sed'].rolling(period).mean().iloc[-1], 1)
min_light = round(test['min_light'].rolling(period).mean().iloc[-1], 1)
min_fair = round(test['min_fair'].rolling(period).mean().iloc[-1], 1)
min_very = round(test['min_very'].rolling(period).mean().iloc[-1], 1)

# forecast dates are the 14 days immediately after the last observed day;
# shifting all of test.index by two weeks would instead give one row per test
# day, most of them on dates that already have real observations
forecast_index = pd.date_range(start = test.index[-1] + pd.Timedelta(days = 1),
                               periods = 14, freq = 'D', name = test.index.name)

yhat_df = pd.DataFrame({'cal_burned': [cal_burned], 'steps': [steps], 'min_sed': [min_sed], 
                        'min_light': [min_light], 'min_fair': [min_fair], 'min_very': [min_very]}, 
                           index = forecast_index)
yhat_df.head(3)

In [None]:
# visual with future 2 week forecast
for col in train.columns:
    final_plot(col)

# Deliverables

## Summary of Data

## Conclusions

## 2 week prediction csv

In [None]:
# create csv of 2 week predictions
# written with a .csv extension so the deliverable opens as a csv file
yhat_df.to_csv('predictions_2weeks.csv')