# Creating basemodels

In [117]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_absolute_error, mean_squared_error

In [118]:
cycle_df = pd.read_csv('Cleaned-Data/cleaned-physiological-cycles.csv')

In [119]:
cycle_df.head()

Unnamed: 0,Cycle start time,Cycle end time,Cycle timezone,Recovery score %,Resting heart rate (bpm),Heart rate variability (ms),Day Strain,Energy burned (cal),Max HR (bpm),Average HR (bpm),...,Deep (SWS) duration (min),REM duration (min),Awake duration (min),Sleep need (min),Sleep debt (min),Sleep efficiency %,Sleep consistency %,Recovery category,day_of_week,Day of the week
0,2021-10-16 22:13:29,2021-10-17 23:26:57,UTC-04:00,58.0,37.0,69.0,16.3,3257.0,156.0,64.0,...,87.0,45.0,99.0,684.0,134.0,79.0,76.0,1,Saturday,Saturday
1,2021-10-15 23:10:05,2021-10-16 22:13:29,UTC-04:00,25.0,36.0,46.0,19.8,3607.0,195.0,70.0,...,65.0,31.0,149.0,654.0,117.0,66.0,77.0,0,Friday,Friday
2,2021-10-14 23:51:49,2021-10-15 23:10:05,UTC-04:00,45.0,38.0,66.0,17.9,3611.0,178.0,70.0,...,97.0,91.0,75.0,663.0,115.0,84.0,75.0,1,Thursday,Thursday
3,2021-10-13 23:00:18,2021-10-14 23:51:49,UTC-04:00,18.0,37.0,45.0,19.5,4247.0,178.0,73.0,...,82.0,11.0,115.0,679.0,139.0,75.0,87.0,0,Wednesday,Wednesday
4,2021-10-13 00:56:19,2021-10-13 23:00:18,UTC-04:00,44.0,37.0,73.0,18.3,3380.0,189.0,70.0,...,40.0,27.0,67.0,656.0,112.0,80.0,77.0,1,Wednesday,Wednesday


In [120]:
cycle_df.drop(['Cycle start time', 'Cycle end time', 'Cycle timezone', 'Day of the week', 'day_of_week'], axis=1, inplace=True)

In [121]:
cycle_df.head()

Unnamed: 0,Recovery score %,Resting heart rate (bpm),Heart rate variability (ms),Day Strain,Energy burned (cal),Max HR (bpm),Average HR (bpm),Sleep performance %,Respiratory rate (rpm),Asleep duration (min),In bed duration (min),Light sleep duration (min),Deep (SWS) duration (min),REM duration (min),Awake duration (min),Sleep need (min),Sleep debt (min),Sleep efficiency %,Sleep consistency %,Recovery category
0,58.0,37.0,69.0,16.3,3257.0,156.0,64.0,57.0,16.3,391.0,490.0,259.0,87.0,45.0,99.0,684.0,134.0,79.0,76.0,1
1,25.0,36.0,46.0,19.8,3607.0,195.0,70.0,44.0,15.9,290.0,439.0,194.0,65.0,31.0,149.0,654.0,117.0,66.0,77.0,0
2,45.0,38.0,66.0,17.9,3611.0,178.0,70.0,62.0,16.5,413.0,488.0,225.0,97.0,91.0,75.0,663.0,115.0,84.0,75.0,1
3,18.0,37.0,45.0,19.5,4247.0,178.0,73.0,52.0,16.6,355.0,470.0,262.0,82.0,11.0,115.0,679.0,139.0,75.0,87.0,0
4,44.0,37.0,73.0,18.3,3380.0,189.0,70.0,42.0,16.1,276.0,343.0,209.0,40.0,27.0,67.0,656.0,112.0,80.0,77.0,1


Lets try to predict Recovery scores for the reggressions and Recovery Categories for the classifiers

In [122]:
recovery_score = cycle_df['Recovery score %']
recovery_cat = cycle_df['Recovery category']
cycle_df.drop('Recovery score %', axis=1, inplace=True)

In [123]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(cycle_df, recovery_score, test_size=.2) #using .3 because the original whoop algo uses a months worth of data to start to customize the algorithm 
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(cycle_df.drop('Recovery category', axis=1), recovery_cat, test_size=.2)

In [124]:
tree_reg, tree_clf, lin_reg, log_reg = DecisionTreeRegressor(), DecisionTreeClassifier(criterion='entropy'), LinearRegression(), LogisticRegression()

In [125]:
tree_reg.fit(X_train_reg, y_train_reg)
predictions = tree_reg.predict(X_test_reg)

mae = mean_absolute_error(y_test_reg, predictions)
mse = mean_squared_error(y_test_reg, predictions)
rmse = np.sqrt(mse)

metrics_dict = {
    'Metric': ['MAE', 'MSE', 'RMSE'],
    'Value': [mae, mse, rmse]
}

metrics_df = pd.DataFrame(metrics_dict)
metrics_df

Unnamed: 0,Metric,Value
0,MAE,8.428571
1,MSE,99.47619
2,RMSE,9.973775


In [126]:
lin_reg.fit(X_train_reg, y_train_reg)
predictions = lin_reg.predict(X_test_reg)

mae = mean_absolute_error(y_test_reg, predictions)
mse = mean_squared_error(y_test_reg, predictions)
rmse = np.sqrt(mse)

metrics_dict = {
    'Metric': ['MAE', 'MSE', 'RMSE'],
    'Value': [mae, mse, rmse]
}

metrics_df = pd.DataFrame(metrics_dict)
metrics_df

Unnamed: 0,Metric,Value
0,MAE,5.926305
1,MSE,56.576969
2,RMSE,7.521766


In [127]:
log_reg.fit(X_train_clf, y_train_clf)
predictions = log_reg.predict(X_test_clf)

f1 = f1_score(y_test_clf, predictions, average='weighted')
precision = precision_score(y_test_clf, predictions, average='weighted')
recall = recall_score(y_test_clf, predictions, average='weighted')
accuracy = accuracy_score(y_test_clf, predictions)

metrics_dict = {
    'Metric': ['accuracy', 'precision', 'recall', 'f1'],
    'Value': [accuracy, precision, recall, f1]
}

metrics_df = pd.DataFrame(metrics_dict)
metrics_df

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Metric,Value
0,accuracy,0.52381
1,precision,0.535979
2,recall,0.52381
3,f1,0.522706


In [128]:
tree_clf.fit(X_train_clf, y_train_clf)
predictions = tree_clf.predict(X_test_clf)


f1 = f1_score(y_test_clf, predictions, average='weighted')
precision = precision_score(y_test_clf, predictions, average='weighted')
recall = recall_score(y_test_clf, predictions, average='weighted')
accuracy = accuracy_score(y_test_clf, predictions)

metrics_dict = {
    'Metric': ['accuracy', 'precision', 'recall', 'f1'],
    'Value': [accuracy, precision, recall, f1]
}

metrics_df = pd.DataFrame(metrics_dict)
metrics_df

Unnamed: 0,Metric,Value
0,accuracy,0.761905
1,precision,0.76912
2,recall,0.761905
3,f1,0.763732
