In [1]:
%pylab --no-import-all inline

Populating the interactive namespace from numpy and matplotlib


# Most general form of cross-validation, with history
---

This provides little personalization, and still avoids the issue of using a subject's future data for prediction.

In [2]:
from os import path
import sys

import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_predict

# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

# add the 'src' directory as one where we can import modules
src_dir = path.join("..", 'src')
sys.path.append(src_dir)

# import my method from the source code
%aimport features.build_features
%aimport models.fit_predict
%aimport visualization.visualize
from features.build_features import previous_value
from models.fit_predict import cv_predict
from visualization.visualize import modified_bland_altman_plot
from sklearn import metrics
from sklearn.cluster import KMeans

#import utilities

In [3]:
file = path.join("..", "data", "interim", "df.csv")
df = pd.read_csv(file, index_col=0)


## Compute features

In [4]:
df.groupby(['ID'])['AGE'].value_counts().sum()

19385

In [5]:
df['CYCLE_ID'].count().sum()
#UPDATE: Arya's code caluclates an age for the beginning of each cycle
#because there's a different start date for each cycle UGH 

29298

In [6]:
non_null_num = len(df[~df.AGE.isnull()])
print(non_null_num/len(df))

0.6616492593351082


In [9]:
non_null_num_users = len(df[~df.AGE.isnull()].ID.unique())
print(non_null_num)#/len(df.ID.unique()))

19385


In [10]:
df[~df.AGE.isnull()].ID.unique().size

1066

In [11]:
df.ID.unique().size

1780

In [12]:
NUMBER_OF_DAYS = 10
df = df[df.L_PREOVULATION > NUMBER_OF_DAYS]  # No use predicting backward in time.
temp_measurements = ["TEMP" + str(i + 1) for i in range(NUMBER_OF_DAYS)]
features = [*temp_measurements,"past_L_CYCLE", "past_L_PREOVULATION"]

In [13]:
df['past_L_PREOVULATION'] = previous_value('L_PREOVULATION', df)
df['past_L_CYCLE'] = previous_value('L_CYCLE', df)

df.dropna(subset=[
    'past_L_PREOVULATION', 
    'past_L_CYCLE'
], inplace=True)

features += ['past_L_PREOVULATION', 'past_L_CYCLE']

### $n$ days of temperature measurements.

The use case requires deleting those whose ovulation occurs before these $n$ days.

In [14]:
NUMBER_OF_DAYS = 10
df = df[df.L_PREOVULATION > NUMBER_OF_DAYS]  # No use predicting backward in time.
temp_measurements = ["TEMP" + str(i + 1) for i in range(NUMBER_OF_DAYS)]
features += temp_measurements

In [17]:
X = df[features]
y = df.L_PREOVULATION
grouping = df.ID

## Perform regression

In [18]:
y_pred = cv_predict(X, y, grouping)

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   22.5s finished


In [None]:
mean_squared_error(y_pred=y_pred, y_true=y)

In [None]:
mean_absolute_error(y_pred=y_pred, y_true=y)

In [None]:
modified_bland_altman_plot(y_pred, y);