In [1]:
%pylab --no-import-all inline

Populating the interactive namespace from numpy and matplotlib


# Most general form of cross-validation, with history
---

This provides little personalization, and still avoids the issue of using a subject's future data for prediction.

In [2]:
from os import path
import sys

import pandas as pd
import seaborn as sns

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_predict

# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

# add the 'src' directory as one where we can import modules
src_dir = path.join("..", 'src')
sys.path.append(src_dir)

# import my method from the source code
%aimport features.build_features
%aimport models.fit_predict
%aimport visualization.visualize
from features.build_features import previous_value
from models.fit_predict import cv_predict
from visualization.visualize import modified_bland_altman_plot, residual_plot

In [3]:
import keras; print(keras.__version__)

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv1D, MaxPool1D
from keras.wrappers.scikit_learn import KerasRegressor

Using TensorFlow backend.


2.0.4


In [4]:
file = path.join("..", "data", "interim", "df.csv")
df = pd.read_csv(file, index_col=0)

## Compute features

In [5]:
features = []

### $n$ days of temperature measurements.

The use case requires deleting those whose ovulation occurs before these $n$ days.

In [6]:
NUMBER_OF_DAYS = 10
df = df[df.L_PREOVULATION > NUMBER_OF_DAYS]  # No use predicting backward in time.
temp_measurements = ["TEMP" + str(i + 1) for i in range(NUMBER_OF_DAYS)]
features += temp_measurements

In [7]:
features

['TEMP1',
 'TEMP2',
 'TEMP3',
 'TEMP4',
 'TEMP5',
 'TEMP6',
 'TEMP7',
 'TEMP8',
 'TEMP9',
 'TEMP10']

In [8]:
X = df[features]
y = df.L_PREOVULATION
grouping = df.ID

## Perform regression

In [13]:
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupKFold

In [16]:
def mlp_model():
    model = Sequential()
    model.add(Dense(20, input_dim=10, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [None]:
reg = KerasRegressor(build_fn=mlp_model, 
                     epochs=20, batch_size=5, verbose=1)
imp = Imputer(strategy='mean')
scl = StandardScaler()
pipeline = Pipeline([('imp', imp), ('scl', scl), ('reg', reg)])

cv = GroupKFold(n_splits=5)

y_pred = cross_val_predict(pipeline, X, y,
                           cv=cv, groups=grouping,
                           verbose=True, n_jobs=-1)

Epoch 1/20
Epoch 1/20
Epoch 1/20
Epoch 1/20
Epoch 2/20
  180/22687 [..............................] - ETA: 20s - loss: 24.1370Epoch 2/20
  375/22687 [..............................] - ETA: 18s - loss: 18.5779
  225/22687 [..............................] - ETA: 16s - loss: 14.7950Epoch 2/20
Epoch 2/20
Epoch 3/20
    5/22687 [..............................] - ETA: 68s - loss: 5.2955Epoch 3/20
  170/22687 [..............................] - ETA: 26s - loss: 15.2456
  245/22687 [..............................] - ETA: 21s - loss: 15.2518Epoch 3/20
 2595/22688 [==>...........................] - ETA: 44s - loss: 182.6668
Epoch 4/20
  140/22687 [..............................] - ETA: 17s - loss: 19.2032Epoch 4/20
Epoch 2/20
Epoch 5/20
  195/22687 [..............................] - ETA: 19s - loss: 18.8929
Epoch 5/20
Epoch 5/20
Epoch 3/20
 3030/22688 [===>..........................] - ETA: 17s - loss: 13.5393
Epoch 6/20
 3110/22688 [===>..........................] - ETA: 16s - loss: 13.8562
Epoc

 4520/22688 [====>.........................] - ETA: 17s - loss: 15.4099Epoch 20/20
Epoch 20/20
Epoch 20/20
 5005/22688 [=====>........................] - ETA: 21s - loss: 14.3410
Epoch 19/20
Epoch 20/20

In [None]:
mean_squared_error(y_pred=y_pred, y_true=y)

In [12]:
mean_absolute_error(y_pred=y_pred, y_true=y)

2.7331466205322847

In [None]:
modified_bland_altman_plot(y_pred, y);

In [None]:
residual_plot(y_pred, y);

In [None]:
y_pred

## Discussion
---

Our features are only the first ten temperatures of the cycle and the participant's last cycle length and follicular phase length. With it, we achieve a MSE of about 12, which beats the Bortot paper's 15. In terms of use case, this is about equal to the Bortot result.

In [None]:
df.L_PERIOD.median()

Now, the median period length is 5, which means that we are really using measurements of BBT during the period to determine the day of ovulation.

Since this model has only slight personalization, it's exciting to see how well a personalized model will do.