In [1]:
%pylab --no-import-all

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


# Evaluating the three-over-six rule

The Miolo 1993 paper, which defines the data set, explains that three-over-six was used to determine the day of ovulation. How well does that stand up? Did our data get altered along the way?

In [2]:
from os import path
import pandas as pd
from tqdm import tqdm

from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score

In [3]:
file = path.join("..", "data", "interim", "df.csv")
df = pd.read_csv(file, index_col=0)

In [4]:
# Get the min of the next three days' temperatures.
three = df[["TEMP" + str(i + 1) for i in range(99)]].shift(-3, axis=1).rolling(3, axis=1).min()
# Get the max of the six temperatures leading up to today.
six = df[["TEMP" + str(i + 1) for i in range(99)]].rolling(6, axis=1).max()
three_over_six = ((three - six) > 0).idxmax(axis=1)  # First column with positive difference = three-over-six day.
three_over_six = three_over_six.apply(lambda x: int(x[4:])).replace(1, np.nan)  # Remove 'TEMP' and convert 1s to NAs.

In [5]:
print("Total number of cycles: {}".format(len(df)))
print("No Pre-Ov calculated: {}".format(three_over_six.isnull().sum()))
print("Accuracy: {}".format(accuracy_score(y_true=df.L_PREOVULATION, y_pred=three_over_six.fillna(-1))))

Total number of cycles: 23219
No Pre-Ov calculated: 3744
Accuracy: 0.7586028683405831


In [6]:
# Another method--kind of iffy.
no_calc = 0
diff_calc = 0

for idx, row in tqdm(df.iterrows()):
    computed_L_PREOVULATION = None
    for i in range(1, 91):
        six_days = [row['TEMP'+str(i+j)] for j in range(0, 6)]
        three_days = [row['TEMP'+str(i+k)] for k in range(6, 9)]
        if min(three_days) > max(six_days):
            computed_L_PREOVULATION = i + 5
            break
    if computed_L_PREOVULATION is None:
        no_calc += 1
    elif computed_L_PREOVULATION != int(row.L_PREOVULATION):
        diff_calc += 1
total_errors = no_calc + diff_calc
print("Total number of cycles: {}".format(len(df)))
print("Total diffs: {} ({:5.2f}%)".format(total_errors, total_errors / len(df) * 100))
print("No Pre-Ov calculated: {}".format(no_calc))
print("Different calculated value: {}".format(diff_calc))
print("Accuracy: {}".format(100 * (1 - total_errors / len(df))))

23219it [00:51, 447.91it/s]

Total number of cycles: 29298
Total diffs: 6690 (22.83%)
No Pre-Ov calculated: 3152
Different calculated value: 3538
Accuracy: 77.16567683800942





Even with hindsight, this method isn't fantastic. <80% accuracy, after the fact?

In [None]:
no_calc = 0
diff_calc = 0

for idx, row in tqdm(df.iterrows()):
    computed_L_PREOVULATION = None
    for i in range(1, 91):
        six_days = [row['TEMP'+str(i+j)] for j in range(0, 6)]
        three_days = [row['TEMP'+str(i+k)] for k in range(6, 9)]
        if min(three_days) > max(six_days):
            computed_L_PREOVULATION = i + 5
            break
    if computed_L_PREOVULATION is None:
        no_calc += 1
    elif abs(computed_L_PREOVULATION - int(row.L_PREOVULATION)) > 1:
        diff_calc += 1
total_errors = no_calc + diff_calc
print("Total number of cycles: {}".format(len(df)))
print("Total big diffs: {} ({:5.2f}%)".format(total_errors, total_errors / len(df) * 100))
print("No Pre-Ov calculated: {}".format(no_calc))
print("Difference greater than 1: {}".format(diff_calc))
print("Softened accuracy: {}".format(100 * (1 - total_errors / len(df))))

And it doesn't improve much when we relax the problem, so that it can be one day off. Grrrr.

In [8]:
no_calc = 0
diff_calc = 0
three_over_six = []

for idx, row in tqdm(df.iterrows()):
    computed_L_PREOVULATION = None
    for i in range(1, 91):
        six_days = [row['TEMP'+str(i+j)] for j in range(0, 6)]
        three_days = [row['TEMP'+str(i+k)] for k in range(6, 9)]
        if min(three_days) > max(six_days):
            computed_L_PREOVULATION = i + 5
            three_over_six.append(computed_L_PREOVULATION)
            break
    if computed_L_PREOVULATION is None:
        no_calc += 1
        three_over_six.append(np.nan)
    elif computed_L_PREOVULATION != int(row.L_PREOVULATION):
        diff_calc += 1
three_over_six = pd.Series(three_over_six)

23219it [00:52, 441.32it/s]


---
### The bottom line

The dataset defines the day of ovulation as the three-over-six day. We don't see that reflected, so the data must have been altered before reaching us. The fact that the temperature measurements are totally unrealistic supports this.