In [1]:
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


# Load the "autoreload" IPython extension.
%load_ext autoreload

# Mode 1: reload only the modules registered with "%aimport" before running code.
%autoreload 1

# Make the project's 'src' directory importable. The path is relative, so it
# is resolved against the kernel's current working directory.
from os import path
import sys
src_dir = path.join("..", 'src')
sys.path.append(src_dir)

# Register the project modules for autoreload, then import the helper that
# this notebook calls directly.
%aimport features.build_features
%aimport models.fit_predict
%aimport visualization.visualize
from features.build_features import previous_value

In [2]:
# Read the interim dataset; the first CSV column is used as the index.
file = Path("..", "data", "interim", "df.csv")
df = pd.read_csv(file, index_col=0)

In [3]:
# Show every column whose name does not carry the "TEMP" prefix.
df.columns[~df.columns.str.startswith("TEMP")].tolist()

['ID',
 'SEGMENT_ID',
 'CYCLE_ID',
 'BIRTH_YR',
 'BEGIN_DATE',
 'N_SEGMENTS',
 'N_CYCLES',
 'L_CYCLE',
 'L_PREOVULATION',
 'L_PERIOD',
 'CHILDREN',
 'AGE']

In [4]:
# Seed list of predictor column names; later cells append to it.
features = ["AGE", "L_PERIOD"]

In [5]:
# Add lagged versions of the cycle-length columns as predictors.
# `previous_value` is imported from features.build_features; presumably it
# yields, for each row, the column's value from the preceding cycle and NaN
# when there is none -- TODO confirm against its implementation.
lagged_features = ['past_L_PREOVULATION', 'past_L_CYCLE']

df['past_L_PREOVULATION'] = previous_value('L_PREOVULATION', df)
df['past_L_CYCLE'] = previous_value('L_CYCLE', df)

# Rows missing either lagged value cannot be used and are discarded.
df.dropna(subset=lagged_features, inplace=True)

# Register each new column only once: re-running this cell must not append
# the same names again, which would produce duplicated columns in X.
features += [name for name in lagged_features if name not in features]

In [6]:
# Hide temperatures after ovulation.

def censor_row(row: pd.Series) -> pd.Series:
    """Blank out the temperatures recorded after the pre-ovulation phase.

    Every label of the form ``TEMP<day>`` whose day number is strictly greater
    than ``int(row.L_PREOVULATION)`` is set to NaN. The columns are taken from
    the row's own index, so rows with fewer (or more) than 99 temperature
    columns are handled without addressing labels that do not exist.

    Parameters
    ----------
    row : pd.Series
        One record; must contain ``L_PREOVULATION``.

    Returns
    -------
    pd.Series
        A censored copy of ``row`` with the same index. The input is left
        untouched because ``DataFrame.apply`` does not support functions that
        mutate the object they are given.

    Raises
    ------
    ValueError
        If ``L_PREOVULATION`` cannot be converted to ``int`` (e.g. it is NaN).
    """
    prefix = "TEMP"
    l_preov = int(row.L_PREOVULATION)

    columns_to_censor = [
        label
        for label in row.index
        if isinstance(label, str)
        and label.startswith(prefix)
        and label[len(prefix):].isdecimal()
        and int(label[len(prefix):]) > l_preov
    ]

    censored = row.copy()
    if columns_to_censor:
        censored[columns_to_censor] = np.nan
    return censored

# Censor row by row: with axis=1 each row is passed to censor_row as a Series.
df = df.apply(censor_row, axis=1)

In [7]:
# Remove every column that holds no value in any row, i.e. days for which no
# temperature remains for any cycle.
df.dropna(axis=1, how="all", inplace=True)

In [8]:
# Collect the remaining temperature columns and register them as predictors.
temperature_columns = [col for col in df.columns if col.startswith("TEMP")]
# Skip names that are already registered so that re-running this cell does not
# duplicate them (duplicates would yield repeated columns in X).
features += [col for col in temperature_columns if col not in features]

In [9]:
# Split the frame into model inputs, target and grouping key.
# NOTE(review): temperatures after day L_PREOVULATION were set to NaN, so the
# position of the first censored day in X may encode the target y -- confirm
# that downstream models cannot exploit this.
X = df.loc[:, features]
y = df["L_PREOVULATION"]
grouping = df["ID"]

In [10]:
# Persist the model inputs, target and grouping key as CSV files.
destination = ["..", "data", "processed"]

# Build the output directory with pathlib (as done when loading the data) and
# create it when missing, so the writes do not fail on a fresh checkout.
destination_dir = Path(*destination)
destination_dir.mkdir(parents=True, exist_ok=True)

X.to_csv(destination_dir / "X.csv")
y.to_csv(destination_dir / "y.csv")
grouping.to_csv(destination_dir / "grouping.csv")