In [33]:
# Importing dependencies
from sklearn import datasets
import pandas as pd
from pathlib import Path



In [34]:
# Fetch the breast-cancer toy dataset that ships with scikit-learn, then wrap
# its feature matrix in a DataFrame labelled with the dataset's feature names.
# (The object returned by the loader supports both attribute and key access.)
data = datasets.load_breast_cancer()
data_df = pd.DataFrame(data["data"], columns=data["feature_names"])

In [35]:
# Carve the feature columns into four arbitrary, non-overlapping groups
# (positions 0-4, 5-9, 10-16 and 17-29 of data.feature_names); the label
# vector goes into its own single-column frame named "target".
data_df1 = data_df.loc[:, data.feature_names[:5]]
data_df2 = data_df.loc[:, data.feature_names[5:10]]
data_df3 = data_df.loc[:, data.feature_names[10:17]]
data_df4 = data_df.loc[:, data.feature_names[17:30]]
target_df = pd.DataFrame({"target": data.target})

In [36]:
# One timestamp per row of data_df, spaced a day apart and ending at the
# moment this cell is executed (so the exact values differ between runs).
# The result is a single-column frame named "event_timestamp" with a default
# integer index, ready to be joined column-wise onto the feature frames.
timestamps = pd.DataFrame(
    {
        "event_timestamp": pd.date_range(
            end=pd.Timestamp.now(),
            periods=len(data_df),
            freq="D",
        )
    }
)

In [37]:
# Adding the timestamp column to each DataFrame.
#
# DataFrame.assign aligns the new column on the row index and replaces an
# existing column of the same name, so this cell is idempotent: running it a
# second time refreshes event_timestamp instead of appending a duplicate
# column to every frame.
data_df1 = data_df1.assign(event_timestamp=timestamps["event_timestamp"])
data_df2 = data_df2.assign(event_timestamp=timestamps["event_timestamp"])
data_df3 = data_df3.assign(event_timestamp=timestamps["event_timestamp"])
data_df4 = data_df4.assign(event_timestamp=timestamps["event_timestamp"])
target_df = target_df.assign(event_timestamp=timestamps["event_timestamp"])

In [38]:
# Creating a list of arbitrary IDs for feature rows: one sequential integer
# (0 .. len(data_df) - 1) per row, stored in a single "patient_id" column.
patient_ids = pd.DataFrame(data=list(range(len(data_df))), columns=["patient_id"])

# Adding the patient_id column to each DataFrame.
#
# DataFrame.assign aligns the new column on the row index and replaces an
# existing column of the same name, so this cell is idempotent: running it a
# second time does not append a duplicate patient_id column.
data_df1 = data_df1.assign(patient_id=patient_ids["patient_id"])
data_df2 = data_df2.assign(patient_id=patient_ids["patient_id"])
data_df3 = data_df3.assign(patient_id=patient_ids["patient_id"])
data_df4 = data_df4.assign(patient_id=patient_ids["patient_id"])
target_df = target_df.assign(patient_id=patient_ids["patient_id"])

In [39]:
# Preview the first five rows of the first feature frame to check its
# feature columns plus the event_timestamp and patient_id columns.
data_df1.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,event_timestamp,patient_id
0,17.99,10.38,122.8,1001.0,0.1184,2021-02-05 13:32:24.520821,0
1,20.57,17.77,132.9,1326.0,0.08474,2021-02-06 13:32:24.520821,1
2,19.69,21.25,130.0,1203.0,0.1096,2021-02-07 13:32:24.520821,2
3,11.42,20.38,77.58,386.1,0.1425,2021-02-08 13:32:24.520821,3
4,20.29,14.34,135.1,1297.0,0.1003,2021-02-09 13:32:24.520821,4


In [40]:
# Persist every frame as a CSV file under ./data (created if missing).
# The row index is omitted from the files.
Path("./data").mkdir(parents=True, exist_ok=True)

csv_outputs = {
    './data/data_df1.csv': data_df1,
    './data/data_df2.csv': data_df2,
    './data/data_df3.csv': data_df3,
    './data/data_df4.csv': data_df4,
    './data/target_df.csv': target_df,
}
for csv_path, frame in csv_outputs.items():
    frame.to_csv(csv_path, index=False)