# ML Pipeline for Accelerometer Data

First just dabble with accelerometer data.

We will use it for seizure detection, and then seizure prediction. 

Using autoML approach plus feature set (either TSFresh or pycatch22)

This notebook is for getting things to work.

In [None]:
# Import Packages
import sys
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import io 
import re
from scipy.fft import fft, fftfreq
from pathlib import Path

import RandFuncs
import pycatch22
import autosklearn.classification
from sklearn.model_selection import train_test_split
from joblib import dump, load
from sklearn import metrics
import pickle

: 

In [19]:
# Get all parquet files in the directory and subdirectory. 
# https://stackoverflow.com/questions/2909975/python-list-directory-subdirectory-and-files

# root is the folder/directory of the patient.  
root = Path(os.getcwd()+ '/Ignore/Output/1110/')
files = [os.path.join(path, name) for path, subdirs, files in os.walk(root) for name in files if name[-8:]==".parquet"]
# Sort Path by Ascending Order
files.sort(reverse=False)
# Print a few of the file paths as a check
print(files[0:5])

['/Users/pokhims/Documents/Coding/PK_EForecasting/Ignore/Output/1110/060/UTC-2020_05_25-23_20_00.parquet', '/Users/pokhims/Documents/Coding/PK_EForecasting/Ignore/Output/1110/060/UTC-2020_05_25-23_30_00.parquet', '/Users/pokhims/Documents/Coding/PK_EForecasting/Ignore/Output/1110/060/UTC-2020_05_25-23_40_00.parquet', '/Users/pokhims/Documents/Coding/PK_EForecasting/Ignore/Output/1110/060/UTC-2020_05_25-23_50_00.parquet', '/Users/pokhims/Documents/Coding/PK_EForecasting/Ignore/Output/1110/060/UTC-2020_05_26-00_00_00.parquet']


In [20]:
# First load one of the parquet files of the patient
file_path = files[0]
data = pd.read_parquet(file_path,engine='pyarrow')
# Change first column to datetime format
data['utc_timestamp'] = pd.to_datetime(data['utc_timestamp'], utc=True, unit='s')

data

Unnamed: 0,utc_timestamp,acc_x,acc_y,acc_z,acc_mag,bvp,eda,hr,temp
0,2020-05-25 23:20:00+00:00,,,,,,,,
1,2020-05-25 23:20:00.007812864+00:00,,,,,,,,
2,2020-05-25 23:20:00.015624960+00:00,,,,,,,,
3,2020-05-25 23:20:00.023437824+00:00,,,,,,,,
4,2020-05-25 23:20:00.031249920+00:00,,,,,,,,
...,...,...,...,...,...,...,...,...,...
76795,2020-05-25 23:29:59.960937472+00:00,-0.997473,62.996995,6.002834,62.291661,-11.010712,0.0,123.394669,
76796,2020-05-25 23:29:59.968750336+00:00,-0.997473,62.996995,6.002834,62.291661,-24.282581,0.0,123.421586,
76797,2020-05-25 23:29:59.976562432+00:00,-0.997473,62.996995,6.002834,62.291661,-32.718963,0.0,123.448503,
76798,2020-05-25 23:29:59.984375296+00:00,-0.997473,62.996995,6.002834,62.291661,-36.834271,0.0,123.475420,


In [21]:
# As the patient "only has 32GB of data", we could probably load it all into memory.

# Iterate through all the files and turn it into a single dataframe
# Load all the DataFrames into an Empty List and Concat at end for efficiency.  
# Initialise an empty List
li = []
# Load the labels
labels = pd.read_csv('/Users/pokhims/Documents/Coding/PK_EForecasting/Ignore/Output/train_labels.csv')

for file_path in files:

    # First load one of the parquet files of the patient
    data = pd.read_parquet(file_path,engine='pyarrow')
    # Change first column to datetime format
    data['utc_timestamp'] = pd.to_datetime(data['utc_timestamp'], utc=True, unit='s')

    # Add in the training labels. 
    # No clue which ones are the updated ones, so just use the old ones. 
    # Get rid of the os.cwd() component of the path to match it with the labels csv
    path = file_path.replace(os.getcwd()+'/Ignore/Output/', '')
    label = labels[['label']].loc[labels['filepath'] == str(path)].values[0]

    data['label'] = np.repeat(label, repeats = data.shape[0])

    # Perform NAN treatment later

    # Append to list
    li.append(data)

# Convert to df
data = pd.concat(li,axis=0,ignore_index=True)
# Sort by the time
data = data.sort_values(by="utc_timestamp")

# Return data
data

Unnamed: 0,utc_timestamp,acc_x,acc_y,acc_z,acc_mag,bvp,eda,hr,temp,label
0,2020-05-25 23:20:00+00:00,,,,,,,,,0
1,2020-05-25 23:20:00.007812864+00:00,,,,,,,,,0
2,2020-05-25 23:20:00.015624960+00:00,,,,,,,,,0
3,2020-05-25 23:20:00.023437824+00:00,,,,,,,,,0
4,2020-05-25 23:20:00.031249920+00:00,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...
19967995,2020-08-01 16:49:59.960937472+00:00,,,,,,,,,0
19967996,2020-08-01 16:49:59.968750336+00:00,,,,,,,,,0
19967997,2020-08-01 16:49:59.976562432+00:00,,,,,,,,,0
19967998,2020-08-01 16:49:59.984375296+00:00,,,,,,,,,0


In [22]:
# Let's just keep as much as possible and the Nans for now.

# # Drop temp for now as it's all nans for this data file
# data = data.drop(columns=['temp'])

# # Drop EDA - I have 0 belief that this will be useful
# data = data.drop(columns=['eda'])

# # If acc_mag is a Nan drop it. 
# data = data.dropna(subset=['acc_mag'])

In [23]:
data

Unnamed: 0,utc_timestamp,acc_x,acc_y,acc_z,acc_mag,bvp,eda,hr,temp,label
0,2020-05-25 23:20:00+00:00,,,,,,,,,0
1,2020-05-25 23:20:00.007812864+00:00,,,,,,,,,0
2,2020-05-25 23:20:00.015624960+00:00,,,,,,,,,0
3,2020-05-25 23:20:00.023437824+00:00,,,,,,,,,0
4,2020-05-25 23:20:00.031249920+00:00,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...
19967995,2020-08-01 16:49:59.960937472+00:00,,,,,,,,,0
19967996,2020-08-01 16:49:59.968750336+00:00,,,,,,,,,0
19967997,2020-08-01 16:49:59.976562432+00:00,,,,,,,,,0
19967998,2020-08-01 16:49:59.984375296+00:00,,,,,,,,,0


In [24]:
# Want to transform from Cartesian to Polar Coordinates

# First get the magnitude of the acceleration
# For some reason acc_mag is different from the calcualted magnitude.  By roughly 1.  My guess is that their mag was calculated after a normalisation step which removed the mean of 1g. 
# print(data["acc_mag"] - np.sqrt(data["acc_x"]**2 + data["acc_y"]**2 + data["acc_z"]**2))
# Therefore, replace it with a calculated one
data["acc_mag"] = np.sqrt(data["acc_x"]**2 + data["acc_y"]**2 + data["acc_z"]**2)

# Then get theta
data["acc_theta"] = np.arctan2(data["acc_y"],data["acc_x"])

# Then get phi
data["acc_phi"] = np.arccos(data["acc_z"]/data["acc_mag"])

In [25]:
print(np.min(data["acc_z"]/data["acc_mag"]))
print(np.max(data["acc_z"]/data["acc_mag"]))

-0.9999959374196743
0.9999994077848491


In [26]:
data

# Might need to consider z-score normalisation.  But only for selected columns. 
# Catch22 normalises for most of the features. Not sure if autosklearn does it automatically.
# Normalisation probably needs to occur after a co-ordinate transform?

Unnamed: 0,utc_timestamp,acc_x,acc_y,acc_z,acc_mag,bvp,eda,hr,temp,label,acc_theta,acc_phi
0,2020-05-25 23:20:00+00:00,,,,,,,,,0,,
1,2020-05-25 23:20:00.007812864+00:00,,,,,,,,,0,,
2,2020-05-25 23:20:00.015624960+00:00,,,,,,,,,0,,
3,2020-05-25 23:20:00.023437824+00:00,,,,,,,,,0,,
4,2020-05-25 23:20:00.031249920+00:00,,,,,,,,,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
19967995,2020-08-01 16:49:59.960937472+00:00,,,,,,,,,0,,
19967996,2020-08-01 16:49:59.968750336+00:00,,,,,,,,,0,,
19967997,2020-08-01 16:49:59.976562432+00:00,,,,,,,,,0,,
19967998,2020-08-01 16:49:59.984375296+00:00,,,,,,,,,0,,


In [27]:
# Count number of rows with Nans
print("Rows with nans = ",np.isnan(data).any(axis=1).sum())
print(np.isnan(data).any(axis=1).sum()/data.shape[0])

Rows with nans =  19098280
0.9564443108974359


In [28]:
# plt.figure(figsize=(20,10))
# plt.plot(data['utc_timestamp'],data['acc_x'])
# plt.plot(data['utc_timestamp'],data['acc_y'])
# plt.plot(data['utc_timestamp'],data['acc_z'])
# plt.plot(data['utc_timestamp'],data['acc_mag'])
# plt.grid()

In [29]:
# plt.figure(figsize=(20,10))
# plt.plot(data['utc_timestamp'],data['acc_mag'])
# plt.plot(data['utc_timestamp'],data['acc_theta'])
# plt.plot(data['utc_timestamp'],data['acc_phi'])
# plt.grid()

In [30]:
window_size = 10
f_s = 128

# Specify columns to use
cols = ["acc_mag","acc_theta","acc_phi",]
# Extract relevant columns
#data = data[cols + ["utc_timestamp"]]   

# Figure out how many elements goes into each window.
number = int(window_size*f_s)

# Length of the data_file
file_len = data.shape[0]

# Initialize list to save data per window
li = []

# Loop through the data file, creating windows of data and running Catch22.
for k in np.arange(np.floor(file_len/number)):

    # Create or specify the window of data.
    # Just in case, maybe the 10min data file is completely empty or something, we need to do a try-except to move on.
    try:
        # Specify window
        window = data.iloc[int(k*number):int((k+1)*number),:]
    except:
        continue

    # Basic checks which might throw errors.  If they do, we just skip that window.
    # If array is empty skip it.
    if window.size==0:
        continue
    # If array has Nans skip it.  - This is not ideal, but just want to get something going quickly. 
    # elif np.isnan(window).any().any():
    #     continue

    # For each column of dataframe apply catch22
    li1 = []
    for i in cols:
        features = pycatch22.catch22_all(window[i], catch24=True, )
        li1.append(features["values"])  

    # Flatten the list of lists
    li1 = [item for sublist in li1 for item in sublist]
    # Record the utc_timestamp of the start of each window
    li1.append(window['utc_timestamp'].iloc[0])    
    # Record the label of the start of each window
    li1.append(window['label'].iloc[0])

    # # Record the utc_timestamp of the start of each window
    # li1.append(window['utc_timestamp'].iloc[0])
    # # Turn this list into an array 
    # li1 = np.array(li1)
    # # Flatten this first list
    # li1 = li1.flatten()

    # Add to list saving data per window
    li.append(li1)

# Turn the list into the appropriate pd.DataFrame
# Create column names
feature_names = np.loadtxt("Catch22_Featurenames",dtype=str)
col_names = [i + "_" + j for i in cols for j in feature_names]
col_names = col_names + ["utc_timestamp"] + ["label"]

# Create dataframe
df = pd.DataFrame(li,columns=col_names)

KeyboardInterrupt: 

In [None]:
df

Unnamed: 0,acc_mag_DN_HistogramMode_5,acc_mag_DN_HistogramMode_10,acc_mag_CO_f1ecac,acc_mag_CO_FirstMin_ac,acc_mag_CO_HistogramAMI_even_2_5,acc_mag_CO_trev_1_num,acc_mag_MD_hrv_classic_pnn40,acc_mag_SB_BinaryStats_mean_longstretch1,acc_mag_SB_TransitionMatrix_3ac_sumdiagcov,acc_mag_PD_PeriodicityWang_th0_01,...,acc_phi_SB_BinaryStats_diff_longstretch0,acc_phi_SB_MotifThree_quantile_hh,acc_phi_SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,acc_phi_SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,acc_phi_SP_Summaries_welch_rect_centroid,acc_phi_FC_LocalSimple_mean3_stderr,acc_phi_DN_Mean,acc_phi_DN_Spread_Std,utc_timestamp,label
0,,,0.0,0,,,,,,0,...,,,,,,,,,2020-05-25 23:20:00+00:00,0
1,,,0.0,0,,,,,,0,...,,,,,,,,,2020-05-25 23:20:10+00:00,0
2,,,0.0,0,,,,,,0,...,,,,,,,,,2020-05-25 23:20:20+00:00,0
3,,,0.0,0,,,,,,0,...,,,,,,,,,2020-05-25 23:20:30+00:00,0
4,,,0.0,0,,,,,,0,...,,,,,,,,,2020-05-25 23:20:40+00:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15595,,,0.0,0,,,,,,0,...,,,,,,,,,2020-08-01 16:49:10+00:00,0
15596,,,0.0,0,,,,,,0,...,,,,,,,,,2020-08-01 16:49:20+00:00,0
15597,,,0.0,0,,,,,,0,...,,,,,,,,,2020-08-01 16:49:30+00:00,0
15598,,,0.0,0,,,,,,0,...,,,,,,,,,2020-08-01 16:49:40+00:00,0


In [None]:
# df.to_pickle("Patient_1110.pkl")
# Is 9MB compared to 380mb for the raw data.  So that's a 97% reduction in size.

In [None]:
df['label'].mean()


0.0

In [None]:
# Add Quality Features
df["acc_quality"] = RandFuncs.acc_quality_adj(data, f_s)
#df["bvp_quality"] = RandFuncs.bvp_quality_adj(data, f_s)
#df["eda_quality"] = RandFuncs.eda_quality_adj(data, f_s)

# There is a possibility that these need to be performed on a larger window? 
# This is just giving out NaNs.... There is a bug somewhere.  

short chunk


ValueError: cannot reshape array of size 38514 into shape (15)

In [None]:
df

Unnamed: 0,acc_mag_DN_HistogramMode_5,acc_mag_DN_HistogramMode_10,acc_mag_CO_f1ecac,acc_mag_CO_FirstMin_ac,acc_mag_CO_HistogramAMI_even_2_5,acc_mag_CO_trev_1_num,acc_mag_MD_hrv_classic_pnn40,acc_mag_SB_BinaryStats_mean_longstretch1,acc_mag_SB_TransitionMatrix_3ac_sumdiagcov,acc_mag_PD_PeriodicityWang_th0_01,...,acc_phi_SP_Summaries_welch_rect_area_5_1,acc_phi_SB_BinaryStats_diff_longstretch0,acc_phi_SB_MotifThree_quantile_hh,acc_phi_SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,acc_phi_SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,acc_phi_SP_Summaries_welch_rect_centroid,acc_phi_FC_LocalSimple_mean3_stderr,acc_phi_DN_Mean,acc_phi_DN_Spread_Std,utc_timestamp
0,0.572093,0.016976,3.644704,9,0.060949,-0.047277,0.457389,18.0,0.018476,43,...,0.911547,6.0,1.013508,0.229167,0.125000,0.297592,0.696617,1.471254,0.005907,2020-05-25 23:29:42+00:00
1,0.993162,0.208790,3.552545,9,0.019007,-0.064214,0.478499,18.0,0.011553,23,...,0.903904,9.0,0.995992,0.208333,0.125000,0.309864,0.701677,1.471049,0.005915,2020-05-25 23:29:52+00:00
2,0.965658,0.183288,3.270531,7,0.089803,-0.205094,0.417514,16.0,0.037096,22,...,0.901257,13.0,1.115961,0.291667,0.145833,0.294524,0.679431,1.472015,0.006263,2020-05-25 23:30:02+00:00
3,-0.274656,0.453445,2.866077,6,0.102263,-0.192702,0.420641,22.0,0.014118,18,...,0.921022,10.0,1.032467,0.187500,0.125000,0.208621,0.612867,1.470081,0.006447,2020-05-25 23:30:12+00:00
4,0.412421,-0.264990,5.210617,16,0.177042,-0.016347,0.290852,34.0,0.006655,29,...,0.992576,18.0,1.482908,0.125000,0.333333,0.009204,0.115930,1.595345,0.327130,2020-05-25 23:30:22+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15395,-0.700592,-0.340732,6.650192,18,0.500885,-0.000820,0.763096,39.0,0.006221,43,...,0.998260,26.0,1.256045,0.125000,0.875000,0.006136,0.066724,2.147000,0.357710,2020-08-01 16:40:11.437499904+00:00
15396,-0.325017,0.248460,5.300336,13,0.272229,0.021856,0.802971,36.0,0.000975,71,...,0.998531,33.0,1.182322,0.125000,0.750000,0.003068,0.080454,2.186284,0.219101,2020-08-01 16:40:21.437499904+00:00
15397,-0.107770,-0.479224,8.292483,25,0.537915,0.000933,0.794371,42.0,0.003158,63,...,0.996084,29.0,1.271838,0.187500,0.145833,0.006136,0.132335,2.309776,0.150432,2020-08-01 16:40:31.437499904+00:00
15398,0.475277,-0.136008,5.950824,23,0.297600,-0.037166,0.698984,52.0,0.004030,66,...,0.996511,29.0,1.245089,0.125000,0.750000,0.003068,0.104860,2.125000,0.361957,2020-08-01 16:40:41.437499904+00:00


In [None]:
print(np.sum(np.isnan(df["acc_quality"])))
print(df.shape)

KeyError: 'acc_quality'

In [None]:
# The current files are all 0s so just to give it some 1s
df['label'] = np.repeat(np.random.randint(0,1),df.shape[0])

In [2]:
# Import the whole dataset from pkl file
df = pd.read_pickle(os.getcwd()+ "/Ignore/Output/Patient_1110.pkl")

In [3]:
df

Unnamed: 0,acc_mag_DN_HistogramMode_5,acc_mag_DN_HistogramMode_10,acc_mag_CO_f1ecac,acc_mag_CO_FirstMin_ac,acc_mag_CO_HistogramAMI_even_2_5,acc_mag_CO_trev_1_num,acc_mag_MD_hrv_classic_pnn40,acc_mag_SB_BinaryStats_mean_longstretch1,acc_mag_SB_TransitionMatrix_3ac_sumdiagcov,acc_mag_PD_PeriodicityWang_th0_01,...,hr_SB_BinaryStats_diff_longstretch0,hr_SB_MotifThree_quantile_hh,hr_SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,hr_SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,hr_SP_Summaries_welch_rect_centroid,hr_FC_LocalSimple_mean3_stderr,hr_DN_Mean,hr_DN_Spread_Std,utc_timestamp,label
0,,,0.0,0,,,,,,0,...,,,,,,,,,2020-02-26 23:10:00+00:00,0
1,,,0.0,0,,,,,,0,...,,,,,,,,,2020-02-26 23:10:10+00:00,0
2,,,0.0,0,,,,,,0,...,,,,,,,,,2020-02-26 23:10:20+00:00,0
3,,,0.0,0,,,,,,0,...,,,,,,,,,2020-02-26 23:10:30+00:00,0
4,,,0.0,0,,,,,,0,...,,,,,,,,,2020-02-26 23:10:40+00:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1237855,,,0.0,0,,,,,,0,...,,,,,,,,,2020-08-12 17:39:10+00:00,0
1237856,,,0.0,0,,,,,,0,...,,,,,,,,,2020-08-12 17:39:20+00:00,0
1237857,,,0.0,0,,,,,,0,...,,,,,,,,,2020-08-12 17:39:30+00:00,0
1237858,,,0.0,0,,,,,,0,...,,,,,,,,,2020-08-12 17:39:40+00:00,0


In [4]:
# Auto-sklearn 

# Split the dataset into training and testing sets
# Just do it across a time or do cross-validation when doing this seriously.
# training_data, testing_data, training_target, testing_target = train_test_split(df.drop('label', axis=1),
#                                                                                 df['label'], 
#                                                                                 train_size=0.75, 
#                                                                                 test_size=0.25,
#                                                                                 random_state=42)

# Specify Columns to Use as Input (in addition to the time-stamp)
cols = ["acc_mag","acc_theta","acc_phi",]

feature_names = np.loadtxt("Catch22_Featurenames",dtype=str)
col_names = [i + "_" + j for i in cols for j in feature_names]
col_names = col_names + ['time_of_day'] # Add the quality measurements later.

# Get time-of-day from timestamp
df['time_of_day'] = df['utc_timestamp'][0].hour + df['utc_timestamp'][0].minute/60 

# Perform train-test split. Just Split it across time 3:1
# And need to drop time-stamp. As datetime64 is unsupported 
training_data = df[col_names].iloc[0:int(df.shape[0]*0.75),:]
testing_data = df[col_names].iloc[int(df.shape[0]*0.75):,:]
training_target = df['label'].iloc[0:int(df.shape[0]*0.75)]
testing_target = df['label'].iloc[int(df.shape[0]*0.75):]

# # Give it some Nans just to see how autosklearn deals with it
# training_data.iloc[0:10,0] = np.nan 
# testing_data.iloc[0:10,0] = np.nan

# training_data["Nans"] = np.repeat(np.nan,training_data.shape[0])
# testing_data["Nans"] = np.repeat(np.nan,testing_data.shape[0])

# Need to adjust the metric
metric = autosklearn.metrics.roc_auc

# Initialize the auto-sklearn classifier
automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task = 180,
                                                           per_run_time_limit = 30,
                                                           n_jobs = -1,
                                                           memory_limit = 102400000, # This needs to be set or else you may error out. 
                                                           metric = metric)  # Metric needs to be set to not accuracy due to imbalanced dataset

# Fit the classifier on the training data
automl.fit(training_data, training_target)

# Score the classifier on the testing data
print("Accuracy: ", automl.score(testing_data, testing_target))

# Save the model
dump(automl, 'Fast_automl_acc.joblib')

Accuracy:  0.5


['Fast_automl_acc.joblib']

In [5]:
print("Accuracy: ", automl.score(testing_data, testing_target))
print(metrics.roc_curve(testing_target,automl.predict(testing_data)))
print(metrics.roc_auc_score(testing_target,automl.predict(testing_data)))

Accuracy:  0.5
(array([0., 1.]), array([0., 1.]), array([1, 0]))
0.5


In [13]:
# Print out all the ways to inspet the results
print('Basic Statistics')
print(automl.sprint_statistics())
#print('Performance over Time')
#print(automl.performance_over_time_)
print('Evaluated Models')
print(automl.show_models())
print('Leaderboard')
print(automl.leaderboard())
print('CV Results')
print(automl.cv_results_)

Basic Statistics
auto-sklearn results:
  Dataset name: 245293ca-af96-11ed-942e-123e0c4dfe5e
  Metric: roc_auc
  Number of target algorithm runs: 40
  Number of successful target algorithm runs: 0
  Number of crashed target algorithm runs: 1
  Number of target algorithms that exceeded the time limit: 39
  Number of target algorithms that exceeded the memory limit: 0

Evaluated Models


RuntimeError: No model found. Try increasing 'time_left_for_this_task'.

In [76]:
pd.read_pickle("Patient_1110.pkl")

Unnamed: 0,acc_mag_DN_HistogramMode_5,acc_mag_DN_HistogramMode_10,acc_mag_CO_f1ecac,acc_mag_CO_FirstMin_ac,acc_mag_CO_HistogramAMI_even_2_5,acc_mag_CO_trev_1_num,acc_mag_MD_hrv_classic_pnn40,acc_mag_SB_BinaryStats_mean_longstretch1,acc_mag_SB_TransitionMatrix_3ac_sumdiagcov,acc_mag_PD_PeriodicityWang_th0_01,...,acc_phi_SB_BinaryStats_diff_longstretch0,acc_phi_SB_MotifThree_quantile_hh,acc_phi_SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1,acc_phi_SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1,acc_phi_SP_Summaries_welch_rect_centroid,acc_phi_FC_LocalSimple_mean3_stderr,acc_phi_DN_Mean,acc_phi_DN_Spread_Std,utc_timestamp,label
0,,,0.0,0,,,,,,0,...,,,,,,,,,2020-05-25 23:20:00+00:00,0
1,,,0.0,0,,,,,,0,...,,,,,,,,,2020-05-25 23:20:10+00:00,0
2,,,0.0,0,,,,,,0,...,,,,,,,,,2020-05-25 23:20:20+00:00,0
3,,,0.0,0,,,,,,0,...,,,,,,,,,2020-05-25 23:20:30+00:00,0
4,,,0.0,0,,,,,,0,...,,,,,,,,,2020-05-25 23:20:40+00:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15595,,,0.0,0,,,,,,0,...,,,,,,,,,2020-08-01 16:49:10+00:00,0
15596,,,0.0,0,,,,,,0,...,,,,,,,,,2020-08-01 16:49:20+00:00,0
15597,,,0.0,0,,,,,,0,...,,,,,,,,,2020-08-01 16:49:30+00:00,0
15598,,,0.0,0,,,,,,0,...,,,,,,,,,2020-08-01 16:49:40+00:00,0


In [2]:
clf = load('Fast_automl_acc.joblib') 

In [90]:
metrics.roc_curve(testing_target,automl.predict(testing_data))



(array([0., 1.]), array([nan, nan]), array([1, 0]))

# Checking a model trained on the cluster

In [3]:
# Load model
with open('Ignore/Output/Faster_automl_acc.pkl','rb') as file:
    clf = pickle.load(file) 

# Load data
data = pd.read_pickle("Ignore/Output/Patient_1110.pkl")

# Specify Colummns that were used as Input (in addition to the time-stamp)
cols = ["acc_mag","acc_theta","acc_phi",]

feature_names = np.loadtxt("Catch22_Featurenames",dtype=str)
col_names = [i + "_" + j for i in cols for j in feature_names]
col_names = col_names + ['time_of_day'] # Add the quality measurements later.

# Get time-of-day from timestamp
df['time_of_day'] = df['utc_timestamp'][0].hour + df['utc_timestamp'][0].minute/60 

# Perform train-test split. Just Split it across time 3:1
# And need to drop time-stamp. As datetime64 is unsupported 
training_data = df[col_names].iloc[0:int(df.shape[0]*0.75),:]
testing_data = df[col_names].iloc[int(df.shape[0]*0.75):,:]
training_target = df['label'].iloc[0:int(df.shape[0]*0.75)]
testing_target = df['label'].iloc[int(df.shape[0]*0.75):]

TypeError: _unpickle_timestamp() takes exactly 3 positional arguments (4 given)

In [4]:
clf

AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      memory_limit=102400000, n_jobs=-1, per_run_time_limit=30,
                      time_left_for_this_task=180)

In [7]:
clf

AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      memory_limit=102400000, n_jobs=-1, per_run_time_limit=30,
                      time_left_for_this_task=180)

In [8]:
import pickle

In [15]:
with open('Ignore/Output/Faster_automl_acc.pkl','rb') as file:
    clf = pickle.load(file) 

TypeError: _unpickle_timestamp() takes exactly 3 positional arguments (4 given)

In [10]:
# pickle.dump(clf, open('model.pkl', 'wb'))

In [14]:
clf

AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      memory_limit=102400000, n_jobs=-1, per_run_time_limit=30,
                      time_left_for_this_task=180)

In [6]:
df['time_of_day']

0          23.166667
1          23.166667
2          23.166667
3          23.166667
4          23.166667
             ...    
1237855    23.166667
1237856    23.166667
1237857    23.166667
1237858    23.166667
1237859    23.166667
Name: time_of_day, Length: 1237860, dtype: float64