In [1]:
!mkdir gcs
!gcsfuse --implicit-dirs --rename-dir-limit=100 --max-conns-per-host=100 "hms_applied_cv" "/home/jupyter/data"

{"time":"15/04/2024 04:45:58.534355","severity":"INFO","message":"Start gcsfuse/2.0.0 (Go version go1.22.1) for app \"\" using mount point: /home/jupyter/data\n"}
{"time":"15/04/2024 04:45:58.534592","severity":"INFO","message":"GCSFuse mount command flags: {\"AppName\":\"\",\"Foreground\":false,\"ConfigFile\":\"\",\"MountOptions\":{},\"DirMode\":493,\"FileMode\":420,\"Uid\":-1,\"Gid\":-1,\"ImplicitDirs\":true,\"OnlyDir\":\"\",\"RenameDirLimit\":100,\"CustomEndpoint\":null,\"BillingProject\":\"\",\"KeyFile\":\"\",\"TokenUrl\":\"\",\"ReuseTokenFromUrl\":true,\"EgressBandwidthLimitBytesPerSecond\":-1,\"OpRateLimitHz\":-1,\"SequentialReadSizeMb\":200,\"MaxRetrySleep\":30000000000,\"StatCacheCapacity\":20460,\"StatCacheTTL\":60000000000,\"TypeCacheTTL\":60000000000,\"HttpClientTimeout\":0,\"MaxRetryDuration\":-1000000000,\"RetryMultiplier\":2,\"LocalFileCache\":false,\"TempDir\":\"\",\"ClientProtocol\":\"http1\",\"MaxConnsPerHost\":100,\"MaxIdleConnsPerHost\":100,\"EnableNonexistentTypeCac

In [2]:
# ! pip install sktime
! pip install sktime[all_extras]
! pip install sktime[mlflow]



In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import joblib
import sktime
from sktime.classification.deep_learning import CNNClassifier, LSTMFCNClassifier
from sktime.classification.kernel_based import Arsenal
from sklearn.preprocessing import OneHotEncoder
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
from sktime.forecasting.arima import ARIMA
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from sktime.utils import mlflow_sktime
from sktime.classification.kernel_based import RocketClassifier
import pickle

In [4]:
mlflow_sktime.save_model

<function sktime.utils.mlflow_sktime.save_model(sktime_model, path, conda_env=None, code_paths=None, mlflow_model=None, signature=None, input_example=None, pip_requirements=None, extra_pip_requirements=None, serialization_format='pickle')>

In [4]:
features = pd.read_csv('data/hms-harmful-brain-activity-classification/cleaned_train.csv')

In [6]:
features.columns

Index(['Unnamed: 0', 'eeg_id', 'eeg_sub_id', 'eeg_label_offset_seconds',
       'spectrogram_id', 'spectrogram_sub_id',
       'spectrogram_label_offset_seconds', 'label_id', 'patient_id',
       'expert_consensus', 'seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote',
       'grda_vote', 'other_vote', 'concatenated_scores', 'is_center'],
      dtype='object')

In [7]:
len(features)

106800

In [35]:

train_size = 20000 # 1000
val_size = 6000 # 200

actual_train = 0
actual_val = 0

train_ind = features.iloc[:train_size]['eeg_id']
val_ind = features.iloc[train_size:val_size + train_size]['eeg_id']

x_train = []
y_train = []

x_val = []
y_val = []


for i in range(0,train_size):
    ind = features.iloc[i]['eeg_id']
    pq = pd.read_parquet(f"data/hms-harmful-brain-activity-classification/train_eegs/{ind}.parquet")
    middle = (len(pq)-2_000)//2
    pq = pq.iloc[middle:middle+2_000:2]
    pq = pq.reset_index()
    x_train.append(pq)
    y_train.append(features.iloc[i]['expert_consensus'])
    
    
for i in range(train_size, val_size + train_size):
    ind = features.iloc[i]['eeg_id']
    pq = pd.read_parquet(f"data/hms-harmful-brain-activity-classification/train_eegs/{ind}.parquet")
    middle = (len(pq)-2_000)//2
    pq = pq.iloc[middle:middle+2_000:2]
    pq = pq.reset_index()
    x_val.append(pq)
    y_val.append(features.iloc[i]['expert_consensus'])
    
x_train = pd.concat(x_train,keys=list(range(train_size)),axis=0).reset_index(level=1)
x_train['instances'] = x_train.index
x_train = x_train.rename(columns={"level_1": "timepoints"})
x_train = x_train.set_index(['instances', 'timepoints'])
x_train = x_train.fillna(0)


x_val = pd.concat(x_val,keys=list(range(val_size)),axis=0).reset_index(level=1)
x_val['instances'] = x_val.index
x_val = x_val.rename(columns={"level_1": "timepoints"})
x_val = x_val.set_index(['instances', 'timepoints'])
x_val = x_val.fillna(0)


y_train = pd.get_dummies(y_train, columns = ['GPD','GRDA','LPD','LRDA','Other','Seizure']).astype("int32")
y_train = y_train.reset_index(drop=True)

y_val = pd.get_dummies(y_val, columns = ['GPD','GRDA','LPD','LRDA','Other','Seizure']).astype("int32")
y_val = y_val.reset_index(drop=True)

In [37]:
x_train.to_csv("/home/jupyter/x_train.csv")
x_val.to_csv("/home/jupyter/x_val.csv")

y_train.to_csv("/home/jupyter/y_train.csv")
y_val.to_csv("/home/jupyter/y_val.csv")

In [7]:
x_train = pd.read_csv("/home/jupyter/x_train.csv")
x_val = pd.read_csv("/home/jupyter/x_val.csv")



In [25]:
x_train = x_train.set_index(['instances', 'timepoints'])
x_val = x_val.set_index(['instances', 'timepoints'])
y

In [45]:
y_train = pd.read_csv("/home/jupyter/y_train.csv", index_col=0)
y_val = pd.read_csv("/home/jupyter/y_val.csv", index_col=0)

In [28]:
x_train[:1000*1000]

Unnamed: 0_level_0,Unnamed: 1_level_0,index,Fp1,F3,C3,P3,F7,T3,T5,O1,Fz,...,Pz,Fp2,F4,C4,P4,F8,T4,T6,O2,EKG
instances,timepoints,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,5600,50.14,125.69,273.30,210.12,151.85,125.79,239.60,250.11,54.24,...,67.80,108.64,190.89,92.96,302.47,35.88,332.87,155.05,135.88,32855.60
0,1,5602,14.41,70.96,223.35,171.77,102.79,78.21,202.62,209.94,6.84,...,22.05,73.74,145.24,59.31,259.79,-9.09,290.35,146.31,89.02,751.12
0,2,5604,22.87,80.81,254.64,203.74,125.33,99.74,222.63,234.26,19.97,...,66.29,93.04,169.55,106.53,356.57,21.58,350.74,192.74,119.76,308.50
0,3,5606,19.78,59.58,230.42,187.28,108.51,86.94,209.18,221.25,6.53,...,50.27,99.64,156.97,104.67,302.20,6.66,309.80,198.64,102.93,-3342.34
0,4,5608,-14.46,26.90,217.55,169.16,82.12,67.85,194.67,200.64,-24.83,...,41.47,76.15,136.82,96.47,355.69,-7.12,314.93,194.51,83.54,3232.94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999,995,8790,-42.27,-53.97,-21.00,-29.77,-45.46,-11.17,-31.90,-19.67,-27.12,...,-18.08,-7.71,-9.04,-43.33,8.24,-36.42,-22.60,-32.96,-8.51,-0.27
999,996,8792,-49.45,-65.40,-34.82,-42.00,-60.08,-22.06,-43.33,-31.90,-35.62,...,-28.44,-17.81,-21.00,-51.84,-8.24,-46.79,-39.34,-53.43,-24.46,1.33
999,997,8794,-29.24,-48.38,-23.13,-30.84,-41.20,-7.44,-30.84,-24.99,-17.01,...,-16.48,0.27,-9.30,-38.28,36.42,-32.43,5.05,-3.72,21.00,0.53
999,998,8796,-45.72,-61.14,-42.00,-54.23,-58.48,-24.46,-56.36,-48.38,-35.09,...,-36.42,-19.14,-14.89,-52.90,-16.75,-45.72,-45.72,-59.28,-28.44,-0.80


In [None]:
 # Fit and save model
model = RocketClassifier(num_kernels=5000)
predictions = model.fit(x_train[:5000*1000], y_train[:5000])
mlflow_sktime.save_model(model, "/home/jupyter/rocket")

In [None]:
with open('rocket.pkl','wb') as f:
    pickle.dump(model,f)


In [48]:
# Predict and print accuracy
predictions = model.predict(x_val[:1000*1000])


In [54]:
predictions.idxmax(axis=1)

0      Seizure
1      Seizure
2      Seizure
3      Seizure
4      Seizure
        ...   
995      Other
996        GPD
997        GPD
998        GPD
999        GPD
Length: 1000, dtype: object

In [56]:
print(accuracy_score(y_val.idxmax(axis=1)[:1000],predictions.idxmax(axis=1) ))

0.243


In [75]:
pd.get_dummies(y_val)

Unnamed: 0,LPD
0,True
1,True


In [17]:
eeg_id = features.iloc[1]['eeg_id']
label = features.iloc[1]['expert_consensus']

pq = pd.read_parquet(f"gcs/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet")
middle = (len(pq)-10_000)//2
pq = pq.iloc[middle:middle+10_000]
pq1 = pq.reset_index()

In [18]:
eeg_id = features.iloc[2]['eeg_id']
label2 = features.iloc[2]['expert_consensus']

pq = pd.read_parquet(f"gcs/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet")
middle = (len(pq)-10_000)//2
pq = pq.iloc[middle:middle+10_000]
pq2 = pq.reset_index()

In [55]:
mi_pd = pd.concat([pq1,pq2],keys= [1,2],axis=0).reset_index(level=1)

In [56]:
mi_pd.columns

Index(['level_1', 'index', 'Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1',
       'Fz', 'Cz', 'Pz', 'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2',
       'EKG'],
      dtype='object')

In [57]:
mi_pd

Unnamed: 0,level_1,index,Fp1,F3,C3,P3,F7,T3,T5,O1,...,Pz,Fp2,F4,C4,P4,F8,T4,T6,O2,EKG
1,0,1600,26.139999,27.540001,114.160004,50.130001,65.769997,149.500000,38.090000,37.820000,...,75.519997,76.720001,67.870003,21.830000,204.610001,-59.549999,241.250000,87.320000,150.979996,-2648.439941
1,1,1601,-0.860000,3.940000,94.050003,28.770000,42.549999,119.040001,12.320000,13.630000,...,53.720001,46.950001,44.099998,-3.100000,212.619995,-79.680000,225.089996,68.389999,126.430000,2274.139893
1,2,1602,6.090000,-5.510000,73.160004,26.350000,41.560001,109.250000,9.220000,12.420000,...,41.970001,48.900002,33.180000,-6.840000,162.610001,-88.129997,196.699997,67.419998,120.349998,153.940002
1,3,1603,42.860001,34.759998,112.150002,71.410004,83.809998,151.070007,53.660000,56.369999,...,89.720001,82.389999,66.529999,38.090000,205.720001,-51.060001,241.250000,101.110001,165.110001,-3030.659912
1,4,1604,27.459999,31.379999,116.540001,69.680000,75.730003,142.119995,49.840000,52.810001,...,94.160004,68.110001,57.950001,35.310001,254.679993,-52.689999,256.089996,100.150002,164.940002,404.940002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,9995,11595,-102.949997,-56.820000,135.770004,-47.740002,-87.129997,-41.790001,-77.870003,-137.360001,...,-13.450000,-69.680000,-32.099998,-63.790001,22.719999,-118.230003,119.610001,-80.580002,-73.160004,2372.300049
2,9996,11596,-99.110001,-64.269997,119.199997,-49.349998,-82.370003,-47.299999,-78.370003,-129.899994,...,-24.170000,-68.099998,-36.570000,-72.809998,-28.790001,-128.770004,85.580002,-82.309998,-76.040001,130.539993
2,9997,11597,-79.410004,-35.320000,154.639999,-23.670000,-53.520000,-22.360001,-52.980000,-97.040001,...,9.870000,-48.150002,-7.770000,-44.349998,23.590000,-104.160004,114.379997,-70.300003,-49.049999,-2685.510010
2,9998,11598,-109.790001,-59.380001,136.360001,-60.650002,-80.849998,-59.080002,-93.790001,-131.070007,...,-21.840000,-81.470001,-29.740000,-83.830002,24.430000,-132.429993,79.720001,-112.820000,-84.279999,1606.150024


In [11]:
# 80/20 split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

In [14]:
type(x_train)

pandas.core.frame.DataFrame

In [43]:
sktime.datatypes.mtype(mi_pd)

'pd_DataFrame_Table'

In [46]:
sktime.datatypes.check_raise(pd.DataFrame([1,2]), mtype='pd_DataFrame_Table')

True

In [30]:
mi_pd = mi_pd.astype(object)

In [36]:
mi_pd['i'] = mi_pd.index

In [40]:
mi_pd

Unnamed: 0,level_1,index,Fp1,F3,C3,P3,F7,T3,T5,O1,...,Fp2,F4,C4,P4,F8,T4,T6,O2,EKG,index1
1,0,1600,26.139999,27.540001,114.160004,50.130001,65.769997,149.5,38.09,37.82,...,76.720001,67.870003,21.83,204.610001,-59.549999,241.25,87.32,150.979996,-2648.439941,1
1,1,1601,-0.86,3.94,94.050003,28.77,42.549999,119.040001,12.32,13.63,...,46.950001,44.099998,-3.1,212.619995,-79.68,225.089996,68.389999,126.43,2274.139893,1
1,2,1602,6.09,-5.51,73.160004,26.35,41.560001,109.25,9.22,12.42,...,48.900002,33.18,-6.84,162.610001,-88.129997,196.699997,67.419998,120.349998,153.940002,1
1,3,1603,42.860001,34.759998,112.150002,71.410004,83.809998,151.070007,53.66,56.369999,...,82.389999,66.529999,38.09,205.720001,-51.060001,241.25,101.110001,165.110001,-3030.659912,1
1,4,1604,27.459999,31.379999,116.540001,69.68,75.730003,142.119995,49.84,52.810001,...,68.110001,57.950001,35.310001,254.679993,-52.689999,256.089996,100.150002,164.940002,404.940002,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,9995,11595,-102.949997,-56.82,135.770004,-47.740002,-87.129997,-41.790001,-77.870003,-137.360001,...,-69.68,-32.099998,-63.790001,22.719999,-118.230003,119.610001,-80.580002,-73.160004,2372.300049,2
2,9996,11596,-99.110001,-64.269997,119.199997,-49.349998,-82.370003,-47.299999,-78.370003,-129.899994,...,-68.099998,-36.57,-72.809998,-28.790001,-128.770004,85.580002,-82.309998,-76.040001,130.539993,2
2,9997,11597,-79.410004,-35.32,154.639999,-23.67,-53.52,-22.360001,-52.98,-97.040001,...,-48.150002,-7.77,-44.349998,23.59,-104.160004,114.379997,-70.300003,-49.049999,-2685.51001,2
2,9998,11598,-109.790001,-59.380001,136.360001,-60.650002,-80.849998,-59.080002,-93.790001,-131.070007,...,-81.470001,-29.74,-83.830002,24.43,-132.429993,79.720001,-112.82,-84.279999,1606.150024,2


In [60]:
mi_pd['instances'] = mi_pd.index
mi_pd = mi_pd.rename(columns={"level_1": "timepoints"})
mi_pd = mi_pd.set_index(['instances', 'timepoints'])
# mi_pd = mi_pd.rename(columns={"index1": "instances"})

In [62]:
mi_pd

Unnamed: 0_level_0,Unnamed: 1_level_0,index,Fp1,F3,C3,P3,F7,T3,T5,O1,Fz,...,Pz,Fp2,F4,C4,P4,F8,T4,T6,O2,EKG
instances,timepoints,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,0,1600,26.139999,27.540001,114.160004,50.130001,65.769997,149.500000,38.090000,37.820000,28.480000,...,75.519997,76.720001,67.870003,21.830000,204.610001,-59.549999,241.250000,87.320000,150.979996,-2648.439941
1,1,1601,-0.860000,3.940000,94.050003,28.770000,42.549999,119.040001,12.320000,13.630000,-3.870000,...,53.720001,46.950001,44.099998,-3.100000,212.619995,-79.680000,225.089996,68.389999,126.430000,2274.139893
1,2,1602,6.090000,-5.510000,73.160004,26.350000,41.560001,109.250000,9.220000,12.420000,-14.040000,...,41.970001,48.900002,33.180000,-6.840000,162.610001,-88.129997,196.699997,67.419998,120.349998,153.940002
1,3,1603,42.860001,34.759998,112.150002,71.410004,83.809998,151.070007,53.660000,56.369999,21.530001,...,89.720001,82.389999,66.529999,38.090000,205.720001,-51.060001,241.250000,101.110001,165.110001,-3030.659912
1,4,1604,27.459999,31.379999,116.540001,69.680000,75.730003,142.119995,49.840000,52.810001,8.940000,...,94.160004,68.110001,57.950001,35.310001,254.679993,-52.689999,256.089996,100.150002,164.940002,404.940002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,9995,11595,-102.949997,-56.820000,135.770004,-47.740002,-87.129997,-41.790001,-77.870003,-137.360001,-27.820000,...,-13.450000,-69.680000,-32.099998,-63.790001,22.719999,-118.230003,119.610001,-80.580002,-73.160004,2372.300049
2,9996,11596,-99.110001,-64.269997,119.199997,-49.349998,-82.370003,-47.299999,-78.370003,-129.899994,-30.389999,...,-24.170000,-68.099998,-36.570000,-72.809998,-28.790001,-128.770004,85.580002,-82.309998,-76.040001,130.539993
2,9997,11597,-79.410004,-35.320000,154.639999,-23.670000,-53.520000,-22.360001,-52.980000,-97.040001,-4.500000,...,9.870000,-48.150002,-7.770000,-44.349998,23.590000,-104.160004,114.379997,-70.300003,-49.049999,-2685.510010
2,9998,11598,-109.790001,-59.380001,136.360001,-60.650002,-80.849998,-59.080002,-93.790001,-131.070007,-35.169998,...,-21.840000,-81.470001,-29.740000,-83.830002,24.430000,-132.429993,79.720001,-112.820000,-84.279999,1606.150024


In [63]:
 # Fit HC2
cnn = CNNClassifier()
predictions = cnn.fit(mi_pd, pd.DataFrame([1,2]))

# Predict and print accuracy
# predictions = hc2.predict(x_test)
# print(accuracy_score(y_test, predictions))

In [64]:
predictions = cnn.predict(mi_pd)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step


In [65]:
predictions

array([1, 1])

In [34]:
pq_test = pd.read_parquet("gcs/hms-harmful-brain-activity-classification/train_eegs/463265518.parquet")

In [7]:
pq_test.shape

(11600, 20)

In [8]:
pq_test = pd.read_parquet("gcs/hms-harmful-brain-activity-classification/train_eegs/3764624085.parquet")

In [9]:
pq_test.shape

(10800, 20)

In [10]:
len(pq_test)

10800

In [12]:
middle = (len(pq_test)-10_000)//2
pq_test_middle = pq_test.iloc[middle:middle+10_000]

In [14]:
pq_test_middle.shape

(10000, 20)

In [17]:
BASE_PATH = "gcs/hms-harmful-brain-activity-classification/"

df = pd.read_csv(f'{BASE_PATH}/train.csv')
df['eeg_path'] = f'{BASE_PATH}/train_eegs/'+df['eeg_id'].astype(str)+'.parquet'
df['class_name'] = df.expert_consensus.copy()
display(df.head(2))

# Test
test_df = pd.read_csv(f'{BASE_PATH}/test.csv')
test_df['eeg_path'] = f'{BASE_PATH}/test_eegs/'+test_df['eeg_id'].astype(str)+'.parquet'

display(test_df.head(2))


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,eeg_path,class_name
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0,gcs/hms-harmful-brain-activity-classification/...,Seizure
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0,gcs/hms-harmful-brain-activity-classification/...,Seizure


Unnamed: 0,spectrogram_id,eeg_id,patient_id,eeg_path
0,853520,3911565283,6885,gcs/hms-harmful-brain-activity-classification/...
