In [1]:
import os, gc
import numpy as np
import pandas as pd
import pickle
from collections import deque
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import average_precision_score

In [2]:
path = "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/"
train_defog = os.listdir(path + "train/defog")
test_defog = os.listdir(path + "test/defog")
train_tdcsfog = os.listdir(path + "train/tdcsfog")
test_tdcsfog = os.listdir(path + "test/tdcsfog")

In [3]:
set(train_defog) & set(train_tdcsfog)

set()

In [4]:
if len(set(test_defog) & set(test_tdcsfog)) > 0: # is there?
    test_tdcsfog = list(set(test_tdcsfog) - (set(test_defog) & set(test_tdcsfog)))

In [5]:
task = pd.read_csv(path + "tasks.csv")
task_map = {t:i+1 for i,t in enumerate(sorted(list(set(task.Task.values))))}
task["TaskId"] = task.Task.apply(lambda x:task_map[x])
events = pd.read_csv(path + "events.csv")
events = events[events.Kinetic==1]
subjects = pd.read_csv(path + "subjects.csv")
subjects["SexId"] = (subjects.Sex=="M").values.astype(np.uint8)
subjects = subjects.fillna(0)
subjects = subjects.drop(["Sex"], axis=1)
len(set(task_map.values()))

31

In [6]:
metadata = [pd.read_csv(path + "defog_metadata.csv"),
            pd.read_csv(path + "tdcsfog_metadata.csv")]
metadata[0]["MedicationId"] = metadata[0].Medication.map( {'on': 1, 'off': 0} ).astype(int)
metadata[0] = metadata[0].drop(["Medication"], axis=1)
metadata[1]["MedicationId"] = (metadata[1].Medication.map( {'on': 1, 'off': 0} ).astype(int)
metadata[1] = metadata[1].drop(["Medication"], axis=1)

In [7]:
ext_columns = list(subjects.columns)[1:]

In [8]:
def read_csv_with_task(csv):
    global task, events, defog_metadata, tdcsfog_metadata
    fn = csv.split("/")[-1]
    idf = fn.split(".")[0]
    tdf = task[task.Id==idf]
    edf = events[events.Id==idf]
    df = pd.read_csv(csv)
    taskids = np.zeros(len(df), dtype=np.uint8)
    for b,e,t in zip(tdf.Begin,tdf.End,tdf.TaskId):
        taskids[int(b):int(e)] = t
    for b,e,t in zip(edf.Init,edf.Completion,edf.Type):
        if t=="Turn":
            taskids[int(b):int(e)] = taskids[int(b):int(e)] + 32
        else:
            taskids[int(b):int(e)] = taskids[int(b):int(e)] + 64
    df["TaskId"] = taskids
    met = metadata[0 if "defog" in csv else 1]
    subId = met[met.Id==idf].values.flatten()[1]
    sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
    for i,c in enumerate(ext_columns):
        df[c] = sub[i]
    return df

In [9]:
target_cols = ["StartHesitation","Turn","Walking"]
train_cols = ["Time","AccV","AccML","AccAP","TaskId"]

In [10]:
def feature_engineering(val, clfs, target=None):
    # Cluster and Dimensional mapping analysis for each data
    if clfs[0] is None:
        clfs[0] = MiniBatchKMeans(n_clusters=8, random_state=0, init="random").fit(val[:,1:4])
    km = clfs[0].predict(val[:,1:4])
    km_oh = np.zeros((val.shape[0],8), dtype=np.uint8) # discrete value change to One-hot
    for i in range(8):
        idx = np.where(km==0)[0]
        km_oh[idx,i] = 1
    if clfs[1] is None:
        clfs[1] = TruncatedSVD(n_components=2, n_iter=10, random_state=0).fit(val[:,1:4])
    svd = clfs[1].transform(val[:,1:4])
    # Per-user statistics
    print("Per-user statistics")
    cp = 0
    sp = 0
    usrm = np.zeros((val.shape[0], 5*val.shape[1]-10), dtype=np.float16)
    for i in range(val.shape[0]):
        if cp > val[i,0]:
            for t in range(val.shape[1]-2):
                usrm[sp:i,5*t] = np.mean(val[sp:i,t+1])
                usrm[sp:i,5*t+1] = np.std(val[sp:i,t+1])
                usrm[sp:i,5*t+2] = np.max(val[sp:i,t+1])
                usrm[sp:i,5*t+3] = np.min(val[sp:i,t+1])
                usrm[sp:i,5*t+4] = (i-sp)/val.shape[0]
            sp = i
        cp = val[i,0]
    for t in range(val.shape[1]-2):
        usrm[sp:,5*t] = np.mean(val[sp:,t+1])
        usrm[sp:,5*t+1] = np.std(val[sp:,t+1])
        usrm[sp:,5*t+2] = np.max(val[sp:,t+1])
        usrm[sp:,5*t+3] = np.min(val[sp:,t+1])
        usrm[sp:,5*t+4] = (val.shape[0]-sp)/val.shape[0]
    iskinetic = np.stack([(val[:,4]>=32).astype(np.uint8), (val[:,4]>=64).astype(np.uint8)]).transpose((1,0))
    # Cluster and Dimensional mapping analysis for each user/task
    print("Cluster and Dimensional mapping analysis for each user/task")
    if clfs[2] is None:
        clfs[2] = MiniBatchKMeans(n_clusters=8, random_state=0, init="random").fit(usrm)
    kmu = clfs[2].predict(usrm)
    kmu_oh = np.zeros((val.shape[0],8), dtype=np.uint8) # discrete value change to One-hot
    for i in range(8):
        idx = np.where(kmu==0)[0]
        kmu_oh[idx,i] = 1
    del kmu
    gc.collect()
    if clfs[3] is None:
        clfs[3] = TruncatedSVD(n_components=2, n_iter=10, random_state=0).fit(usrm)
    svdu = clfs[3].transform(usrm)
    gc.collect()
    # Merge waypoints
    marged = np.hstack([val[:,1:4],km_oh,svd])
    # Moving average and variance within the same user
    print("Moving average and variance within the same user/task")
    wnd = np.zeros((val.shape[0],52), dtype=np.float16)
    cp = val[0,0]
    window = deque([marged[0,:15]] * 5)
    for i in range(val.shape[0]):
        if cp > val[i,0]:
            window = deque([marged[i,:15]] * 5)
        else:
            window.popleft()
            window.append(marged[i,:15])
        cp = val[i,0]
        wnd[i] = np.hstack([np.mean(window, axis=0),np.std(window, axis=0),np.min(window, axis=0),np.max(window, axis=0)])
    # Analyze the entire merge data
    print("Analyze the entire merge data")
    usrv = np.hstack([svd,svdu])
    if clfs[4] is None:
        clfs[4] = [LinearRegression().fit(usrv, target[:,i]) for i in range(3)]
    reg = np.stack([clfs[4][i].predict(usrv) for i in range(3)]).transpose((1,0))
    del usrv
    gc.collect()
    if clfs[5] is None:
        clfs[5] = TruncatedSVD(n_components=2, n_iter=10, random_state=0).fit(marged)
    svdm = clfs[5].transform(marged)
    # Marge all
    return np.hstack([marged,wnd,reg,svdm,usrm,kmu_oh,svdu,iskinetic])

In [11]:
#from sklearn.tree import DecisionTreeRegressor
def get_regressor(totest=False):
    return Ridge(max_iter=1000,random_state=0) #DecisionTreeRegressor(max_leaf_nodes=32, random_state=0)

def training(val, target):
    return get_regressor().fit(val, target)

def predict(clfs, val):
    return clfs.predict(val)

In [12]:
train_dfs = [read_csv_with_task(path + "train/defog/"+i)[train_cols+target_cols] for i in train_defog]
train_val = [i[train_cols].values for i in train_dfs]
train_tgt = [i[target_cols].values for i in train_dfs]
del train_dfs
gc.collect()



0

In [13]:
train_val = np.vstack(train_val)
train_tgt = np.vstack(train_tgt)
gc.collect()

21

In [14]:
defog_trans = [None, None, None, None, None ,None]
train_val = feature_engineering(train_val, defog_trans, target=train_tgt)

Per-user statistics
Cluster and Dimensional mapping analysis for each user/task


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Moving average and variance within the same user/task
Analyze the entire merge data


In [15]:
train_val = train_val.astype(np.float16) # reduce memory
gc.collect()
defog_clf = [training(train_val,train_tgt[:,i]) for i in range(len(target_cols))]

In [16]:
del train_val, train_tgt, train_defog
gc.collect()

21

In [17]:
train_dfs = [read_csv_with_task(path + "train/tdcsfog/"+i)[train_cols+target_cols] for i in train_tdcsfog]
train_val = [i[train_cols].values for i in train_dfs]
train_tgt = [i[target_cols].values for i in train_dfs]
del train_dfs
gc.collect()



0

In [18]:
train_val = np.vstack(train_val)
train_tgt = np.vstack(train_tgt)
gc.collect()

21

In [19]:
tdcsfog_trans = [None, None, None, None, None, None]
train_val = feature_engineering(train_val, tdcsfog_trans, target=train_tgt)

Per-user statistics
Cluster and Dimensional mapping analysis for each user/task


  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


Moving average and variance within the same user/task
Analyze the entire merge data


In [20]:
train_val = train_val.astype(np.float16) # reduce memory
gc.collect()
tdcsfog_clf = [training(train_val,train_tgt[:,i]) for i in range(len(target_cols))]

In [21]:
del train_val, train_tgt, train_tdcsfog
gc.collect()

21

In [22]:
test_dfs = [read_csv_with_task("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog/"+i)[train_cols] for i in test_defog]
test_val = [i.values for i in test_dfs]
gc.collect()



21

In [23]:
test_val = np.vstack(test_val)
test_val = feature_engineering(test_val, defog_trans)

Per-user statistics
Cluster and Dimensional mapping analysis for each user/task
Moving average and variance within the same user/task
Analyze the entire merge data


In [24]:
test_val = test_val.astype(np.float16) # reduce memory
gc.collect()
test_defog_preds = [np.clip(predict(c, test_val), 0, 1) for i,c in enumerate(defog_clf)]

In [25]:
defog_ids = []
for f,d in zip(test_defog,test_dfs):
    fid = f.split(".")[0]
    for t in d.Time.values:
        sid = f"{fid}_{t}"
        defog_ids.append(sid)

In [26]:
del test_defog, test_dfs, test_val
gc.collect()

42

In [27]:
test_dfs = [read_csv_with_task(path + "test/tdcsfog/"+i)[train_cols] for i in test_tdcsfog]
test_val = [i.values for i in test_dfs]
gc.collect()



21

In [28]:
test_val = np.vstack(test_val)
test_val = feature_engineering(test_val, tdcsfog_trans)

Per-user statistics
Cluster and Dimensional mapping analysis for each user/task
Moving average and variance within the same user/task
Analyze the entire merge data


In [29]:
test_val = test_val.astype(np.float16) # reduce memory
gc.collect()
test_tdcsfog_preds = [np.clip(predict(c, test_val), 0, 1) for i,c in enumerate(tdcsfog_clf)]

In [30]:
tdcsfog_ids = []
for f,d in zip(test_tdcsfog,test_dfs):
    fid = f.split(".")[0]
    for t in d.Time.values:
        sid = f"{fid}_{t}"
        tdcsfog_ids.append(sid)

In [31]:
del test_tdcsfog, test_dfs, test_val
gc.collect()

42

In [32]:
all_ids = defog_ids + tdcsfog_ids
all_starts = list(test_defog_preds[0]) + list(test_tdcsfog_preds[0])
all_turns = list(test_defog_preds[1]) + list(test_tdcsfog_preds[1])
all_walkings = list(test_defog_preds[2]) + list(test_tdcsfog_preds[2])

In [33]:
df = pd.DataFrame({"Id":all_ids,"StartHesitation":all_starts,"Turn":all_turns,"Walking":all_walkings})
df

Unnamed: 0,Id,StartHesitation,Turn,Walking
0,02ab235146_0,0.0,0.0,1.0
1,02ab235146_1,0.0,0.0,1.0
2,02ab235146_2,0.0,0.0,1.0
3,02ab235146_3,0.0,0.0,1.0
4,02ab235146_4,0.0,0.0,1.0
...,...,...,...,...
286365,003f117e14_4677,1.0,1.0,1.0
286366,003f117e14_4678,1.0,1.0,1.0
286367,003f117e14_4679,1.0,1.0,1.0
286368,003f117e14_4680,1.0,1.0,1.0


In [34]:
df.to_csv("submission.csv", index=False)