In [1]:
import os
import gc
import random
import time

import json
from tqdm import tqdm
import glob
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from sklearn.model_selection import train_test_split, StratifiedGroupKFold
from sklearn.metrics import accuracy_score, average_precision_score

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
class Config:
    """Notebook-wide settings; read everywhere through the module-level `cfg` instance."""

    # --- input locations -------------------------------------------------
    train_dir1 = "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog"
    train_dir2 = "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog"

    # --- windowing -------------------------------------------------------
    # A window holds `window_size` samples: `window_past` before the target
    # row and `window_future` from the target row onwards.
    window_size = 128
    window_future = 8
    window_past = window_size - window_future
    # Sub-sampling stride: the dataset keeps every `wx`-th raw row of a window.
    wx = 8

    # --- optimisation ----------------------------------------------------
    batch_size = 512
    lr = 1.5e-4
    num_epochs = 8
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'

    # --- column names ----------------------------------------------------
    feature_list = ['AccV', 'AccML', 'AccAP']
    label_list = ['StartHesitation', 'Turn', 'Walking']


cfg = Config()

In [3]:
# Display the device selected in Config (bare expression, rendered as the cell output).
cfg.device

'cuda'

In [4]:
# Analysis of positive instances in each fold of our CV folds (tdcsfog)

tdcs_dir = "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog"
tdcs_fold = 2  # validation fold, picked from the per-fold statistics printed below

# Here I am using the metadata file available during training. Since the code will run again during submission, if 
# I used the usual file from the competition folder, it would have been updated with the test files too.
metadata = pd.read_csv("/kaggle/input/copy-train-metadata/tdcsfog_metadata.csv")

# Per-file number of positive rows for each event class, plus the file length.
n1_sum = []
n2_sum = []
n3_sum = []
count = []

for f in tqdm(metadata['Id']):
    df = pd.read_csv(f"{tdcs_dir}/{f}.csv")

    n1_sum.append(np.sum(df['StartHesitation']))
    n2_sum.append(np.sum(df['Turn']))
    n3_sum.append(np.sum(df['Walking']))
    count.append(len(df))

metadata['n1_sum'] = n1_sum
metadata['n2_sum'] = n2_sum
metadata['n3_sum'] = n3_sum
metadata['count'] = count

# Computed from the data instead of printing a hard-coded number.
n_all_positive = int(((metadata['n1_sum'] > 0) & (metadata['n2_sum'] > 0) & (metadata['n3_sum'] > 0)).sum())
print(f"{n_all_positive} files have positive values in all 3 classes")

# Subject-grouped 5-fold split. `y` is constant, so there is nothing to
# stratify on; only the grouping by subject and the shuffle seed matter.
# The split is deterministic (fixed random_state), so the fold used for
# training is captured in the same pass that prints the statistics.
sgkf = StratifiedGroupKFold(n_splits=5, random_state=42, shuffle=True)
for i, (train_index, valid_index) in enumerate(sgkf.split(X=metadata['Id'], y=[1]*len(metadata), groups=metadata['Subject'])):
    print(f"Fold = {i}")
    print(f"Length of Train = {len(train_index)}, Length of Valid = {len(valid_index)}")

    train_n1 = metadata.loc[train_index, 'n1_sum'].sum()
    train_n2 = metadata.loc[train_index, 'n2_sum'].sum()
    train_n3 = metadata.loc[train_index, 'n3_sum'].sum()
    print(f"Train classes: {train_n1:,}, {train_n2:,}, {train_n3:,}")

    valid_n1 = metadata.loc[valid_index, 'n1_sum'].sum()
    valid_n2 = metadata.loc[valid_index, 'n2_sum'].sum()
    valid_n3 = metadata.loc[valid_index, 'n3_sum'].sum()
    print(f"Valid classes: {valid_n1:,}, {valid_n2:,}, {valid_n3:,}")

    if i == tdcs_fold:
        train_ids = metadata.loc[train_index, 'Id']
        valid_ids = metadata.loc[valid_index, 'Id']

# FOLD 2 is the most well balanced
# The actual train-test split (based on Fold 2)
print(f"Fold = {tdcs_fold}")
print(f"Length of Train = {len(train_ids)}, Length of Valid = {len(valid_ids)}")

train_fpaths_tdcs = [f"{tdcs_dir}/{_id}.csv" for _id in train_ids]
valid_fpaths_tdcs = [f"{tdcs_dir}/{_id}.csv" for _id in valid_ids]

100%|██████████| 833/833 [00:25<00:00, 33.15it/s]


32 files have positive values in all 3 classes
Fold = 0
Length of Train = 672, Length of Valid = 161
Train classes: 287,832, 1,462,652, 175,633
Valid classes: 16,958, 216,130, 32,205
Fold = 1
Length of Train = 613, Length of Valid = 220
Train classes: 51,748, 909,505, 65,242
Valid classes: 253,042, 769,277, 142,596
Fold = 2
Length of Train = 703, Length of Valid = 130
Train classes: 271,881, 1,332,746, 183,673
Valid classes: 32,909, 346,036, 24,165
Fold = 3
Length of Train = 649, Length of Valid = 184
Train classes: 303,710, 1,517,147, 205,196
Valid classes: 1,080, 161,635, 2,642
Fold = 4
Length of Train = 695, Length of Valid = 138
Train classes: 303,989, 1,493,078, 201,608
Valid classes: 801, 185,704, 6,230
Fold = 2
Length of Train = 703, Length of Valid = 130


### defog

In [5]:
# Analysis of positive instances in each fold of our CV folds (defog)

defog_dir = "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog"
defog_fold = 1  # validation fold, picked from the per-fold statistics printed below

# Here I am using the metadata file available during training. Since the code will run again during submission, if 
# I used the usual file from the competition folder, it would have been updated with the test files too.
metadata = pd.read_csv("/kaggle/input/copy-train-metadata/defog_metadata.csv")
metadata['n1_sum'] = 0
metadata['n2_sum'] = 0
metadata['n3_sum'] = 0
metadata['count'] = 0

for f in tqdm(metadata['Id']):
    fpath = f"{defog_dir}/{f}.csv"
    # Not every Id in the metadata has a csv in this folder; such rows keep
    # count == 0 and are dropped right after the loop.
    if not os.path.exists(fpath):
        continue

    df = pd.read_csv(fpath)
    is_file = metadata['Id'] == f
    metadata.loc[is_file, 'n1_sum'] = np.sum(df['StartHesitation'])
    metadata.loc[is_file, 'n2_sum'] = np.sum(df['Turn'])
    metadata.loc[is_file, 'n3_sum'] = np.sum(df['Walking'])
    metadata.loc[is_file, 'count'] = len(df)

metadata = metadata[metadata['count'] > 0].reset_index()

# Subject-grouped 5-fold split. `y` is constant, so there is nothing to
# stratify on; only the grouping by subject and the shuffle seed matter.
# The split is deterministic (fixed random_state), so the fold used for
# training is captured in the same pass that prints the statistics.
sgkf = StratifiedGroupKFold(n_splits=5, random_state=42, shuffle=True)
for i, (train_index, valid_index) in enumerate(sgkf.split(X=metadata['Id'], y=[1]*len(metadata), groups=metadata['Subject'])):
    print(f"Fold = {i}")
    print(f"Length of Train = {len(train_index)}, Length of Valid = {len(valid_index)}")

    train_n1 = metadata.loc[train_index, 'n1_sum'].sum()
    train_n2 = metadata.loc[train_index, 'n2_sum'].sum()
    train_n3 = metadata.loc[train_index, 'n3_sum'].sum()
    print(f"Train classes: {train_n1:,}, {train_n2:,}, {train_n3:,}")

    valid_n1 = metadata.loc[valid_index, 'n1_sum'].sum()
    valid_n2 = metadata.loc[valid_index, 'n2_sum'].sum()
    valid_n3 = metadata.loc[valid_index, 'n3_sum'].sum()
    print(f"Valid classes: {valid_n1:,}, {valid_n2:,}, {valid_n3:,}")

    if i == defog_fold:
        train_ids = metadata.loc[train_index, 'Id']
        valid_ids = metadata.loc[valid_index, 'Id']

# FOLD 1 is the one actually used for defog (the original comment said
# fold 2, but the code selected fold 1 and the recorded output confirms it).
# The actual train-test split (based on Fold 1)
print(f"Fold = {defog_fold}")
print(f"Length of Train = {len(train_ids)}, Length of Valid = {len(valid_ids)}")

train_fpaths_de = [f"{defog_dir}/{_id}.csv" for _id in train_ids]
valid_fpaths_de = [f"{defog_dir}/{_id}.csv" for _id in valid_ids]

100%|██████████| 137/137 [00:26<00:00,  5.25it/s]

Fold = 0
Length of Train = 75, Length of Valid = 16
Train classes: 500, 428,683, 37,609
Valid classes: 0, 158,803, 60,910
Fold = 1
Length of Train = 65, Length of Valid = 26
Train classes: 216, 490,429, 84,955
Valid classes: 284, 97,057, 13,564
Fold = 2
Length of Train = 76, Length of Valid = 15
Train classes: 410, 488,634, 87,986
Valid classes: 90, 98,852, 10,533
Fold = 3
Length of Train = 70, Length of Valid = 21
Train classes: 435, 424,494, 88,800
Valid classes: 65, 162,992, 9,719
Fold = 4
Length of Train = 78, Length of Valid = 13
Train classes: 439, 517,704, 94,726
Valid classes: 61, 69,782, 3,793
Fold = 1
Length of Train = 65, Length of Valid = 26





In [6]:
# Tag every csv path with the dataset it comes from ('de' = defog, 'tdcs' = tdcsfog);
# defog entries come first, then tdcsfog.
train_fpaths = [*((path, 'de') for path in train_fpaths_de), *((path, 'tdcs') for path in train_fpaths_tdcs)]
valid_fpaths = [*((path, 'de') for path in valid_fpaths_de), *((path, 'tdcs') for path in valid_fpaths_tdcs)]

In [7]:
class FOGDataset(Dataset):
    """Windowed accelerometer dataset built from a list of ``(csv_path, source)`` pairs.

    Every file is loaded up front, cast to float16, stacked row-wise into one
    array and zero-padded at both ends so that a full window can be sliced
    around any data row.

    Args:
        fpaths: sequence of ``(path, source)`` tuples. ``source == "tdcs"`` marks
            a tdcsfog file; any other value is handled as defog (see ``read``).
        scale: stored on the instance; not applied anywhere in this class.
        split: ``"train"`` draws a random row per item and reports a fixed epoch
            length; ``"test"`` walks rows in order and returns an id instead of
            labels; any other value walks rows in order and returns labels.

    NOTE(review): files are concatenated back to back, so a window close to a
    file boundary contains rows from the neighbouring file.
    """

    def __init__(self, fpaths, scale=9.806, split="train"):
        super().__init__()
        tm = time.time()
        self.split = split
        self.scale = scale

        self.fpaths = fpaths
        # File id = file name without its 4-character extension (".csv").
        self.f_ids = [os.path.basename(f[0])[:-4] for f in self.fpaths]

        # Cast per file so the full-precision copy of the whole dataset never
        # has to exist in memory at once.
        arrays = [self.read(f[0], f[1]).astype(np.float16) for f in fpaths]

        # Row count of every file and the cumulative (exclusive) end offset of
        # each file inside the stacked array; used to map a flat index back to
        # (file, row) in the test split.
        self.shapes = [a.shape[0] for a in arrays]
        self.end_indices = np.cumsum(self.shapes).tolist()

        data = np.concatenate(arrays, axis=0)
        self.length = data.shape[0]
        n_cols = data.shape[1]

        # The padding must share the data dtype: np.zeros defaults to float64
        # and concatenating would silently promote the whole array, undoing
        # the float16 cast above.
        pad_past = np.zeros((cfg.wx*cfg.window_past, n_cols), dtype=data.dtype)
        pad_future = np.zeros((cfg.wx*cfg.window_future, n_cols), dtype=data.dtype)
        self.dfs = np.concatenate([pad_past, data, pad_future], axis=0)

        print(f"Dataset initialized in {time.time() - tm} secs!")
        del arrays, data
        gc.collect()

    def read(self, f, _type):
        """Load one csv and return it as a 2-D numpy array.

        For the test split the frame is returned untouched. Otherwise a ``tdcs``
        flag column is appended, and tdcsfog files additionally get constant
        ``Valid`` and ``Task`` columns set to 1.
        """
        df = pd.read_csv(f)
        if self.split == "test":
            return np.array(df)

        if _type == "tdcs":
            df['Valid'] = 1
            df['Task'] = 1
            df['tdcs'] = 1
        else:
            df['tdcs'] = 0

        return np.array(df)

    def __getitem__(self, index):
        """Return one window.

        Returns ``(x, y, t)`` for train/validation and ``(_id, x, t)`` for test:
        ``x`` is the sub-sampled, time-reversed window of columns 1..3, ``y`` is
        columns 4..6 of the target row, and ``t`` is the product of the
        third-to-last and second-to-last columns of the target row (used by the
        training code as a per-sample loss weight).
        """
        # Offset of the first real row inside the padded array.
        offset = cfg.wx*cfg.window_past

        if self.split == "train":
            # The requested index is ignored: training rows are sampled uniformly.
            row_idx = random.randint(0, self.length-1) + offset
        elif self.split == "test":
            # First file whose end offset lies beyond `index`.
            df_idx = int(np.searchsorted(self.end_indices, index, side='right'))
            row_idx_true = self.shapes[df_idx] - (self.end_indices[df_idx] - index)
            _id = self.f_ids[df_idx] + "_" + str(row_idx_true)
            row_idx = index + offset
        else:
            row_idx = index + offset

        # Slice the raw window, keep every wx-th row, then reverse the time axis.
        x = self.dfs[row_idx - offset : row_idx + cfg.wx*cfg.window_future, 1:4]
        x = x[::cfg.wx, :][::-1, :]
        x = torch.tensor(x.astype('float'))#/scale

        # Multiply in float64 so the result type does not depend on the
        # storage dtype of self.dfs.
        t = np.float64(self.dfs[row_idx, -3]) * np.float64(self.dfs[row_idx, -2])

        if self.split == "test":
            return _id, x, t

        y = self.dfs[row_idx, 4:7].astype('float')
        y = torch.tensor(y)

        return x, y, t

    def __len__(self):
        # Training items are random draws, so the epoch length is a fixed
        # number of samples rather than the number of rows.
        if self.split == "train":
            return 5_000_000
        return self.length

In [8]:
# Run a garbage-collection pass; the integer shown as the cell output is gc.collect()'s return value.
gc.collect()

23

# Model

In [9]:
class UNet(nn.Module):
    """1-D convolutional classifier producing 3 logits per input window.

    Input is ``(batch, time, 3)``; output is ``(batch, 3)`` raw logits (no
    sigmoid is applied in ``forward``).

    Only the encoder path (conv1_* .. conv3a_*), the global average pool and
    the two linear layers are used by ``forward``. The decoder-style layers
    (``upconv1``, ``conv4_*``, ``upconv2``, ``conv5_*``) and ``sigmoid`` are
    defined but never called; they are kept so that the parameter set and the
    ``state_dict`` keys stay the same for previously saved checkpoints.
    """

    def __init__(self):
        super().__init__()
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        # 7 + 8 zeros around a kernel of 16 keeps the sequence length unchanged.
        self.pad = nn.ConstantPad1d((7, 8), 0)
        self.conv1_1 = nn.Conv1d(in_channels=3, out_channels=64, kernel_size=16, stride=1)
        self.conv1_2 = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=16, stride=1)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        self.conv2_1 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=16, stride=1)
        self.conv2_2 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=16, stride=1)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        self.conv3_1 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=16, stride=1)
        self.conv3_2 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=16, stride=1)
        self.conv3_3 = nn.Conv1d(in_channels=256, out_channels=256, kernel_size=16, stride=1)

        self.pool3 = nn.MaxPool1d(kernel_size=2, stride=2)
        self.conv3a_1 = nn.Conv1d(in_channels=256, out_channels=512, kernel_size=16, stride=1)
        self.conv3a_2 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=16, stride=1)
        self.conv3a_3 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=16, stride=1)

        # Not used by forward(); retained for checkpoint compatibility.
        self.upconv1 = nn.ConvTranspose1d(in_channels=256, out_channels=128, kernel_size=2, stride=2)
        self.conv4_1 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=16, stride=1)
        self.conv4_2 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=16, stride=1)
        self.upconv2 = nn.ConvTranspose1d(in_channels=128, out_channels=64, kernel_size=2, stride=2)
        self.conv5_1 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=16, stride=1)
        self.conv5_2 = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=16, stride=1)
        self.conv5_3 = nn.Conv1d(in_channels=64, out_channels=3, kernel_size=1, stride=1)

        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc1 = nn.Linear(512, 512)
        self.fc2 = nn.Linear(512, 3)

    def _conv_stack(self, h, convs):
        """Apply pad -> conv -> ReLU for each convolution in `convs`, in order."""
        for conv in convs:
            h = self.relu(conv(self.pad(h)))
        return h

    def forward(self, x):
        # (batch, time, channels) -> (batch, channels, time) as Conv1d expects.
        # Uses the tensor's own transpose rather than a NumPy function on a tensor.
        h = x.transpose(1, 2)

        h = self._conv_stack(h, (self.conv1_1, self.conv1_2))
        h = self.pool1(h)
        h = self._conv_stack(h, (self.conv2_1, self.conv2_2))
        h = self.pool2(h)
        h = self._conv_stack(h, (self.conv3_1, self.conv3_2, self.conv3_3))
        h = self.pool3(h)
        h = self._conv_stack(h, (self.conv3a_1, self.conv3a_2, self.conv3a_3))

        # Collapse the time axis, then classify.
        pooled = self.global_avg_pool(h).squeeze(2)
        hidden = self.relu(self.fc1(pooled))
        return self.fc2(hidden)

In [10]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad == False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

# Training

In [11]:
from torch.cuda.amp import GradScaler

def train_one_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over `loader` and print the mean batch loss.

    Each batch yields `(x, y, t)`; `t` is a per-sample weight that multiplies
    the element-wise loss, and the batch loss is normalised by `sum(t)`.
    `criterion` must return un-reduced (element-wise) losses.
    """
    grad_scaler = GradScaler()
    running_loss = 0.

    model.train()
    for inputs, targets, weights in tqdm(loader):
        inputs = inputs.to(cfg.device).float()
        targets = targets.to(cfg.device).float()
        weights = weights.to(cfg.device).float()

        logits = model(inputs)
        elementwise = criterion(logits, targets)
        # Weight each sample, then average over the class dimension.
        per_sample = torch.mean(elementwise*weights.unsqueeze(-1), dim=1)

        weight_total = torch.sum(weights)
        batch_total = torch.sum(per_sample)
        # A batch with no weighted samples contributes a zero loss (and zero
        # gradients) instead of dividing by zero.
        loss = batch_total/weight_total if weight_total > 0 else batch_total*0.

        grad_scaler.scale(loss).backward()
        grad_scaler.step(optimizer)
        grad_scaler.update()

        optimizer.zero_grad()

        running_loss += loss.item()

    print(f"Train Loss: {(running_loss/len(loader)):.04f}")
    

def validation_one_epoch(model, loader, criterion):
    """Evaluate `model` on `loader`; print loss and average-precision scores.

    The loss is computed exactly as in training (per-sample weight `t`,
    normalised by `sum(t)`). Average precision is computed per class on the
    samples whose weight is positive, and the mean over the 3 classes is
    returned.
    """
    running_loss = 0.
    targets_all, logits_all, weights_all = [], [], []

    model.eval()
    for inputs, targets, weights in tqdm(loader):
        inputs = inputs.to(cfg.device).float()
        targets = targets.to(cfg.device).float()
        weights = weights.to(cfg.device).float()

        with torch.no_grad():
            logits = model(inputs)
            elementwise = criterion(logits, targets)
            per_sample = torch.mean(elementwise*weights.unsqueeze(-1), dim=1)

            weight_total = torch.sum(weights)
            batch_total = torch.sum(per_sample)
            loss = batch_total/weight_total if weight_total > 0 else batch_total*0.

        running_loss += loss.item()
        targets_all.append(targets.cpu().numpy())
        logits_all.append(logits.cpu().numpy())
        weights_all.append(weights.cpu().numpy())

    # Score only the samples that carry a positive weight.
    keep = np.concatenate(weights_all, axis=0) > 0
    y_true = np.concatenate(targets_all, axis=0)[keep, :]
    y_pred = np.concatenate(logits_all, axis=0)[keep, :]

    scores = [average_precision_score(y_true[:, k], y_pred[:, k]) for k in range(3)]
    mean_score = np.mean(scores)
    print(f"Validation Loss: {(running_loss/len(loader)):.04f}, Validation Score: {mean_score:.03f}, ClassWise: {scores[0]:.03f},{scores[1]:.03f},{scores[2]:.03f}")

    return mean_score

In [None]:
# Seed every RNG this cell relies on so that weight initialisation, loader
# shuffling and the random row sampling in the training dataset are repeatable.
# NOTE(review): DataLoader workers derive their seeds from torch's generator;
# confirm that `random` is re-seeded per worker in the installed PyTorch
# version. GPU kernels may still introduce small non-deterministic differences.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

model = UNet().to(cfg.device)
print(f"Number of parameters in model - {count_parameters(model):,}")

train_dataset = FOGDataset(train_fpaths, split="train")
valid_dataset = FOGDataset(valid_fpaths, split="valid")
print(f"lengths of datasets: train - {len(train_dataset)}, valid - {len(valid_dataset)}")

train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, num_workers=5, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=cfg.batch_size, num_workers=5)

optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr)
# reduction='none' is required: the train/validation loops apply their own
# per-sample weighting before reducing.
criterion = torch.nn.BCEWithLogitsLoss(reduction='none').to(cfg.device)
# sched = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.85)

# Best validation score seen so far; the checkpoint is overwritten whenever it improves.
max_score = 0.0

print("="*50)
for epoch in range(cfg.num_epochs):
    print(f"Epoch: {epoch}")
    train_one_epoch(model, train_loader, optimizer, criterion)
    score = validation_one_epoch(model, valid_loader, criterion)
    # sched.step()

    if score > max_score:
        max_score = score
        torch.save(model.state_dict(), "best_model_state.h5")
        print("Saving Model ...")

    print("="*50)

gc.collect()