<a href="https://colab.research.google.com/github/SrinijaVaibhavi/Ml-Competition-/blob/main/ml_competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import bisect
import gc
import glob
import json
import os
import random
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, average_precision_score
from sklearn.model_selection import train_test_split, StratifiedGroupKFold
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
class Config:
    """Central place for the paths and hyper-parameters read through ``cfg``."""

    # Training-data folders of the two recording sources.
    train_dir1 = (
        "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction"
        "/train/defog"
    )
    train_dir2 = (
        "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction"
        "/train/tdcsfog"
    )

    # DataLoader batch size.
    batch_size = 1_024
    # Number of (strided) samples in one model input window ...
    window_size = 32
    # ... of which this many lie after the target row ...
    window_future = 8
    # ... and the remainder lie before it.
    window_past = window_size - window_future

    # Row stride used when sub-sampling a window from the raw recording.
    wx = 8

    # Default constructor arguments of the MLP.
    model_dropout = 0.2
    model_hidden = 512
    model_nblocks = 3

    # Optimiser learning rate and number of training epochs.
    lr = 1.5e-4
    num_epochs = 8
    # Use the GPU whenever torch can see one.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Column names of the sensor channels and of the three event labels.
    feature_list = ['AccV', 'AccML', 'AccAP']
    label_list = ['StartHesitation', 'Turn', 'Walking']


# Single shared configuration instance used by every later cell.
cfg = Config()

In [None]:
# Display the device string picked in Config ('cuda' when torch sees a GPU, otherwise 'cpu').
cfg.device

'cuda'

In [None]:

# ---------------------------------------------------------------------------
# tdcsfog recordings: count positive labels per file, print the class balance
# of every subject-grouped fold, and keep the file lists of one chosen fold.
# ---------------------------------------------------------------------------

# Fold whose train/valid split is used for the tdcsfog part of the data.
TDCS_SELECTED_FOLD = 2

n1 = []     # per-file number of positive StartHesitation rows
n2 = []     # per-file number of positive Turn rows
n3 = []     # per-file number of positive Walking rows
count = []  # per-file number of rows

metadata = pd.read_csv("/kaggle/input/copy-train-metadata/tdcsfog_metadata.csv")

for f in tqdm(metadata['Id']):
    fpath = f"/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog/{f}.csv"
    df = pd.read_csv(fpath)

    n1.append(np.sum(df['StartHesitation']))
    n2.append(np.sum(df['Turn']))
    n3.append(np.sum(df['Walking']))
    count.append(len(df))

# Computed instead of hard-coded so the message stays true if the data changes.
n_all_three = sum(1 for a, b, c in zip(n1, n2, n3) if a > 0 and b > 0 and c > 0)
print(f"{n_all_three} files have positive values in all 3 classes")

metadata['n1'] = n1
metadata['n2'] = n2
metadata['n3'] = n3
metadata['count'] = count

# y is constant, so only the grouping by Subject constrains the split:
# every subject's files end up on one side of each fold.
sgkf = StratifiedGroupKFold(n_splits=5, random_state=42, shuffle=True)
for i, (train_index, valid_index) in enumerate(sgkf.split(X=metadata['Id'], y=[1]*len(metadata), groups=metadata['Subject'])):
    print(f"Fold = {i}")
    print(f"Length of Train = {len(train_index)}, Length of Valid = {len(valid_index)}")

    # The per-fold totals (not the per-file lists n1/n2/n3) are what gets printed;
    # formatting a list with ':,' raises TypeError.
    n1_sum = metadata.loc[train_index, 'n1'].sum()
    n2_sum = metadata.loc[train_index, 'n2'].sum()
    n3_sum = metadata.loc[train_index, 'n3'].sum()
    print(f"Train classes: {n1_sum:,}, {n2_sum:,}, {n3_sum:,}")

    n1_sum = metadata.loc[valid_index, 'n1'].sum()
    n2_sum = metadata.loc[valid_index, 'n2'].sum()
    n3_sum = metadata.loc[valid_index, 'n3'].sum()
    print(f"Valid classes: {n1_sum:,}, {n2_sum:,}, {n3_sum:,}")

    # Remember the ids of the chosen fold; the split is seeded, so capturing it
    # here yields the same ids as re-running the splitter a second time.
    if i == TDCS_SELECTED_FOLD:
        train_ids = metadata.loc[train_index, 'Id']
        valid_ids = metadata.loc[valid_index, 'Id']

print(f"Fold = {TDCS_SELECTED_FOLD}")
print(f"Length of Train = {len(train_ids)}, Length of Valid = {len(valid_ids)}")

train_fpaths_tdcs = [f"/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog/{_id}.csv" for _id in train_ids]
valid_fpaths_tdcs = [f"/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog/{_id}.csv" for _id in valid_ids]

100%|██████████| 833/833 [00:16<00:00, 51.10it/s]

32 files have positive values in all 3 classes
Fold = 0
Length of Train = 672, Length of Valid = 161
Train classes: 287,832, 1,462,652, 175,633
Valid classes: 16,958, 216,130, 32,205
Fold = 1
Length of Train = 613, Length of Valid = 220
Train classes: 51,748, 909,505, 65,242
Valid classes: 253,042, 769,277, 142,596
Fold = 2
Length of Train = 703, Length of Valid = 130
Train classes: 271,881, 1,332,746, 183,673
Valid classes: 32,909, 346,036, 24,165
Fold = 3
Length of Train = 649, Length of Valid = 184
Train classes: 303,710, 1,517,147, 205,196
Valid classes: 1,080, 161,635, 2,642
Fold = 4
Length of Train = 695, Length of Valid = 138
Train classes: 303,989, 1,493,078, 201,608
Valid classes: 801, 185,704, 6,230
Fold = 2
Length of Train = 703, Length of Valid = 130





In [None]:

# ---------------------------------------------------------------------------
# defog recordings: count positive labels per file, print the class balance
# of every subject-grouped fold, and keep the file lists of one chosen fold.
# ---------------------------------------------------------------------------

# Fold whose train/valid split is used for the defog part of the data.
DEFOG_SELECTED_FOLD = 1

metadata = pd.read_csv("/kaggle/input/copy-train-metadata/defog_metadata.csv")
metadata['n1'] = 0     # positive StartHesitation rows per file
metadata['n2'] = 0     # positive Turn rows per file
metadata['n3'] = 0     # positive Walking rows per file
metadata['count'] = 0  # rows per file; stays 0 when the CSV is missing

for f in tqdm(metadata['Id']):
    fpath = f"/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog/{f}.csv"
    # Some metadata ids have no CSV in this folder; they are filtered out below.
    if not os.path.exists(fpath):
        continue

    df = pd.read_csv(fpath)
    is_this_file = metadata['Id'] == f
    metadata.loc[is_this_file, 'n1'] = np.sum(df['StartHesitation'])
    metadata.loc[is_this_file, 'n2'] = np.sum(df['Turn'])
    metadata.loc[is_this_file, 'n3'] = np.sum(df['Walking'])
    metadata.loc[is_this_file, 'count'] = len(df)

# Keep only ids whose file was found, and renumber rows 0..n-1 so the
# positional indices returned by the splitter can be used with .loc.
metadata = metadata[metadata['count'] > 0].reset_index()

# y is constant, so only the grouping by Subject constrains the split.
sgkf = StratifiedGroupKFold(n_splits=5, random_state=42, shuffle=True)
for i, (train_index, valid_index) in enumerate(sgkf.split(X=metadata['Id'], y=[1]*len(metadata), groups=metadata['Subject'])):
    print(f"Fold = {i}")
    print(f"Length of Train = {len(train_index)}, Length of Valid = {len(valid_index)}")

    # The count columns are named n1/n2/n3; 'n1_sum' etc. do not exist
    # and indexing them raises KeyError.
    n1_sum = metadata.loc[train_index, 'n1'].sum()
    n2_sum = metadata.loc[train_index, 'n2'].sum()
    n3_sum = metadata.loc[train_index, 'n3'].sum()
    print(f"Train classes: {n1_sum:,}, {n2_sum:,}, {n3_sum:,}")

    n1_sum = metadata.loc[valid_index, 'n1'].sum()
    n2_sum = metadata.loc[valid_index, 'n2'].sum()
    n3_sum = metadata.loc[valid_index, 'n3'].sum()
    print(f"Valid classes: {n1_sum:,}, {n2_sum:,}, {n3_sum:,}")

    # Remember the ids of the chosen fold; the split is seeded, so capturing it
    # here yields the same ids as re-running the splitter a second time.
    if i == DEFOG_SELECTED_FOLD:
        train_ids = metadata.loc[train_index, 'Id']
        valid_ids = metadata.loc[valid_index, 'Id']

print(f"Fold = {DEFOG_SELECTED_FOLD}")
print(f"Length of Train = {len(train_ids)}, Length of Valid = {len(valid_ids)}")

train_fpaths_de = [f"/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog/{_id}.csv" for _id in train_ids]
valid_fpaths_de = [f"/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog/{_id}.csv" for _id in valid_ids]

100%|██████████| 137/137 [00:22<00:00,  6.18it/s]

Fold = 0
Length of Train = 75, Length of Valid = 16
Train classes: 500, 410,998, 31,612
Valid classes: 0, 157,267, 60,789
Fold = 1
Length of Train = 65, Length of Valid = 26
Train classes: 216, 475,613, 81,579
Valid classes: 284, 92,652, 10,822
Fold = 2
Length of Train = 76, Length of Valid = 15
Train classes: 410, 470,765, 83,170
Valid classes: 90, 97,500, 9,231
Fold = 3
Length of Train = 70, Length of Valid = 21
Train classes: 435, 413,452, 84,247
Valid classes: 65, 154,813, 8,154
Fold = 4
Length of Train = 78, Length of Valid = 13
Train classes: 439, 502,232, 88,996
Valid classes: 61, 66,033, 3,405
Fold = 1
Length of Train = 65, Length of Valid = 26





In [None]:
# Pair every CSV path with a source tag ('de' or 'tdcs'); FOGDataset.read uses
# the tag to decide which extra columns to add. defog files come first.
train_fpaths = [(path, 'de') for path in train_fpaths_de]
train_fpaths += [(path, 'tdcs') for path in train_fpaths_tdcs]

valid_fpaths = [(path, 'de') for path in valid_fpaths_de]
valid_fpaths += [(path, 'tdcs') for path in valid_fpaths_tdcs]

In [None]:
class FOGDataset(Dataset):
    """In-memory sliding-window dataset over a list of recording CSV files.

    All files are read up front, stacked row-wise into one float16 array and
    zero-padded at both ends so that a window can be cut around any row.
    Windows are cut from the stacked array, so a window close to a file
    boundary also contains rows of the neighbouring file.

    Column layout relied upon by ``__getitem__``: columns 1..3 are the input
    channels, columns 4..6 the labels, and the product of the third- and
    second-to-last columns is returned as a per-row weight ``t``.
    NOTE(review): for defog training files this presumably means the CSV
    already ends with ``Valid`` and ``Task`` columns -- confirm against the data.

    Args:
        fpaths: sequence of ``(csv_path, source)`` pairs, where ``source`` is
            ``'tdcs'`` or anything else (treated as defog).
        scale: stored on the instance; not applied to the data.
        split: ``"train"`` (random windows, fixed nominal length), ``"test"``
            (returns ``(id, x, t)``) or any other value (sequential windows,
            returns ``(x, y, t)``).

    Raises:
        ValueError: if ``fpaths`` is empty.
    """

    def __init__(self, fpaths, scale=9.806, split="train"):
        super().__init__()
        tm = time.time()
        self.split = split
        self.scale = scale

        self.fpaths = fpaths
        self.dfs = [self.read(f[0], f[1]) for f in fpaths]
        if not self.dfs:
            raise ValueError("FOGDataset needs at least one (path, source) pair")
        # File id = file name without its 4-character extension (".csv").
        self.f_ids = [os.path.basename(f[0])[:-4] for f in self.fpaths]

        # end_indices[k] is the exclusive end row of file k in the stacked array;
        # shapes[k] is its number of rows.
        self.end_indices = []
        self.shapes = []
        _length = 0
        for df in self.dfs:
            self.shapes.append(df.shape[0])
            _length += df.shape[0]
            self.end_indices.append(_length)

        self.dfs = np.concatenate(self.dfs, axis=0).astype(np.float16)
        self.length = self.dfs.shape[0]

        shape1 = self.dfs.shape[1]

        # Pad with zeros of the SAME dtype: default float64 zeros would make
        # np.concatenate promote the whole array back to float64 and throw
        # away the memory saved by the float16 cast above.
        pad_past = np.zeros((cfg.wx*cfg.window_past, shape1), dtype=self.dfs.dtype)
        pad_future = np.zeros((cfg.wx*cfg.window_future, shape1), dtype=self.dfs.dtype)
        self.dfs = np.concatenate([pad_past, self.dfs, pad_future], axis=0)
        print(f"Dataset initialized in {time.time() - tm} secs!")
        gc.collect()

    def read(self, f, _type):
        """Read one CSV and return its rows as a 2-D array.

        For the test split the file is returned unchanged. Otherwise a
        ``tdcs`` flag column is appended (1 for ``'tdcs'`` files, 0 for the
        rest); ``'tdcs'`` files additionally get ``Valid`` and ``Task``
        columns set to 1 in front of it.
        """
        df = pd.read_csv(f)
        if self.split == "test":
            return np.array(df)

        if _type == "tdcs":
            df['Valid'] = 1
            df['Task'] = 1
            df['tdcs'] = 1
        else:
            df['tdcs'] = 0

        return np.array(df)

    def __getitem__(self, index):
        """Return the window around one row.

        In the train split ``index`` is ignored and a uniformly random row is
        used instead. The window covers ``cfg.window_past`` samples before and
        ``cfg.window_future`` samples from the target row on, taking every
        ``cfg.wx``-th row, and is returned in reversed row order.

        Returns:
            ``(x, y, t)`` for train/valid, ``(id, x, t)`` for test, where
            ``id`` is ``"<file id>_<row number within that file>"``.

        Raises:
            IndexError: in the test split, if ``index`` is past the last row.
        """
        past = cfg.wx*cfg.window_past
        future = cfg.wx*cfg.window_future

        if self.split == "train":
            row_idx = random.randint(0, self.length-1) + past
        elif self.split == "test":
            # First file whose exclusive end lies beyond `index` (binary search
            # instead of scanning every file for every item).
            df_idx = bisect.bisect_right(self.end_indices, index)
            if df_idx >= len(self.end_indices):
                raise IndexError(f"index {index} is out of range for {self.length} rows")

            row_idx_true = self.shapes[df_idx] - (self.end_indices[df_idx] - index)
            _id = self.f_ids[df_idx] + "_" + str(row_idx_true)
            row_idx = index + past
        else:
            row_idx = index + past

        # Rows are offset by `past` because of the zero padding at the front.
        x = self.dfs[row_idx - past : row_idx + future, 1:4]
        x = x[::cfg.wx, :][::-1, :]
        x = torch.tensor(x.astype('float'))

        # Multiply in float64 so `t` keeps the same dtype as before the
        # storage array became float16.
        t = np.float64(self.dfs[row_idx, -3]) * np.float64(self.dfs[row_idx, -2])

        if self.split == "test":
            return _id, x, t

        y = self.dfs[row_idx, 4:7].astype('float')
        y = torch.tensor(y)

        return x, y, t

    def __len__(self):
        """Nominal epoch size: 5,000,000 random draws for train, else the row count."""
        if self.split == "train":
            return 5_000_000
        return self.length

In [None]:
# Force a garbage-collection pass; the number gc.collect() returns is shown as the cell output.
gc.collect()

23

In [None]:
def _block(in_features, out_features, drop_rate):
    return nn.Sequential(
        nn.Linear(in_features, out_features),
        nn.BatchNorm1d(out_features),
        nn.ReLU(),
        nn.Dropout(drop_rate)
    )

class FOGModel(nn.Module):
    """Fully-connected network mapping one flattened window to 3 logits.

    Layout: a linear input layer (``cfg.window_size*3`` -> ``dim``), then
    ``nblocks`` hidden blocks built by ``_block``, then a linear output layer
    (``dim`` -> 3). ``self.dropout`` is created but not applied in ``forward``.

    Args:
        p: dropout probability used inside every hidden block.
        dim: hidden width.
        nblocks: number of hidden blocks.
    """

    def __init__(self, p=cfg.model_dropout, dim=cfg.model_hidden, nblocks=cfg.model_nblocks):
        super().__init__()
        self.dropout = nn.Dropout(p)
        self.in_layer = nn.Linear(cfg.window_size*3, dim)
        hidden_blocks = [_block(dim, dim, p) for _ in range(nblocks)]
        self.blocks = nn.Sequential(*hidden_blocks)
        self.out_layer = nn.Linear(dim, 3)

    def forward(self, x):
        """Flatten each window to one row and return the raw (un-activated) outputs."""
        flat = x.view(-1, cfg.window_size*3)
        hidden = self.blocks(self.in_layer(flat))
        return self.out_layer(hidden)

In [None]:
def count_parameters(model):
    """Return the total number of elements over all parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [None]:
from torch.cuda.amp import GradScaler

def train_one_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader`` and print the mean batch loss.

    Each batch yields ``(x, y, t)``; ``t`` is a per-sample weight multiplied
    into the element-wise loss, and the batch loss is normalised by the sum
    of the weights. A batch whose weights sum to zero contributes a zero loss.

    Args:
        model: network to train (switched to train mode here).
        loader: iterable of ``(x, y, t)`` batches.
        optimizer: optimiser stepped once per batch through a ``GradScaler``.
        criterion: element-wise loss (called as ``criterion(pred, target)``).
    """
    grad_scaler = GradScaler()
    running_loss = 0.

    model.train()
    for inputs, targets, weights in tqdm(loader):
        inputs = inputs.to(cfg.device).float()
        targets = targets.to(cfg.device).float()
        weights = weights.to(cfg.device).float()

        per_element = criterion(model(inputs), targets)
        # Weight every sample, then average over its output columns.
        per_sample = torch.mean(per_element*weights.unsqueeze(-1), dim=1)

        weight_total = torch.sum(weights)
        loss_total = torch.sum(per_sample)
        # Multiplying by 0. (rather than using a constant) keeps the graph
        # intact so backward() still works on an all-zero-weight batch.
        batch_loss = loss_total/weight_total if weight_total > 0 else loss_total*0.

        grad_scaler.scale(batch_loss).backward()
        grad_scaler.step(optimizer)
        grad_scaler.update()

        optimizer.zero_grad()

        running_loss += batch_loss.item()

    print(f"Train Loss: {(running_loss/len(loader)):.04f}")
    

def validation_one_epoch(model, loader, criterion):
    """Evaluate ``model`` on ``loader``; print loss and average precision.

    The loss is computed exactly like in training (weighted by ``t`` and
    normalised by the weight sum). For the score, only samples with
    ``t > 0`` are kept and ``average_precision_score`` is computed per output
    column on the raw model outputs.

    Args:
        model: network to evaluate (switched to eval mode here).
        loader: iterable of ``(x, y, t)`` batches.
        criterion: element-wise loss (called as ``criterion(pred, target)``).

    Returns:
        Mean of the three per-column average-precision scores.
    """
    running_loss = 0.
    targets_all, outputs_all, weights_all = [], [], []

    model.eval()
    for inputs, targets, weights in tqdm(loader):
        inputs = inputs.to(cfg.device).float()
        targets = targets.to(cfg.device).float()
        weights = weights.to(cfg.device).float()

        with torch.no_grad():
            outputs = model(inputs)
            per_element = criterion(outputs, targets)
            per_sample = torch.mean(per_element*weights.unsqueeze(-1), dim=1)

            weight_total = torch.sum(weights)
            loss_total = torch.sum(per_sample)
            batch_loss = loss_total/weight_total if weight_total > 0 else loss_total*0.

        running_loss += batch_loss.item()
        targets_all.append(targets.cpu().numpy())
        outputs_all.append(outputs.cpu().numpy())
        weights_all.append(weights.cpu().numpy())

    # Score only the rows that carried a positive weight.
    keep = np.concatenate(weights_all, axis=0) > 0
    targets_kept = np.concatenate(targets_all, axis=0)[keep, :]
    outputs_kept = np.concatenate(outputs_all, axis=0)[keep, :]

    scores = [average_precision_score(targets_kept[:, col], outputs_kept[:, col]) for col in range(3)]
    mean_score = np.mean(scores)
    print(f"Validation Loss: {(running_loss/len(loader)):.04f}, Validation Score: {mean_score:.03f}, ClassWise: {scores[0]:.03f},{scores[1]:.03f},{scores[2]:.03f}")

    return mean_score

In [None]:
# Seed every RNG that feeds training (weight init, dropout, and the random
# window sampling done in FOGDataset) so that Restart & Run All repeats the
# same run. DataLoader workers derive their seeds from torch's RNG.
# NOTE(review): GPU kernels may still introduce small non-determinism.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

model = FOGModel().to(cfg.device)
print(f"Number of parameters in model - {count_parameters(model):,}")

train_dataset = FOGDataset(train_fpaths, split="train")
valid_dataset = FOGDataset(valid_fpaths, split="valid")
print(f"lengths of datasets: train - {len(train_dataset)}, valid - {len(valid_dataset)}")

train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, num_workers=5, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=cfg.batch_size, num_workers=5)

optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr)
# reduction='none' keeps the element-wise loss so the epoch functions can
# weight each sample before reducing.
criterion = torch.nn.BCEWithLogitsLoss(reduction='none').to(cfg.device)

# Best validation score seen so far; a checkpoint is written on every improvement.
max_score = 0.0

print("="*50)
for epoch in range(cfg.num_epochs):
    print(f"Epoch: {epoch}")
    train_one_epoch(model, train_loader, optimizer, criterion)
    score = validation_one_epoch(model, valid_loader, criterion)

    if score > max_score:
        max_score = score
        torch.save(model.state_dict(), "best_model_state.h5")
        print("Saving Model ...")

    print("="*50)

gc.collect()

Number of parameters in model - 842,243
Dataset initialized in 28.17720866203308 secs!
Dataset initialized in 8.077937126159668 secs!
lengths of datasets: train - 5000000, valid - 4984740
Epoch: 0


100%|██████████| 4883/4883 [02:32<00:00, 31.95it/s]


Train Loss: 0.1687


100%|██████████| 4868/4868 [02:02<00:00, 39.75it/s]


Validation Loss: 0.0725, Validation Score: 0.250, ClassWise: 0.043,0.687,0.020
Saving Model ...
Epoch: 1


100%|██████████| 4883/4883 [02:26<00:00, 33.41it/s]


Train Loss: 0.1478


100%|██████████| 4868/4868 [01:59<00:00, 40.89it/s]


Validation Loss: 0.0743, Validation Score: 0.245, ClassWise: 0.049,0.665,0.020
Epoch: 2


100%|██████████| 4883/4883 [02:26<00:00, 33.28it/s]


Train Loss: 0.1404


100%|██████████| 4868/4868 [01:56<00:00, 41.67it/s]


Validation Loss: 0.0798, Validation Score: 0.239, ClassWise: 0.041,0.654,0.023
Epoch: 3


100%|██████████| 4883/4883 [02:22<00:00, 34.19it/s]


Train Loss: 0.1348


100%|██████████| 4868/4868 [01:56<00:00, 41.65it/s]


Validation Loss: 0.0845, Validation Score: 0.237, ClassWise: 0.037,0.643,0.030
Epoch: 4


100%|██████████| 4883/4883 [02:26<00:00, 33.30it/s]


Train Loss: 0.1311


100%|██████████| 4868/4868 [01:57<00:00, 41.38it/s]


Validation Loss: 0.0864, Validation Score: 0.230, ClassWise: 0.041,0.619,0.031
Epoch: 5


100%|██████████| 4883/4883 [02:27<00:00, 33.08it/s]


Train Loss: 0.1276


100%|██████████| 4868/4868 [01:59<00:00, 40.61it/s]


Validation Loss: 0.0794, Validation Score: 0.249, ClassWise: 0.044,0.666,0.038
Epoch: 6


100%|██████████| 4883/4883 [02:28<00:00, 32.98it/s]


Train Loss: 0.1253


100%|██████████| 4868/4868 [01:59<00:00, 40.88it/s]


Validation Loss: 0.0855, Validation Score: 0.248, ClassWise: 0.038,0.668,0.039
Epoch: 7


100%|██████████| 4883/4883 [02:25<00:00, 33.56it/s]


Train Loss: 0.1228


100%|██████████| 4868/4868 [01:57<00:00, 41.56it/s]


Validation Loss: 0.0855, Validation Score: 0.246, ClassWise: 0.041,0.659,0.038


0

In [None]:
# Collect the test recordings of both sources and tag each path with its source.
test_defog_paths = glob.glob("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog/*.csv")
test_tdcsfog_paths = glob.glob("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/tdcsfog/*.csv")
test_fpaths = [(f, 'de') for f in test_defog_paths] + [(f, 'tdcs') for f in test_tdcsfog_paths]

test_dataset = FOGDataset(test_fpaths, split="test")
test_loader = DataLoader(test_dataset, batch_size=cfg.batch_size, num_workers=5)

# Predict with the best-scoring checkpoint written during training rather
# than with whatever weights the last epoch left in memory.
best_checkpoint = "best_model_state.h5"
if os.path.exists(best_checkpoint):
    model.load_state_dict(torch.load(best_checkpoint, map_location=cfg.device))
# Make inference mode explicit (disables dropout, uses BatchNorm running
# statistics) instead of relying on the state left by the validation pass.
model.eval()

ids = []
preds = []

for _id, x, _ in tqdm(test_loader):
    x = x.to(cfg.device).float()
    with torch.no_grad():
        # NOTE(review): raw model outputs are scaled by 0.1; the reason is not
        # evident from this notebook -- confirm before changing.
        y_pred = model(x)*0.1

    ids.extend(_id)
    # nan_to_num guards the submission against NaN/inf predictions.
    preds.extend(list(np.nan_to_num(y_pred.cpu().numpy())))

Dataset initialized in 0.3825681209564209 secs!


100%|██████████| 280/280 [00:04<00:00, 56.07it/s]


In [None]:
# Load the competition's sample submission; its 'Id' column is the left side of
# the merge that builds the final submission.
sample_submission = pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/sample_submission.csv")
# Show (rows, columns) as the cell output.
sample_submission.shape

(286370, 4)

In [None]:
# Stack the per-row predictions into one (n_rows, 3) array and round to 5 decimals.
preds = np.array(preds)
rounded = np.round(preds, 5)

prediction_columns = {'Id': ids}
for col_idx, col_name in enumerate(('StartHesitation', 'Turn', 'Walking')):
    prediction_columns[col_name] = rounded[:, col_idx]
submission = pd.DataFrame(prediction_columns)

# Align to the ids (and order) of the sample submission; ids without a
# prediction get 0.0 in every column.
submission = pd.merge(sample_submission[['Id']], submission, how='left', on='Id')
submission = submission.fillna(0.0)
submission.to_csv("submission.csv", index=False)

In [None]:
# Sanity check: print the submission's shape and display its first rows.
print(submission.shape)
submission.head()

(286370, 4)


Unnamed: 0,Id,StartHesitation,Turn,Walking
0,003f117e14_0,-0.91815,-0.54727,-1.06585
1,003f117e14_1,-0.9195,-0.54821,-1.06673
2,003f117e14_2,-0.91899,-0.54748,-1.06575
3,003f117e14_3,-0.91789,-0.54692,-1.06471
4,003f117e14_4,-0.91827,-0.54712,-1.06538
