In [13]:
import os, sys, pdb, time
from pathlib import Path
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.signal
import pickle
%matplotlib inline
# Project directories, all relative to the notebook's working directory.
data_dir, model_dir, out_dir = (
    Path(location) for location in ('../input', '../model', '../output')
)
# One sub-directory per recorded subject.
subject_dir_names = [
    'Dog_1',
    'Dog_2',
    'Dog_3',
    'Dog_4',
    'Dog_5',
    'Patient_1',
    'Patient_2',
]
# Names of the dataset splits and of the two target classes.
partial_name_list = ['train', 'val', 'test']
class_names = ['preictal', 'interictal']

In [2]:
import torch
from torch import nn
import torchvision
import random

# Single seed shared by every random number generator used in the notebook.
seed = 0

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) so runs are repeatable.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, models

In [17]:
# NOTE(review): debugging leftover. Displays the whole interactive namespace,
# including the input history and the absolute working directory; consider
# removing this cell and clearing its output before sharing the notebook.
locals()

{'__name__': '__main__',
 '__doc__': 'Automatically created module for IPython interactive environment',
 '__package__': None,
 '__loader__': None,
 '__spec__': None,
 '__builtin__': <module 'builtins' (built-in)>,
 '__builtins__': <module 'builtins' (built-in)>,
 '_ih': ['',
  'from sklearn import metrics\nimport eeglib',
  "from sklearn import metrics\nimport eeglib\nget_ipython().run_line_magic('autoreload', '1')",
  "get_ipython().system('pwd')",
  "get_ipython().system('cd ../')",
  "get_ipython().system('cd ../')\nget_ipython().system('pwd')",
  'import os\nos.chdir',
  'import os\nos.chdir()',
  "import os\nos.chdir('../')",
  "get_ipython().system('ls')",
  "from sklearn import metrics\nimport eeglib\nget_ipython().run_line_magic('autoreload', '1')",
  '__file__',
  '__file__()',
  '__name__',
  'locals()',
  '__file__',
  '__file__()',
  'locals()'],
 '_oh': {6: <function posix.chdir(path)>, 13: '__main__', 14: {...}},
 '_dh': ['/home/tomoya/workspace/kaggle/seizure-prediction

In [8]:
import os
# Move the working directory one level up. The path must be a string literal;
# the unquoted form is a SyntaxError.
# NOTE: not idempotent -- every re-run of this cell climbs one more directory.
os.chdir('../')

In [20]:
from sklearn import metrics
import eeglib
%load_ext autoreload
%autoreload 1

In [4]:
def move_file(src_path, dst_path):
    """Rename ``src_path`` to ``dst_path`` with ``os.rename``.

    Both arguments are converted with ``str`` first, so ``pathlib.Path``
    objects and plain strings are accepted alike.
    """
    source, destination = str(src_path), str(dst_path)
    os.rename(source, destination)

    
def split_seizure_dataset(data_dir, train_ratio=0.8, shuffle=False, grouping_again=False):
    """Distribute the files of one subject directory over split sub-directories.

    A sub-directory is created under ``data_dir`` for every name in
    ``partial_name_list``. Files directly inside ``data_dir`` are then moved:

    * files whose name contains ``'test'`` go to the ``'test'`` directory;
    * files whose name contains ``'preictal'`` or ``'interictal'`` are split,
      per class, into ``'train'`` (the first ``int(n * train_ratio)`` files)
      and ``'val'`` (the rest).

    Args:
        data_dir: Subject directory (``str`` or ``pathlib.Path``).
        train_ratio: Fraction of each class that is moved to ``'train'``.
        shuffle: If true, sort the files and shuffle them with a fixed seed
            (0) before splitting, so the split is reproducible.
        grouping_again: If true, first move every file found in the split
            sub-directories back into ``data_dir`` (undoes an earlier split).
    """
    # Accept both str and Path; `.name` below needs a Path.
    data_dir = Path(data_dir)
    partial_dirs = {part: data_dir / part for part in partial_name_list}

    for partial_data_path in partial_dirs.values():
        partial_data_path.mkdir(exist_ok=True)
        if grouping_again:
            # Snapshot the listing first: the directory is modified while we move.
            for grouped_path in list(partial_data_path.iterdir()):
                move_file(grouped_path, data_dir / grouped_path.name)

    all_files = [path for path in data_dir.iterdir() if path.is_file()]

    if shuffle:
        # Sort first so the outcome does not depend on directory listing order.
        # A private generator seeded with 0 gives the same permutation as
        # `random.seed(0); random.shuffle(...)` without resetting the global RNG.
        all_files.sort()
        random.Random(0).shuffle(all_files)

    preictal_files = [path for path in all_files if 'preictal' in path.name]
    interictal_files = [path for path in all_files if 'interictal' in path.name]
    test_files = [path for path in all_files if 'test' in path.name]

    print('{} : \n # of preictal_files \t {} \n # of interictal_files \t {}'.format(
        data_dir.name, len(preictal_files), len(interictal_files)))

    for file_path in test_files:
        move_file(src_path=file_path, dst_path=partial_dirs['test'] / file_path.name)

    for class_files in (preictal_files, interictal_files):
        # Exactly int(n * train_ratio) files go to train. The previous `<=`
        # comparison sent one extra file per class to train.
        n_train = int(len(class_files) * train_ratio)
        for i, file_path in enumerate(class_files):
            part = 'train' if i < n_train else 'val'
            move_file(src_path=file_path, dst_path=partial_dirs[part] / file_path.name)


In [5]:
# Re-create the train/val/test split for every subject: files from any earlier
# split are first moved back, then shuffled with a fixed seed and split 80/20.
for subject_dir_name in subject_dir_names:
    split_seizure_dataset(
        data_dir / subject_dir_name,
        grouping_again=True,
        shuffle=True,
        train_ratio=0.8,
    )

Dog_1 : 
 # of preictal_files 	 24 
 # of interictal_files 	 480
Dog_2 : 
 # of preictal_files 	 42 
 # of interictal_files 	 500
Dog_3 : 
 # of preictal_files 	 72 
 # of interictal_files 	 1440
Dog_4 : 
 # of preictal_files 	 97 
 # of interictal_files 	 804
Dog_5 : 
 # of preictal_files 	 30 
 # of interictal_files 	 450
Patient_1 : 
 # of preictal_files 	 18 
 # of interictal_files 	 50
Patient_2 : 
 # of preictal_files 	 18 
 # of interictal_files 	 42


In [6]:
# Hard-coded stand-in for the command-line arguments of the training script
# (args.sample_rate, args.window_size, args.window_stride, args.window,
# args.noise_dir, args.noise_prob, args.noise_min / args.noise_max).
eeg_conf = dict(sample_rate=16000,
                window_size=0.02,
                window_stride=0.01,
                window='hamming',
                noise_dir=None,
                noise_prob=0.4,
                noise_levels=(0.0, 0.5))
# Window functions by name. They are taken from scipy.signal.windows: the
# scipy.signal.<name> aliases are deprecated and missing from recent SciPy.
windows = {'hamming': scipy.signal.windows.hamming,
           'hann': scipy.signal.windows.hann,
           'blackman': scipy.signal.windows.blackman,
           'bartlett': scipy.signal.windows.bartlett}
eeg_conf

{'sample_rate': 16000,
 'window_size': 0.02,
 'window_stride': 0.01,
 'window': 'hamming',
 'noise_dir': None,
 'noise_prob': 0.4,
 'noise_levels': (0.0, 0.5)}

In [14]:
class EEGParser:
    """Holds the EEG parsing configuration and loads raw recordings."""

    def __init__(self, eeg_conf, normalize=False, augment=False):
        """Copy the parsing parameters out of ``eeg_conf``.

        Args:
            eeg_conf: Mapping with the keys ``'window_stride'``,
                ``'window_size'``, ``'sample_rate'`` and ``'window'``.
            normalize: Stored on the instance; not used by this class itself.
            augment: If true, ``parse_eeg`` raises ``NotImplementedError``.
        """
        self.window_stride = eeg_conf['window_stride']
        self.window_size = eeg_conf['window_size']
        self.sample_rate = eeg_conf['sample_rate']
        # Unknown window names silently fall back to the Hamming window.
        self.window = windows.get(eeg_conf['window'], windows['hamming'])
        self.normalize = normalize
        self.augment = augment

    def parse_eeg(self, eeg_path) -> np.ndarray:
        """Load the recording at ``eeg_path`` and return its ``'data'`` entry.

        Raises:
            NotImplementedError: If the parser was created with ``augment=True``.
        """
        if self.augment:
            raise NotImplementedError
        # NOTE(review): `EEG` is not defined in the visible notebook cells;
        # presumably it comes from `eeglib` -- confirm it is in scope.
        eeg = EEG(eeg_path).load()
        return eeg['data']


class EEGDataSet(Dataset, EEGParser):
    """Dataset of EEG recordings listed, one path per line, in a manifest file."""

    def __init__(self, manifest_filepath, normalize=False, augment=False):
        """Read the manifest and configure the parser from the global ``eeg_conf``.

        Args:
            manifest_filepath: Text file with one recording path per line.
            normalize: Forwarded to ``EEGParser``.
            augment: Forwarded to ``EEGParser``.
        """
        with open(manifest_filepath, 'r') as f:
            # Blank lines (e.g. a trailing newline) would become empty paths.
            path_list = [line.strip() for line in f if line.strip()]

        self.path_list = path_list
        self.size = len(path_list)
        super(EEGDataSet, self).__init__(eeg_conf, normalize, augment)

    def __getitem__(self, index):
        """Return ``(data, label)`` for the recording at position ``index``."""
        eeg_path = self.path_list[index]
        y = self.parse_eeg(eeg_path)
        # The label is the third '_'-separated field of the FILE NAME. Splitting
        # the whole path picks the wrong field as soon as a directory name
        # contains '_' (the subject directories do).
        # Assumes names like 'Dog_1_interictal_segment_0001.mat' -- TODO confirm.
        label = Path(eeg_path).name.split('_')[2]
        return y, label

    def __len__(self):
        """Number of recordings listed in the manifest."""
        return self.size


class EEGDataLoader(DataLoader):
    """``DataLoader`` subclass for EEG datasets; currently adds no behaviour."""

In [8]:
def compile_manifest(data_dir, csv_name):
    """Placeholder for manifest creation; not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()

In [9]:
# Show the split names that the manifest and dataset cells iterate over.
partial_name_list

['train', 'val', 'test']

In [10]:
# Write one manifest per split for the subject selected by `data_version`.
# Each manifest is a header-less, single-column CSV holding the resolved
# (absolute) path of every entry in that split directory.
Path(data_dir / 'manifests').mkdir(exist_ok=True)
data_version = 'Dog_1'
for partial_name in partial_name_list:
    # Use `data_version` for the directory too, so changing the subject in one
    # place is enough. Sort because iterdir() yields entries in arbitrary
    # order, which would make the manifests irreproducible.
    df = pd.DataFrame(sorted(Path(data_dir / data_version / partial_name).resolve().iterdir()))
    df.to_csv(str(data_dir / 'manifests' / '{}_{}_manifest.csv'.format(data_version, partial_name)),
              header=None, index=False)

In [11]:
# One EEGDataSet per split, each built from that split's manifest file.
datasets = dict(
    (part, EEGDataSet(data_dir / 'manifests' / '{}_{}_manifest.csv'.format(data_version, part)))
    for part in partial_name_list
)

In [12]:
# torchvision transform pipelines keyed by split name.
# NOTE(review): these are image transforms and no visible cell passes them to
# the datasets or loaders -- confirm whether they are still needed.
data_transforms = dict(
    # Training: tensor conversion only.
    train=transforms.Compose([transforms.ToTensor()]),
    # Validation: resize, centre-crop to 224, convert, then normalise
    # (presumably with ImageNet statistics -- TODO confirm).
    val=transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
)

# One loader per split. Only the training split is shuffled, so mini-batches
# are not drawn in manifest order; val and test keep a fixed order.
dataloaders = {part: EEGDataLoader(datasets[part], batch_size=2, num_workers=2,
                                   shuffle=(part == 'train'))
               for part in partial_name_list}

# The CUDA auto-detection used to be overwritten by an unconditional
# `device = 'cpu'` on the next line. The switch is now explicit: CPU is still
# forced (same behaviour as before); set force_cpu = False to use a GPU.
force_cpu = True
device = torch.device("cuda:0" if torch.cuda.is_available() and not force_cpu else "cpu")
device

'cpu'

## DataLoader

## Now, finally, writing the model

In [13]:
class AverageMeter(object):
    """Tracks the latest value and the running weighted average of a metric.

    Attributes are ``val`` (last value), ``sum`` (weighted sum), ``count``
    (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the average.

        Args:
            val: The new value.
            n: Weight of the value, e.g. the batch size.
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # While the total weight is still zero (e.g. an empty batch, n=0),
        # keep the average at 0 instead of dividing by zero.
        self.avg = self.sum / self.count if self.count else 0


In [14]:
class Args:
    """Fixed training hyper-parameters, standing in for parsed CLI arguments."""

    def __init__(self):
        # Optimiser step size and number of training epochs.
        self.lr, self.epochs = 3e-4, 50
        # False means training starts from scratch rather than from a checkpoint.
        self.continue_from = False
        # Momentum handed to the SGD optimiser.
        self.momentum = 0.9

In [15]:
# train.py (notebook prototype)
# Fine-tunes an ImageNet-pretrained VGG11-BN as a two-class classifier over
# `class_names` and keeps the weights of the epoch with the best validation
# ROC AUC in `best_model_wts`.
args = Args()

# Directory intended for saved models.
model_dir.mkdir(exist_ok=True)

# Per-epoch result slots, zero-initialised (torch.Tensor(n) is uninitialised memory).
roc_results = torch.zeros(args.epochs)
prec_results = torch.zeros(args.epochs)
rec_results = torch.zeros(args.epochs)

best_roc = None
best_model_wts = None

avg_loss, avg_auc, start_epoch, start_iter, optim_state = 0, 0, 0, 0, None

if args.continue_from:  # TODO: Starting from previous model
    raise NotImplementedError

# Replace the last classifier layer so the network outputs one logit per class.
model = models.vgg11_bn(pretrained=True)
num_ftrs = model.classifier[6].in_features
model.classifier[6] = nn.Linear(num_ftrs, len(class_names))
input_size = 224
model = model.to(device)

parameters = model.parameters()
optimizer = torch.optim.SGD(parameters, lr=args.lr,
                            momentum=args.momentum, nesterov=True, weight_decay=1e-5)

if optim_state is not None:
    optimizer.load_state_dict(optim_state)

# The model emits raw logits of shape (batch, n_classes). CrossEntropyLoss takes
# exactly that plus integer class indices; BCELoss needs probabilities and a
# target of the same shape as the input.
criterion = nn.CrossEntropyLoss()
batch_time = AverageMeter()
data_time = AverageMeter()
losses = AverageMeter()

for epoch in range(start_epoch, args.epochs):
    for phase in ['train', 'val']:
        # Batch-norm and dropout behave differently in training and evaluation.
        if phase == 'train':
            model.train()
        else:
            model.eval()

        losses.reset()  # keep train and val losses apart
        epoch_labels, epoch_scores = [], []
        end = time.time()

        # enumerate() yields (index, batch); the batch itself is (inputs, labels).
        for i, (inputs, labels) in enumerate(dataloaders[phase]):
            # measure data loading time
            data_time.update(time.time() - end)

            # NOTE(review): VGG expects float input shaped (batch, 3, H, W); the
            # raw EEG arrays presumably need a transform first -- TODO confirm.
            inputs = inputs.to(device).float()
            # The dataset returns class names as strings; map them to indices.
            if not torch.is_tensor(labels):
                labels = torch.tensor([class_names.index(name) for name in labels])
            labels = labels.to(device)

            optimizer.zero_grad()

            with torch.set_grad_enabled(phase == 'train'):
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                if phase == 'train':
                    loss.backward()
                    optimizer.step()

            losses.update(loss.item(), inputs.size(0))
            # Collect labels and the probability of class index 1 for the epoch AUC.
            epoch_labels.extend(labels.tolist())
            epoch_scores.extend(torch.softmax(outputs.detach(), dim=1)[:, 1].tolist())

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                (epoch + 1), (i + 1), len(dataloaders[phase]),
                batch_time=batch_time, data_time=data_time, loss=losses))

        avg_loss = losses.avg

        # ROC AUC is only defined when both classes occur, so compute it once
        # per phase rather than per batch (metrics.auc is not ROC AUC).
        epoch_auc = None
        if len(set(epoch_labels)) > 1:
            epoch_auc = metrics.roc_auc_score(epoch_labels, epoch_scores)
            avg_auc = epoch_auc

        # Keep a deep copy of the weights with the best validation ROC AUC.
        if phase == 'val' and epoch_auc is not None:
            roc_results[epoch] = epoch_auc
            if best_roc is None or epoch_auc > best_roc:
                best_roc = epoch_auc
                best_model_wts = copy.deepcopy(model.state_dict())
        

{'__header__': b'MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Thu Aug 21 01:00:00 2014', '__version__': '1.0', '__globals__': [], 'interictal_segment_347': array([[(array([[  1, -28, -48, ..., -18,  -6,  11],
       [-12, -14, -30, ..., -80, -79, -84],
       [-36, -29, -21, ...,  -1, -10,  -6],
       ...,
       [ 18,  28,  33, ...,  12,  17,  20],
       [ -2,   7,  14, ...,  25,  18,  12],
       [ -5,   0,   3, ...,  23,  20,  19]], dtype=int16), array([[600]], dtype=uint16), array([[399.6097561]]), array([[array(['NVC1202_32_002_Ecog_c001'], dtype='<U24'),
        array(['NVC1202_32_002_Ecog_c002'], dtype='<U24'),
        array(['NVC1202_32_002_Ecog_c003'], dtype='<U24'),
        array(['NVC1202_32_002_Ecog_c004'], dtype='<U24'),
        array(['NVC1202_32_002_Ecog_c005'], dtype='<U24'),
        array(['NVC1202_32_002_Ecog_c006'], dtype='<U24'),
        array(['NVC1202_32_002_Ecog_c007'], dtype='<U24'),
        array(['NVC1202_32_002_Ecog_c008'], dtype='<U24'),
        arra

KeyError: 'Traceback (most recent call last):\n  File "<ipython-input-7-a578fd81c1d8>", line 44, in _parse_mat\n    for i, key in enumerate(mat[self.data_col].dtype.names):\nKeyError: \'1_interictal_segment\'\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n  File "/home/tomoya/anaconda3/envs/kaggle/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 138, in _worker_loop\n    samples = collate_fn([dataset[i] for i in batch_indices])\n  File "/home/tomoya/anaconda3/envs/kaggle/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 138, in <listcomp>\n    samples = collate_fn([dataset[i] for i in batch_indices])\n  File "<ipython-input-7-a578fd81c1d8>", line 99, in __getitem__\n    y = self.parse_eeg(eeg_path)\n  File "<ipython-input-7-a578fd81c1d8>", line 81, in parse_eeg\n    eeg = EEG(eeg_path).load()\n  File "<ipython-input-7-a578fd81c1d8>", line 20, in load\n    eeg = self._parse_mat()\n  File "<ipython-input-7-a578fd81c1d8>", line 48, in _parse_mat\n    "not implemented except this key.".format(self.data_file))\nKeyError: "eeg_file {} doesn\'t have info about \'interictal_segment_1\', not implemented except this key."\n'

In [None]:
# TODO
# - Define the EEG DataLoader and EEG Dataset
# - What project structure best suits experiment-driven ML development? How should data and model versions be managed?