In [60]:
import sys
sys.path.append('..')
from dataset import CachingImagesDataset

import torch
import torch.nn
import torch.optim
from torch.utils.data import DataLoader
from torchvision.models import densenet201
from torchvision import transforms

from sklearn.utils import shuffle
import numpy as np
import pandas as pd

from collections import OrderedDict
import json
from os.path import basename, join
from os import listdir, makedirs

In [7]:
def prepare_model(model_params):
    model = densenet201(pretrained=True)
    
    return model

In [None]:
#TODO:
# prepare_for_training - + load hparams, - train/valid loggers
# prepare_model - + prepare model and optimizer by params

In [41]:
hparams_path = '../hparams.json'
with open(hparams_path, 'r') as f:
    hparams = json.load(f, object_pairs_hook=OrderedDict)

In [10]:
def prepare_training(hparams):
    if 'model_params' not in hparams:
        raise Exception('You must add model params to hparams')
    
    model = prepare_model(hparams['model_params'])
    
    if 'criterion_params' not in hparams or \
        'criterion' not in hparams['criterion_params']:
        raise Exception('You must add criterion params to hparams')
    
    criterion_params = hparams['criterion_params']
    criterion = torch.nn.__dict__[criterion_params['criterion']](**criterion_params)
    if torch.cuda.is_available():
        criterion = criterion.cuda()
    
    if 'optimizer_params' not in hparams or \
        'optimizer' not in hparams['optimizer_params']:
        raise Exception('You must add optimizer params to hparams')
    
    optimizer_params = hparams['optimizer_params']
    optimizer = torch.optim.__dict__[optimizer_params['optimizer']](
        filter(lambda p: p.requires_grad, model.parameters()),
        **optimizer_params
    )
    
    if 'scheduler_params' in hparams:
        scheduler_params = hparams['scheduler_params']
        if 'scheduler' not in scheduler_params:
            raise Exception('If you provided scheduler params you also must add scheduler name')
        
        scheduler = torch.optim.lr_scheduler.__dict__[scheduler_params['scheduler']](
            optimizer, **scheduler_params
        )
    else:
        scheduler = None
    
    return model, criterion, optimzer, scheduler

In [39]:
TARGET_LABEL_NAMES = ['MEL', 'BCC', 'AKIEC', 'BKL', 'DF', 'VASC', 'NV']

In [53]:
np.arange(5).repeat(10)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4])

In [61]:
def column_fold_split(df, column, folds_seed, n_folds):
    df_tmp = []
    labels = shuffle(sorted(df[column].unique()), random_state=folds_seed)
    for i, fold_labels in enumerate(np.array_split(labels, n_folds)):
        df_label = df[df[column].isin(fold_labels)]
        df_label['fold'] = i
        df_tmp.append(df_label)
    df = pd.concat(df_tmp)
    
    return df

def read_labels(path):
    labels = pd.read_csv(path,
                        dtype={**{'image': str},
                               **{label: int for label in TARGET_LABEL_NAMES}})
    labels.image += '.jpg'
    
    return labels

def prepare_data_loaders(hparams):
    if 'data_params' not in hparams:
        raise Exception('You must provide data params in hparams')
    
    data_params = hparams['data_params']
    if 'transforms' not in data_params or not isinstance(data_params['transforms'], list):
        raise Exception('You must add transforms list into hparams.data_params')
    
    transforms_list = []
    for transform_info in data_params['transforms']:
        transform_name = transform_info['name']
        transform_params = transform_info['params']
        if transform_params is not None:
            transform = transforms.__dict__[transform_name](**transform_params)
        else:
            transform = transforms.__dict__[transform_name]()
        transforms_list.append(transform)
    transform = transforms.Compose(transforms_list)
        
    if 'labels_path' not in data_params:
        raise Exception('You must add labels_path into hparams')
    
    if 'n_folds' not in data_params or 'train_folds' not in data_params or \
        'folds_split_column' not in data_params or 'folds_seed' not in data_params:
        raise Exception('You must add n_folds, train_folds, folds_split_column' \
                        'and folds_seed into hparams.data_params')
    
    labels = read_labels(data_params['labels_path'])
    labels = column_fold_split(labels, data_params['folds_split_column'],
                      data_params['folds_seed'], data_params['n_folds'])
    
    if 'images_path' not in data_params:
        raise Exception('You must add images_path into hparams.data_params')
    
    train_folds = list(map(int, data_params['train_folds'].split(",")))
    train_labels = labels[labels['fold'].isin(train_folds)]
    valid_labels = labels[~labels['fold'].isin(train_folds)]
    
    train_labels = train_labels.reset_index().drop('index', axis=1)
    valid_labels = valid_labels.reset_index().drop('index', axis=1)
    
    train_dataset = CachingImagesDataset(train_labels, data_params['images_path'],
                                         TARGET_LABEL_NAMES, transform=transform,
                                         image_filename_column=data_params['folds_split_column'])
    valid_dataset = CachingImagesDataset(valid_labels, data_params['images_path'],
                                         TARGET_LABEL_NAMES, transform=transforms.ToTensor(),
                                         image_filename_column=data_params['folds_split_column'])
    
    if 'training_params' not in hparams or 'batch_size' not in hparams['training_params']:
        raise Exception('You must add training_params with batch_size specified in hparams')
    training_params = hparams['training_params']
    
    train_loader = DataLoader(train_dataset, batch_size=training_params['batch_size'],
                              shuffle=True, num_workers=-1,
                              pin_memory=torch.cuda.is_available())
    valid_loader = DataLoader(valid_dataset, batch_size=training_params['batch_size'],
                              shuffle=False, num_workers=-1,
                              pin_memory=torch.cuda.is_available())
    
    return train_loader, valid_loader

In [62]:
train_loader, valid_loader = prepare_data_loaders(hparams)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
def run_train(hparams, args):
    best_loss = int(1e10)
    best_metrics = None
    
    