In [2]:
from time import time
from tqdm import tqdm

from JigsawDataLoader import DataLoader
from JigsawNetwork import Network

from copy import copy
from fastprogress.fastprogress import master_bar, progress_bar
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# Index of the CUDA device to use, and the number of worker processes handed
# to the full-size DataLoaders created below.
gpu_id = 2
num_cores = 4
# Make `gpu_id` the current CUDA device for this process.
torch.cuda.set_device(gpu_id)

In [2]:
def subsample_dl(ds, bs, pct=0.1, seed=None, num_workers=4):
    """Build a DataLoader that iterates over a random subset of `ds`.

    Useful for debugging and rapid experimentation on a fraction of the data.

    ds          -> a dataset object supporting len() and indexing
    bs          -> batch size of the returned loader
    pct         -> fraction of the dataset to keep (default: 0.1)
    seed        -> optional seed for numpy's global RNG; fixes WHICH samples
                   are selected. Any integer, including 0, is honoured.
    num_workers -> number of loader worker processes (default: 4)

    Returns a torch.utils.data.DataLoader whose sampler is a
    SubsetRandomSampler over the chosen indices, so the order in which the
    subset is visited is still reshuffled on every pass.
    """
    # `if seed:` would treat seed=0 as "no seed"; compare against None instead.
    if seed is not None:
        np.random.seed(seed)
    size = len(ds)
    # Draw int(pct*size) distinct indices out of range(size).
    indices = np.random.choice(size, size=int(pct * size), replace=False)
    sampler = torch.utils.data.sampler.SubsetRandomSampler(indices)
    return torch.utils.data.DataLoader(dataset=ds,
                                       batch_size=bs,
                                       sampler=sampler,
                                       num_workers=num_workers)

#### train args

In [3]:
# Mini-batch size used by every DataLoader below, and the SGD learning rate.
bs = 32
lr = 0.01

In [27]:
# Read the training index CSV and wrap it in the project's dataset class
# (JigsawDataLoader.DataLoader), then build the full-size batch loader.
train_pth = pd.read_csv('../train.csv')
train_ds = DataLoader(train_pth)
train_loader = torch.utils.data.DataLoader(dataset=train_ds,
                                            batch_size=bs,
                                            shuffle=True,
                                            num_workers=num_cores)
# Subsample the training dataset itself. The previous argument `ds` is not
# defined anywhere in this notebook, so this cell only worked with leftover
# kernel state and raised NameError on a fresh kernel.
sub_train_loader = subsample_dl(train_ds, bs)

In [28]:
# Sanity check: number of samples, batches in the full loader, batches in the subsampled loader.
len(train_ds), len(train_loader), len(sub_train_loader)

(79997, 2500, 250)

In [24]:
# Read the validation index CSV and wrap it in the project's dataset class
# (JigsawDataLoader.DataLoader), then build the full-size batch loader.
val_pth = pd.read_csv('../valid.csv')
val_ds = DataLoader(val_pth)
# Evaluation does not benefit from shuffling; a fixed order keeps validation
# passes comparable from run to run.
val_loader = torch.utils.data.DataLoader(dataset=val_ds,
                                            batch_size=bs,
                                            shuffle=False,
                                            num_workers=num_cores)
sub_val_loader = subsample_dl(val_ds, bs)

In [25]:
# Sanity check: number of samples, batches in the full loader, batches in the subsampled loader.
len(val_ds), len(val_loader), len(sub_val_loader)

(20000, 625, 63)

In [29]:
# Select the loaders used for training and evaluation below. The subsampled
# loaders (default pct=0.1 of each dataset) keep experiments quick; point these
# at train_loader / val_loader to run on the full data.
train_data = sub_train_loader
val_data = sub_val_loader

#### TODO: add validation data

In [18]:
# Number of mini-batches the selected training loader yields per epoch.
iter_per_epoch = len(train_data)
# print('Images: train %d, validation %d'%(train_data.N,val_data.N))

In [11]:
# Instantiate the jigsaw network with its default arguments and move its
# parameters to the current CUDA device (Module.cuda() returns the module).
net = Network().cuda()

In [12]:
# Optimizer: SGD with momentum and weight decay over all network parameters.
# No criterion object is created here; the training loop calls
# F.cross_entropy directly.
# criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    net.parameters(),
    lr=lr,
    momentum=0.9,
    weight_decay=5e-4,
)

In [137]:
# Train for a fixed number of epochs, reporting the per-sample average
# training loss after each one.
num_epochs = 30
for epoch in range(num_epochs):
    net.train()  # switch the network to training mode for this epoch
    total_loss = 0
    total_items = 0
    for images, labels in progress_bar(train_data):
        images = images.cuda()
        labels = labels.cuda()

        # Forward pass and loss.
        outputs = net(images)
        loss = F.cross_entropy(outputs, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # F.cross_entropy returns the batch mean; weight it by the batch size
        # so the epoch figure is a true per-sample average even when the last
        # batch is smaller than the others.
        batch_items = len(labels)
        total_loss += loss.item() * batch_items
        total_items += batch_items

    train_loss = total_loss / total_items
    print(f'Train loss {train_loss:.2f}')

250it [01:20,  4.41it/s]

Train loss 6.71



250it [00:33,  8.14it/s]

Train loss 6.69



250it [00:18, 13.58it/s]

Train loss 6.70



250it [00:18, 13.63it/s]

Train loss 6.69



250it [00:18, 13.72it/s]

Train loss 6.70



250it [00:18, 13.36it/s]

Train loss 6.68



250it [00:19, 13.01it/s]

Train loss 6.64



250it [00:18, 13.40it/s]

Train loss 6.58



250it [00:18, 13.64it/s]

Train loss 6.50



250it [00:18, 13.59it/s]

Train loss 6.44



250it [00:18, 13.63it/s]

Train loss 6.46



250it [00:18, 13.67it/s]

Train loss 6.39



250it [00:18, 13.72it/s]

Train loss 6.70



250it [00:18, 13.60it/s]

Train loss 6.74



250it [00:18, 13.41it/s]

Train loss 6.73



250it [00:18, 13.58it/s]

Train loss 6.72



250it [00:18, 13.63it/s]

Train loss 6.72



250it [00:18, 13.38it/s]

Train loss 6.71



250it [00:18, 13.58it/s]

Train loss 6.71



250it [00:18, 13.32it/s]

Train loss 6.71



250it [00:19, 13.05it/s]

Train loss 6.71



250it [00:18, 13.33it/s]

Train loss 6.71



250it [00:19, 12.87it/s]

Train loss 6.70



250it [00:19, 12.84it/s]

Train loss 6.70



250it [00:19, 12.51it/s]

Train loss 6.70



250it [00:19, 12.81it/s]

Train loss 6.70



250it [00:19, 12.77it/s]

Train loss 6.70



250it [00:19, 12.75it/s]

Train loss 6.70



250it [00:19, 12.73it/s]

Train loss 6.70



250it [00:19, 12.71it/s]

Train loss 6.70





In [135]:
def valid_metrics(model, valid_dl):
    """Compute the average cross-entropy loss and top-1 accuracy of `model`.

    model    -> a torch.nn.Module producing class logits of shape
                (batch, classes)
    valid_dl -> an iterable yielding (images, labels) batches

    Prints both metrics and returns them as `(val_loss, val_acc)`, each a
    Python float averaged per sample. The model is left in eval mode.

    Raises ValueError if `valid_dl` yields no samples.
    """
    model.eval()

    # Run on whatever device the model lives on instead of assuming CUDA;
    # a parameter-less model falls back to CUDA when it is available.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    total_loss = 0
    total_true = 0
    total_items = 0
    # Inference only: skip building the autograd graph to save memory/time.
    with torch.no_grad():
        for images, y in valid_dl:
            images = images.to(device)
            y = y.to(device)
            out = model(images)
            batch_items = len(y)
            # cross_entropy returns the batch mean; re-weight by batch size.
            total_loss += F.cross_entropy(out, y).item() * batch_items
            total_true += (out.argmax(dim=1) == y).sum().item()
            total_items += batch_items

    if total_items == 0:
        raise ValueError('valid_dl yielded no samples')

    val_loss = total_loss / total_items
    val_acc = total_true / total_items
    print(f'Valid loss {val_loss:.2f}')
    print(f'Accuracy: {val_acc:.2f}')
    return val_loss, val_acc

In [139]:
# Evaluate the trained network on the selected validation loader.
valid_metrics(net, val_data)

Valid loss 6.72
Accuracy: 0.00


(6.723890724182129, 0.004)

# Original script

In [None]:
import os, sys, numpy as np
import argparse
from time import time
from tqdm import tqdm
from torch.autograd import Variable

# import tensorflow # tensorflow must be imported before torch, otherwise the process crashes #???
# sys.path.append('Utils')
# from logger import Logger
# from TrainingUtils import adjust_learning_rate, compute_accuracy

# Command-line interface of the original stand-alone training script.
# NOTE(review): parse_args() reads sys.argv and `data` is a required
# positional, so this is likely to exit with a usage error when executed
# inside a notebook kernel rather than as a script -- confirm before running.
parser = argparse.ArgumentParser(
    description='Train JigsawPuzzleSolver on Imagenet',
)

# Required dataset location.
parser.add_argument(
    'data',
    type=str,
    help='Path to Imagenet folder',
)

# Model / task options.
parser.add_argument(
    '--model',
    default=None,
    type=str,
    help='Path to pretrained model',
)
parser.add_argument(
    '--classes',
    default=1000,
    type=int,
    help='Number of permutation to use',
)
parser.add_argument(
    '--gpu',
    default=0,
    type=int,
    help='gpu id',
)

# Training schedule and bookkeeping.
parser.add_argument(
    '--epochs',
    default=70,
    type=int,
    help='number of total epochs for training',
)
parser.add_argument(
    '--iter_start',
    default=0,
    type=int,
    help='Starting iteration count',
)
parser.add_argument(
    '--batch',
    default=256,
    type=int,
    help='batch size',
)
parser.add_argument(
    '--checkpoint',
    default='checkpoints/',
    type=str,
    help='checkpoint folder',
)
parser.add_argument(
    '--lr',
    default=0.001,
    type=float,
    help='learning rate for SGD optimizer',
)
parser.add_argument(
    '--cores',
    default=0,
    type=int,
    help='number of CPU core for loading',
)

# Evaluation-only switch.
parser.add_argument(
    '-e', '--evaluate',
    dest='evaluate',
    action='store_true',
    help='evaluate model on validation set, No training',
)

args = parser.parse_args()

In [None]:
def main():
    """Entry point of the original training script.

    Reads every setting from the module-level `args` namespace. Builds the
    train/validation loaders, creates the network, optionally restores weights
    from the newest checkpoint (or from `args.model`), then either evaluates
    once (`args.evaluate`) or runs the SGD training loop with periodic
    console reporting, logging and checkpointing.

    NOTE(review): as written this function references several names that are
    not defined anywhere in the visible source -- `trainpath`, `logger`,
    `logger_test`, `adjust_learning_rate` and `compute_accuracy` (the import
    of the last two is commented out near the top of this cell group). It
    will raise NameError when those lines are reached; confirm where they
    should come from before running.
    """
# data loader
    # NOTE(review): `trainpath` is never assigned in the visible source;
    # presumably it should be derived from args.data the same way `valpath`
    # is below -- confirm.
    # NOTE(review): DataLoader is called here with (path, list file, classes=),
    # unlike the single-argument `DataLoader(train_pth)` call in the notebook
    # cells earlier in this file -- confirm which JigsawDataLoader version
    # this code targets.
    train_data = DataLoader(trainpath,args.data+'/ilsvrc12_train.txt',
                            classes=args.classes)
    train_loader = torch.utils.data.DataLoader(dataset=train_data,
                                            batch_size=args.batch,
                                            shuffle=True,
                                            num_workers=args.cores)
    
    # Use the '_255x255' variant of the validation folder when it exists.
    valpath = args.data+'/ILSVRC2012_img_val'
    if os.path.exists(valpath+'_255x255'):
        valpath += '_255x255'
    val_data = DataLoader(valpath, args.data+'/ilsvrc12_val.txt',
                            classes=args.classes)
    val_loader = torch.utils.data.DataLoader(dataset=val_data,
                                            batch_size=args.batch,
                                            shuffle=True,
                                            num_workers=args.cores)
# iterating through data
    
    # NOTE(review): `N` is assigned but never read in this function.
    N = train_data.N
    
    # True division: this is a float, not an integer batch count. It is only
    # used to convert args.iter_start into a starting epoch further down.
    iter_per_epoch = train_data.N/args.batch
    print('Images: train %d, validation %d'%(train_data.N,val_data.N))
    
# Network initialize
    net = Network(args.classes)
    # --gpu defaults to 0, so this branch is taken unless gpu is set to None
    # programmatically.
    if args.gpu is not None:
        net.cuda()
    
    ############## Load from checkpoint if exists, otherwise from model ###############
    if os.path.exists(args.checkpoint):
        # Any file whose name contains 'pth' counts as a checkpoint.
        files = [f for f in os.listdir(args.checkpoint) if 'pth' in f]
        if len(files)>0:
            # Lexicographic sort; the last name is taken as the newest checkpoint.
            files.sort()
            #print files
            ckp = files[-1]
            net.load_state_dict(torch.load(args.checkpoint+'/'+ckp))
            # Recover the step counter from the file name; this matches the
            # 'jps_<epoch>_<steps>.pth.tar' pattern written further down.
            args.iter_start = int(ckp.split(".")[-3].split("_")[-1])
            print('Starting from: ',ckp)
        else:
            # Checkpoint folder exists but is empty: fall back to --model.
            if args.model is not None:
                net.load(args.model)
    else:
        # No checkpoint folder at all: fall back to --model.
        if args.model is not None:
            net.load(args.model)

# loss func and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net.parameters(),lr=args.lr,momentum=0.9,weight_decay = 5e-4)
    
# Logger creation is disabled (author's note: not sure how to use it yet).
# NOTE(review): `logger` and `logger_test` are still referenced below.
#     logger = Logger(args.checkpoint+'/train')
#     logger_test = Logger(args.checkpoint+'/test')
    
    ############## TESTING ###############
    # Evaluation-only mode: run one validation pass (no logger) and stop.
    if args.evaluate:
        test(net,criterion,None,val_loader,0)
        return
    
    ############## TRAINING ###############
    print(('Start training: lr %f, batch size %d, classes %d'%(args.lr,args.batch,args.classes)))
    print(('Checkpoint: '+args.checkpoint))
    
    # Train the Model
    # Sliding windows (at most 100 entries each) of data-loading time and
    # forward-pass time, averaged for the progress line.
    batch_time, net_time = [], []
    steps = args.iter_start
    # Resume at the epoch implied by the restored step counter.
    for epoch in range(int(args.iter_start/iter_per_epoch),args.epochs):
        # Validate every 10th epoch (but not before the first one).
        # NOTE(review): `logger_test` is undefined here (see above).
        if epoch%10==0 and epoch>0:
            test(net,criterion,logger_test,val_loader,steps)
        # NOTE(review): `adjust_learning_rate` is not imported in the visible
        # source; it is expected to return the learning rate for this epoch.
        lr = adjust_learning_rate(optimizer, epoch, init_lr=args.lr, step=20, decay=0.1)
        
        end = time()
        for i, (images, labels, original) in enumerate(train_loader):
            # Time spent waiting for the loader to deliver this batch.
            batch_time.append(time()-end)
            if len(batch_time)>100:
                del batch_time[0]
            
            images = Variable(images)
            labels = Variable(labels)
            if args.gpu is not None:
                images = images.cuda()
                labels = labels.cuda()

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            t = time()
            outputs = net(images)
            # Time spent in the forward call.
            net_time.append(time()-t)
            if len(net_time)>100:
                del net_time[0]
            
            # NOTE(review): `compute_accuracy` is not imported in the visible source.
            prec1, prec5 = compute_accuracy(outputs.cpu().data, labels.cpu().data, topk=(1, 5))
            acc = prec1[0]

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Convert the loss to a plain Python float for printing/logging.
            loss = float(loss.cpu().data.numpy())

            # Console progress line every 20 steps.
            if steps%20==0:
                print(('[%2d/%2d] %5d) [batch load % 2.3fsec, net %1.2fsec], LR %.5f, Loss: % 1.3f, Accuracy % 2.2f%%' %(
                            epoch+1, args.epochs, steps, 
                            np.mean(batch_time), np.mean(net_time),
                            lr, loss,acc)))

            # Scalar and image summaries every 20 steps.
            # NOTE(review): `logger` is undefined (its creation is commented
            # out above), so this raises NameError on the first logged step.
            if steps%20==0:
                logger.scalar_summary('accuracy', acc, steps)
                logger.scalar_summary('loss', loss, steps)
                
                # Take the first sample of the batch from each entry of `original`.
                original = [im[0] for im in original]
                # presumably 9 puzzle tiles of 75x75 pixels with 3 channels -- TODO confirm
                imgs = np.zeros([9,75,75,3])
                for ti, img in enumerate(original):
                    img = img.numpy()
                    # Min-max normalise each slice of `img` and stack them on the last axis.
                    imgs[ti] = np.stack([(im-im.min())/(im.max()-im.min()) 
                                         for im in img],axis=2)
                
                logger.image_summary('input', imgs, steps)

            steps += 1

            # Save the network every 1000 steps.
            if steps%1000==0:
                filename = '%s/jps_%03i_%06d.pth.tar'%(args.checkpoint,epoch,steps)
                net.save(filename)
                # NOTE(review): prints the checkpoint folder, not `filename`.
                print('Saved: '+args.checkpoint)
            
            end = time()

        # Checked once per epoch: a 'stop.txt' file in the checkpoint folder ends training.
        if os.path.exists(args.checkpoint+'/stop.txt'):
            # break without using CTRL+C
            break

In [None]:
def test(net,criterion,logger,val_loader,steps):
    """Evaluate `net` on `val_loader` and report the mean top-1 accuracy.

    net        -> the network to evaluate; switched to eval mode for the pass
                  and back to train mode before returning
    criterion  -> unused; kept so existing callers keep working
    logger     -> optional object with `scalar_summary(tag, value, step)`;
                  pass None to skip logging
    val_loader -> iterable yielding (images, labels, _) batches
    steps      -> step counter used to tag the logged/printed result

    Reads the module-level `args.gpu` to decide whether to move inputs to CUDA.
    NOTE(review): relies on `compute_accuracy`, which is not imported in the
    visible source -- confirm where it comes from.
    """
    print('Evaluating network.......')
    accuracy = []
    net.eval()
    # Inference only: disable autograd so no graph is built during the pass.
    with torch.no_grad():
        for images, labels, _ in val_loader:
            if args.gpu is not None:
                images = images.cuda()

            # Forward pass only; bring the scores back to the CPU, where the
            # labels already are.
            outputs = net(images).cpu()

            prec1, _prec5 = compute_accuracy(outputs, labels, topk=(1, 5))
            accuracy.append(prec1[0])

    mean_accuracy = np.mean(accuracy)
    if logger is not None:
        logger.scalar_summary('accuracy', mean_accuracy, steps)
    print('TESTING: %d), Accuracy %.2f%%' %(steps,mean_accuracy))
    net.train()