# Torch
## Check GPU

In [1]:
# Standard library first, then third-party, then the project-local package.
import sys

import torch

# Make the parent directory importable so the local `torchlib` package resolves.
sys.path.append('..')
from torchlib.utils import list_device, set_device

# Show every visible CUDA device before one is selected in the next cell.
list_device()

the rosdep view is empty: call 'sudo rosdep init' and 'rosdep update'


------------ List Devices ------------
Device 0 :
GeForce RTX 2060
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB

Device 1 :
TITAN Xp
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB



## Set torch default parameters

In [2]:
# Select the GPU (index as reported by list_device()).
set_device(1)

# Global tensor defaults for the rest of the notebook.
torch.set_default_dtype(torch.float32)
torch.backends.cudnn.benchmark = True

# Print tensors with 4 decimals and without scientific notation.
torch.set_printoptions(precision=4, sci_mode=False)

Using Device 1 : TITAN Xp


# Set Arguments

In [3]:
import argparse
import os
import pickle
import shutil
import sys
import time


def str2bool(value):
    """Convert a command-line token to a bool.

    `argparse` with `type=bool` calls `bool("False")`, which is True for every
    non-empty string. This converter accepts the usual spellings instead.

    Args:
        value: a bool (returned unchanged) or a string such as "true"/"0"/"no".

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got {!r}'.format(value))


parser = argparse.ArgumentParser()

# ---- Training parameters ----
parser.add_argument('--batch_size', type=int, default=300, help='minibatch size')
parser.add_argument('--num_epochs', type=int, default=80, help='number of epochs')
parser.add_argument('--grad_clip', type=float, default=5., help='clip gradients at this value')
parser.add_argument('--learning_rate', type=float, default=0.1, help='learning rate')
parser.add_argument('--learning_rate_clip', type=float, default=0.0000001, help='learning rate clip')
parser.add_argument('--decay_rate', type=float, default=.75, help='learning rate decay factor')
parser.add_argument('--weight_decay', type=float, default=.0001, help='optimizer weight decay')
parser.add_argument('--batch_norm_decay', type=float, default=.999, help='batch normalization decay rate')
parser.add_argument('--keep_prob', type=float, default=1.0, help='dropout keep probability')
parser.add_argument('--lamda_weights', type=float, default=.01, help='lamda weight')
parser.add_argument('--data_argumentation', type=str2bool, default=True, help='whether do data argument')
parser.add_argument('--is_normalization', type=str2bool, default=True, help='whether do data nomalization')
parser.add_argument('--target_image_size', default=[300, 300], nargs=2, type=int, help='Input images will be resized to this for data argumentation.')
parser.add_argument('--output_dim', default=3, type=int, help='output dimention.')
parser.add_argument('--feat_dim', default=128, type=int, help='feature dimention.')

# ---- Configuration ----
parser.add_argument('--network', type=str, default='vggnet_localization')
parser.add_argument('--model_dir', type=str, default='/notebooks/global_localization/gps_net_torch', help='directory holding model checkpoints')

# nargs='+' keeps the value a list when it is overridden on the command line,
# matching the list default that later cells take len() of.
parser.add_argument('--train_dataset', type=str, nargs='+',
                    default=['/notebooks/michigan_nn_data/2012_01_08',
                             '/notebooks/michigan_nn_data/2012_01_15',
                             '/notebooks/michigan_nn_data/2012_01_22',
                             '/notebooks/michigan_nn_data/2012_02_02',
                             '/notebooks/michigan_nn_data/2012_02_04',
                             '/notebooks/michigan_nn_data/2012_02_05',
                             '/notebooks/michigan_nn_data/2012_03_31',
                             '/notebooks/michigan_nn_data/2012_09_28'])
# Smaller alternatives kept for quick experiments:
#parser.add_argument('--train_dataset', type=str, default = ['/notebooks/michigan_nn_data/test'])
#parser.add_argument('--train_dataset', type=str, default = ['/notebooks/michigan_nn_data/2012_01_08'])

# nargs=1 keeps this a one-element list (later cells unpack it with `*args.norm_tensor`).
parser.add_argument('--norm_tensor', type=str, nargs=1, default=['/notebooks/global_localization/norm_mean_std.pt'])

parser.add_argument('--seed', default=1337, type=int)
parser.add_argument('--save_every', type=int, default=1000, help='save frequency')
parser.add_argument('--display', type=int, default=10, help='display frequency')
parser.add_argument('--tensorboard', type=str2bool, default=True, help='open tensorboard')

# Parse an explicit empty argument list: inside a notebook sys.argv belongs to
# the kernel launcher, and overwriting it is an unnecessary global side effect.
args = parser.parse_args([])

writer = None
if args.tensorboard:
    # Start every run from a clean log directory.
    shutil.rmtree('runs/gps', ignore_errors=True)
    try:
        from torch.utils.tensorboard import SummaryWriter
    except ImportError as err:
        # Fall back to running without logging instead of aborting the notebook.
        print('TensorBoard unavailable, logging disabled:', err)
        args.tensorboard = False
    else:
        writer = SummaryWriter('runs/gps')

# Load Dataset

In [4]:
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import tf.transformations as tf_tran
from tqdm import tqdm
from PIL import Image
import numpy as np
import random

#import gpflow.multioutput.kernels as mk
import gpytorch

import torch.nn as nn
import torch.optim as optim
from torchlib import resnet, vggnet
from torchlib.cnn_auxiliary import normalize, denormalize_navie, denormalize, get_relative_pose, translational_rotational_loss
from torchlib.utils import LocalizationDataset, display_loss, data2tensorboard
import time

# Images are only converted to tensors here; no further augmentation is applied.
transform = transforms.Compose([transforms.ToTensor()])
dataset = LocalizationDataset(
    dataset_dirs=args.train_dataset,
    image_size=args.target_image_size,
    transform=transform,
    get_pair=False,
)

# With 7 or fewer dataset directories the cached statistics are reused;
# otherwise they are recomputed from the loaded dataset and written back.
if len(args.train_dataset) <= 7:
    args.norm_mean, args.norm_std = torch.load(*args.norm_tensor)
    print('Load norm and std:',*args.norm_tensor)
else:
    args.norm_mean, args.norm_std = (torch.tensor(stat) for stat in dataset.get_norm())
    torch.save([args.norm_mean, args.norm_std], *args.norm_tensor)
    print('Save norm and std:',*args.norm_tensor)


# drop_last=True keeps every batch at exactly args.batch_size samples.
dataloader = DataLoader(
    dataset,
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=0,
    drop_last=True,
    pin_memory=True,
)

100%|██████████| 16446/16446 [00:22<00:00, 725.48it/s]
100%|██████████| 22584/22584 [00:31<00:00, 716.43it/s]
100%|██████████| 18655/18655 [00:25<00:00, 726.75it/s]
100%|██████████| 17310/17310 [00:23<00:00, 723.50it/s]
100%|██████████| 10766/10766 [00:14<00:00, 731.28it/s]
100%|██████████| 14878/14878 [00:22<00:00, 664.90it/s]
100%|██████████| 13452/13452 [00:20<00:00, 645.97it/s]
100%|██████████| 14037/14037 [00:26<00:00, 539.73it/s]


Save norm and std: /notebooks/global_localization/norm_mean_std.pt


# Define Model

In [5]:
class Backbone(nn.Module):
    """Feature extractor wrapping the project's pretrained `resnet.resnet50`."""

    def __init__(self):
        super().__init__()
        self.resnet = resnet.resnet50(pretrained=True)

    def forward(self, input_data):
        """Return the ResNet output for `input_data`."""
        return self.resnet(input_data)
    
class NN(nn.Module):
    """Context network followed by the pose regressor (both from `vggnet`)."""

    def __init__(self):
        super().__init__()
        self.global_context = vggnet.vggnet(input_channel=2048, opt="context")
        self.global_regressor = vggnet.vggnet(opt="regressor")

    def forward(self, input_data):
        """Run context then regressor.

        Returns:
            The regressor's 3-tuple `(output, feature_t, feature_r)`.
        """
        output, feature_t, feature_r = self.global_regressor(self.global_context(input_data))
        return output, feature_t, feature_r

class GP(gpytorch.models.ApproximateGP):
    """Variational GP with one independent output task per batch entry.

    Args:
        inducing_points: initial inducing locations; the second-to-last
            dimension is taken as the number of inducing points.
        output_dim: number of output tasks; also used as the batch shape of the
            variational distribution, mean and kernels.
    """

    def __init__(self, inducing_points, output_dim=3):
        task_batch = torch.Size([output_dim])
        num_inducing = inducing_points.size(-2)

        q_u = gpytorch.variational.CholeskyVariationalDistribution(
            num_inducing, batch_shape=task_batch
        )
        # The base strategy needs a reference to the model, so `self` is handed
        # over before super().__init__() runs.
        base_strategy = gpytorch.variational.VariationalStrategy(
            self, inducing_points, q_u, learn_inducing_locations=True
        )
        multitask_strategy = gpytorch.variational.MultitaskVariationalStrategy(
            base_strategy, num_tasks=output_dim
        )
        super().__init__(multitask_strategy)

        self.mean_module = gpytorch.means.ConstantMean(batch_shape=task_batch)
        rbf = gpytorch.kernels.RBFKernel(batch_shape=task_batch)
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf, batch_shape=task_batch)

    def forward(self, x):
        """Return the prior `MultivariateNormal` built from the mean and kernel at `x`."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )

class GPNode(nn.Module):
    """One ensemble member: a `GP` on a fixed random subset of feature columns.

    Args:
        inducing_points: tensor whose first dimension is the number of output
            tasks and whose last dimension is the number of feature columns
            this node consumes.
        seed: seed used to draw the feature-column subset.

    Attributes set here:
        feat_index: indices into the `args.feat_dim` feature columns. It is a
            plain tensor attribute (not a parameter or registered buffer), so
            it is reproduced from `seed` rather than restored from a checkpoint.
        gp: the variational GP.
        likelihood: multitask Gaussian likelihood with one task per output.
    """
    def __init__(self,inducing_points, seed):
        super().__init__()
        output_dim = inducing_points.shape[0]
        sub_feat_dim = inducing_points.shape[-1]
        # Seeds the global torch RNG so the same `seed` always selects the same columns.
        torch.manual_seed(seed)

        #self.feat_index = torch.randint(high=args.feat_dim, size=(sub_feat_dim,))
        # Sampling without replacement: first `sub_feat_dim` entries of a permutation.
        self.feat_index = torch.randperm(args.feat_dim)[:sub_feat_dim]
        # Forward output_dim explicitly; previously GP silently fell back to its
        # default of 3 and disagreed with the likelihood for any other task count.
        self.gp = GP(inducing_points, output_dim=output_dim)
        self.likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=output_dim)

    def forward(self,input_data):
        """Return the GP output distribution for `input_data`."""
        output = self.gp(input_data)
        return output
    
class Model(nn.Module):
    """Backbone + regressor network with an ensemble of `GPNode` heads.

    Args:
        gp_args: dict with keys 'num_gp' (number of GP heads) and 'feat_rate'
            (fraction of `args.feat_dim` feature columns given to each head).
    """

    def __init__(self, gp_args):
        super().__init__()
        self.backbone = Backbone()
        self.nn = NN()
        self.gps = nn.ModuleList()

        self.num_gp = gp_args['num_gp']
        self.sub_feat_rate = gp_args['feat_rate']
        self.sub_feat_dim = int(args.feat_dim * self.sub_feat_rate)

        # One head per index; the index doubles as the seed so each head keeps
        # a fixed feature subset across runs.
        for head_seed in range(self.num_gp):
            initial_inducing = torch.zeros(args.output_dim, args.batch_size, self.sub_feat_dim)
            self.gps.append(GPNode(initial_inducing, seed=head_seed))

    def forward_nn(self, input_data):
        """Run backbone and regressor.

        Returns:
            `(feature_t, rot_pred)` where `rot_pred` is the last 4 columns of
            the regressor output (split as [3, 4] along dim 1).
        """
        output, feature_t, feature_r = self.nn(self.backbone(input_data))
        _, rot_pred = torch.split(output, [3, 4], dim=1)
        return feature_t, rot_pred

    def forward_gp(self, gp, trans_feat):
        """Apply `gp` to the feature columns selected by `gp.feat_index`."""
        return gp(trans_feat[:, gp.feat_index])

In [6]:
class Trainer:
    """Owns the `Model`, its optimizer and LR scheduler, and the train/eval steps.

    Relies on notebook-level globals: `args` (parsed options plus `norm_mean`
    and `norm_std`) and, inside `_gp_loss`, `dataloader` and `dataset`.
    """

    def __init__(self,gp_args,regressor_context_rate = (0.0,0.0)):
        """Build the model on the GPU and set up Adam with a decaying LR.

        Args:
            gp_args: forwarded to `Model`.
            regressor_context_rate: pair of learning-rate multipliers for the
                regressor and context sub-networks; 0 freezes that sub-network.
        """
        self.model = Model(gp_args).cuda()
        self.norm_mean = args.norm_mean.cuda()
        self.norm_std = args.norm_std.cuda()

        # disable learning backbone
        for param in self.model.backbone.parameters():
            param.requires_grad = False

        # training tool
        self.optimizer = optim.Adam(self._optimize(regressor_context_rate))
        # Each scheduler.step() multiplies the base LR by another factor of decay_rate.
        self.scheduler = optim.lr_scheduler.LambdaLR(optimizer=self.optimizer,
                                                         lr_lambda=lambda epoch: args.decay_rate**epoch)

    def load_model(self, file_name = 'pretrained.pth'):
        """Load weights from `args.model_dir/file_name` and print the structure.

        Checkpoints containing the key 'net.resnet.conv1.weight' are treated as
        the old layout and loaded into the backbone and nn sub-modules
        separately; anything else is loaded into the whole model.
        """
        # load file info
        state_dict = torch.load(os.path.join(args.model_dir, file_name))
        if 'net.resnet.conv1.weight' in state_dict:
            print('Transform from old model.')
            # Part 1: backbone
            backbone_state_dict = self._from_old_model(state_dict,'backbone')
            print('Backbone parameters layer:',len(backbone_state_dict.keys()))
            self.model.backbone.load_state_dict(backbone_state_dict,strict = True)
            # Part 2: nn (the conversion no longer mutates its input, so the
            # checkpoint does not have to be read from disk a second time)
            nn_state_dict = self._from_old_model(state_dict,'nn')
            print('NN parameters layer:',len(nn_state_dict.keys()))
            self.model.nn.load_state_dict(nn_state_dict,strict = True)
        else:
            print('Parameters layer:',len(state_dict.keys()))
            # load file to model
            self.model.load_state_dict(state_dict,strict = True)
        print('Model Structure:')
        # Display model structure
        for name, param in self.model.named_parameters():
            print(name, param.shape)
        print('Parameters layer:',len(self.model.state_dict().keys()))

    def _from_old_model(self, state_dict, select = 'backbone'):
        """Extract and rename the entries of an old-layout checkpoint.

        Args:
            state_dict: mapping of old parameter names to values; not modified.
            select: 'backbone' keeps keys containing 'net.resnet.'; 'nn' keeps
                keys containing 'net.global_regressor.' or 'net.global_context.'.
                In both cases the leading 'net.' of that fragment is dropped.

        Returns:
            A new dict with the renamed entries. For any other `select` the
            input mapping is returned unchanged.
        """
        if select == 'backbone':
            renames = (('net.resnet.', 'resnet.'),)
        elif select == 'nn':
            renames = (('net.global_regressor.', 'global_regressor.'),
                       ('net.global_context.', 'global_context.'))
        else:
            return state_dict

        converted = {}
        for key, value in state_dict.items():
            for old_fragment, new_fragment in renames:
                if old_fragment in key:
                    converted[key.replace(old_fragment, new_fragment)] = value
                    break
        return converted

    def save_model(self, file_name = 'model-{}-{}.pth'):
        """Save the model's state dict to `args.model_dir/file_name`."""
        checkpoint_path = os.path.join(args.model_dir, file_name)
        torch.save(self.model.state_dict(),checkpoint_path)
        print('Saving model to ' +  file_name)

    def _optimize(self,regressor_context_rate = (0.0,0.0)):
        """Build the optimizer parameter groups.

        The GP heads always train at `args.learning_rate`. The regressor and
        context networks are added with a scaled learning rate when their
        multiplier is non-zero, and frozen otherwise.

        Returns:
            List of parameter-group dicts for `optim.Adam`.
        """
        optimizer = [
                {'params': self.model.gps.parameters(), \
                 'lr': args.learning_rate,'weight_decay':args.weight_decay}]

        if regressor_context_rate[0]!=0:
            optimizer += [{'params': self.model.nn.global_regressor.parameters(), \
                 'lr': args.learning_rate * regressor_context_rate[0],'weight_decay':args.weight_decay}]
            print('Regressor learn rate:',regressor_context_rate[0])
        else:
            for param in self.model.nn.global_regressor.parameters():
                param.requires_grad = False

        if regressor_context_rate[1]!=0:
            optimizer += [{'params': self.model.nn.global_context.parameters(), \
                 'lr': args.learning_rate * regressor_context_rate[1],'weight_decay':args.weight_decay}]
            print('Context learn rate:',regressor_context_rate[1])
        else:
            for param in self.model.nn.global_context.parameters():
                param.requires_grad = False

        return optimizer

    def train(self,x,y):
        """Run one optimization step on a batch.

        Args:
            x: input batch.
            y: target batch, split along dim 1 into 3 translation and 4
                rotation columns.

        Returns:
            `(total_loss, batch_time)`: the loss as a float and the seconds
            spent on the forward pass and loss (the timer stops before backward).
        """
        # Step 0: zero grad
        self.optimizer.zero_grad()

        start = time.time()
        # Step 1: get data
        x,y = x.cuda(),y.cuda()
        if args.is_normalization:
            y = normalize(y,self.norm_mean, self.norm_std)

        # Step 2: training
        assert self.model.training

        trans_loss = torch.tensor(0.).cuda()

        trans_target, rot_target = torch.split(y, [3, 4], dim=1)
        trans_feat, rot_pred = self.model.forward_nn(x)
        rot_loss = self._nn_loss(rot_pred,rot_target)
        # Size the bootstrap from the batch actually received, so a batch that
        # is shorter than args.batch_size cannot produce out-of-range indices.
        num_rows = trans_target.shape[0]
        for gp in self.model.gps:
            # Each head sees its own resample (with replacement) of the batch.
            sampled_mask = torch.randint(high=num_rows, size=(num_rows,))
            sub_x = trans_feat[sampled_mask]
            sub_y = trans_target[sampled_mask]
            trans_loss += self._gp_loss(gp,sub_x,sub_y)
        trans_loss = trans_loss/self.model.num_gp

        total_loss = trans_loss + args.lamda_weights * rot_loss

        batch_time = time.time() - start

        #Step 3: update
        total_loss.backward()
        self.optimizer.step()

        return float(total_loss), batch_time

    def _nn_loss(self,rot_pred,rot_target):
        """Rotation loss: 1 - mean over the batch of the squared row-wise dot product."""
        rot_loss = 1. - torch.mean(torch.square(torch.sum(torch.mul(rot_pred,rot_target),dim=1)))
        return rot_loss

    def _gp_loss(self,gp,trans_feat,trans_target):
        """Negative predictive log-likelihood of one GP head on the given rows."""
        # predict
        trans_pred = self.model.forward_gp(gp,trans_feat)

        # Samples seen per epoch, capped at the dataset size.
        num_data = min(len(dataloader)*args.batch_size,len(dataset))
        mll = gpytorch.mlls.PredictiveLogLikelihood(gp.likelihood, gp.gp, num_data = num_data)

        # trans loss
        trans_loss = -1.*mll(trans_pred, trans_target)

        return trans_loss

    def _eval_gp(self, gp, trans_pred):
        """Return `(y_mean, c_mean, c_var)` for one head.

        `c_mean` / `c_var` are the mean and variance of `trans_pred`; `y_mean`
        is the mean after passing `trans_pred` through the head's likelihood.
        """
        c_mean, c_var = trans_pred.mean, trans_pred.variance
        # Evaluate the likelihood once; only its mean is used.
        y_mean = gp.likelihood(trans_pred).mean

        return y_mean, c_mean, c_var

    def _sample(self, mean, var, num_sample = 100):
        """Draw `num_sample` samples from independent normals.

        Args:
            mean: tensor of means.
            var: tensor of variances (same shape as `mean`).
            num_sample: number of draws.

        Returns:
            Tensor of shape `(num_sample, *mean.shape)`.
        """
        # Normal takes a standard deviation as its scale, hence the sqrt.
        dist = torch.distributions.Normal(mean, var.sqrt())
        samples = dist.sample([num_sample])
        return samples

    def eval_forward(self,x,y,num_sample = 100,output_denormalize = True):
        """Evaluate a batch with the model in eval mode.

        Predictions, means and variances are averaged over all GP heads.

        Returns:
            `(trans_preds, rot_pred, trans_target, rot_target, samples)`.
        """
        # Step 1: get data
        x,y = x.cuda(),y.cuda()
        if args.is_normalization:
            y = normalize(y,self.norm_mean, self.norm_std)

        # Step 2: forward
        assert not self.model.training
        trans_feat, rot_pred = self.model.forward_nn(x)

        trans_preds = 0
        trans_means = 0
        trans_vars = 0
        for gp in self.model.gps:
            trans_pred = self.model.forward_gp(gp,trans_feat)
            trans_pred, trans_mean, trans_var = self._eval_gp(gp, trans_pred)
            trans_preds += trans_pred
            trans_means += trans_mean
            trans_vars += trans_var

        trans_preds /= self.model.num_gp
        trans_means /= self.model.num_gp
        trans_vars /= self.model.num_gp

        if args.is_normalization and output_denormalize:
            trans_preds = denormalize_navie(trans_preds, self.norm_mean, self.norm_std)
            trans_means = denormalize_navie(trans_means, self.norm_mean, self.norm_std)
            # A variance rescales with the square of the scale factor.
            # NOTE(review): assumes normalize() divides by norm_std — confirm in torchlib.cnn_auxiliary.
            trans_vars = trans_vars.mul(self.norm_std ** 2)
            y = denormalize(y, self.norm_mean, self.norm_std)

        samples = self._sample(trans_means, trans_vars, num_sample)

        # Step 3: split output
        trans_target, rot_target = torch.split(y, [3, 4], dim=1)

        return trans_preds, rot_pred, trans_target, rot_target, samples

# Ensemble configuration: 100 GP heads, each given two thirds of the feature columns.
num_gp = 100
gp_args = dict(feat_rate=1-1/3, num_gp=num_gp)

# Both multipliers at zero: the regressor and context networks stay frozen.
#trainer = Trainer(gp_args,regressor_context_rate = [0.1,0.01])
trainer = Trainer(gp_args, regressor_context_rate=[0., 0.])

# Start from the old-layout pretrained checkpoint.
#trainer.load_model('model-2-1000.pth')
trainer.load_model('pretrained_old.pth')

Regressor learn rate: 0.1
Parameters layer: 1346
Model Structure:
backbone.resnet.conv1.weight torch.Size([64, 1, 7, 7])
backbone.resnet.bn1.weight torch.Size([64])
backbone.resnet.bn1.bias torch.Size([64])
backbone.resnet.layer1.0.conv1.weight torch.Size([64, 64, 1, 1])
backbone.resnet.layer1.0.bn1.weight torch.Size([64])
backbone.resnet.layer1.0.bn1.bias torch.Size([64])
backbone.resnet.layer1.0.conv2.weight torch.Size([64, 64, 3, 3])
backbone.resnet.layer1.0.bn2.weight torch.Size([64])
backbone.resnet.layer1.0.bn2.bias torch.Size([64])
backbone.resnet.layer1.0.conv3.weight torch.Size([256, 64, 1, 1])
backbone.resnet.layer1.0.bn3.weight torch.Size([256])
backbone.resnet.layer1.0.bn3.bias torch.Size([256])
backbone.resnet.layer1.0.downsample.0.weight torch.Size([256, 64, 1, 1])
backbone.resnet.layer1.0.downsample.1.weight torch.Size([256])
backbone.resnet.layer1.0.downsample.1.bias torch.Size([256])
backbone.resnet.layer1.1.conv1.weight torch.Size([64, 256, 1, 1])
backbone.resnet.laye

In [7]:
# List only the parameters the optimizer is allowed to update.
for name, param in trainer.model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.shape)

nn.global_regressor.regressor.fc1_trans.0.weight torch.Size([4096, 6400])
nn.global_regressor.regressor.fc1_trans.0.bias torch.Size([4096])
nn.global_regressor.regressor.fc2_trans.0.weight torch.Size([4096, 4096])
nn.global_regressor.regressor.fc2_trans.0.bias torch.Size([4096])
nn.global_regressor.regressor.fc3_trans.0.weight torch.Size([128, 4096])
nn.global_regressor.regressor.fc3_trans.0.bias torch.Size([128])
nn.global_regressor.regressor.logits_t.weight torch.Size([3, 128])
nn.global_regressor.regressor.logits_t.bias torch.Size([3])
nn.global_regressor.regressor.fc1_rot.0.weight torch.Size([4096, 6400])
nn.global_regressor.regressor.fc1_rot.0.bias torch.Size([4096])
nn.global_regressor.regressor.fc2_rot.0.weight torch.Size([4096, 4096])
nn.global_regressor.regressor.fc2_rot.0.bias torch.Size([4096])
nn.global_regressor.regressor.fc3_rot.0.weight torch.Size([128, 4096])
nn.global_regressor.regressor.fc3_rot.0.bias torch.Size([128])
nn.global_regressor.regressor.logits_r.weight tor

gps.34.gp.covar_module.raw_outputscale torch.Size([1])
gps.34.gp.covar_module.base_kernel.raw_lengthscale torch.Size([1, 1, 1])
gps.34.likelihood.raw_noise torch.Size([1])
gps.34.likelihood.noise_covar.raw_noise torch.Size([3])
gps.35.gp.variational_strategy.base_variational_strategy.inducing_points torch.Size([3, 300, 85])
gps.35.gp.variational_strategy.base_variational_strategy._variational_distribution.variational_mean torch.Size([3, 300])
gps.35.gp.variational_strategy.base_variational_strategy._variational_distribution.chol_variational_covar torch.Size([3, 300, 300])
gps.35.gp.mean_module.constant torch.Size([1, 1])
gps.35.gp.covar_module.raw_outputscale torch.Size([1])
gps.35.gp.covar_module.base_kernel.raw_lengthscale torch.Size([1, 1, 1])
gps.35.likelihood.raw_noise torch.Size([1])
gps.35.likelihood.noise_covar.raw_noise torch.Size([3])
gps.36.gp.variational_strategy.base_variational_strategy.inducing_points torch.Size([3, 300, 85])
gps.36.gp.variational_strategy.base_variation

# Training

## Training Epoch

In [8]:
trainer.model.train()
steps_per_epoch = len(dataloader)
total_steps = args.num_epochs * steps_per_epoch
for e in range(args.num_epochs):
#for e in range(1):
    # NOTE(review): reseeding with the same value at the start of every epoch
    # restarts the global RNG each time — confirm this is intended.
    torch.manual_seed(args.seed)
    train_loss = 0.
    for b, data in enumerate(dataloader, 0):
        x,y = data.values()

        single_loss, batch_time = trainer.train(x,y)

        with torch.no_grad():
            # 1-based count of batches processed since the start of training.
            global_step = e * steps_per_epoch + (b+1)
            train_loss += single_loss
            running_mean = train_loss/(b+1)

            if args.tensorboard:
                data2tensorboard(writer,single_loss,running_mean,global_step)
            if (b+1) % args.display == 0:
                display_loss(global_step,total_steps,e,
                             running_mean,batch_time,trainer.scheduler.get_last_lr()[0])
            if global_step % args.save_every == 0:
                trainer.save_model('model-{}-{}.pth'.format(e, global_step))
            # Decay the learning rate every 150 steps until it reaches the floor.
            if trainer.scheduler.get_last_lr()[0] > args.learning_rate_clip and global_step % 150 == 0:
                trainer.scheduler.step()

10/34160 (epoch 0), train_loss = -7.42033644, time/batch = 2.315, learning rate = 0.00130000
20/34160 (epoch 0), train_loss = -7.58430595, time/batch = 2.318, learning rate = 0.00130000
30/34160 (epoch 0), train_loss = -7.66486074, time/batch = 2.330, learning rate = 0.00130000
40/34160 (epoch 0), train_loss = -7.70595225, time/batch = 2.322, learning rate = 0.00130000
50/34160 (epoch 0), train_loss = -7.73845427, time/batch = 2.319, learning rate = 0.00130000
60/34160 (epoch 0), train_loss = -7.76018190, time/batch = 2.326, learning rate = 0.00130000
70/34160 (epoch 0), train_loss = -7.77607707, time/batch = 2.330, learning rate = 0.00130000
80/34160 (epoch 0), train_loss = -7.78149038, time/batch = 2.336, learning rate = 0.00130000
90/34160 (epoch 0), train_loss = -7.79083312, time/batch = 2.329, learning rate = 0.00130000
100/34160 (epoch 0), train_loss = -7.79813015, time/batch = 2.331, learning rate = 0.00130000
110/34160 (epoch 0), train_loss = -7.80500672, time/batch = 2.318, le

904/34160 (epoch 2), train_loss = -8.06164865, time/batch = 2.326, learning rate = 0.00023137
914/34160 (epoch 2), train_loss = -8.06277947, time/batch = 2.325, learning rate = 0.00023137
924/34160 (epoch 2), train_loss = -8.06249894, time/batch = 2.329, learning rate = 0.00023137
934/34160 (epoch 2), train_loss = -8.06334822, time/batch = 2.338, learning rate = 0.00023137
944/34160 (epoch 2), train_loss = -8.06351674, time/batch = 2.328, learning rate = 0.00023137
954/34160 (epoch 2), train_loss = -8.06284156, time/batch = 2.320, learning rate = 0.00023137
964/34160 (epoch 2), train_loss = -8.06260314, time/batch = 2.331, learning rate = 0.00023137
974/34160 (epoch 2), train_loss = -8.06315239, time/batch = 2.317, learning rate = 0.00023137
984/34160 (epoch 2), train_loss = -8.06265717, time/batch = 2.320, learning rate = 0.00023137
994/34160 (epoch 2), train_loss = -8.06331020, time/batch = 2.368, learning rate = 0.00023137
Saving model to model-2-1000.pth
1004/34160 (epoch 2), train

KeyboardInterrupt: 