In [1]:
import sys
print(sys.executable)

/home/ubuntu/anaconda3/envs/pytorch_p39/bin/python


In [2]:
import numpy as np
import skimage.io as io
import random
import os
import albumentations as A
import cv2
import pickle
import torch
from pycocotools.coco import COCO
from torch.utils.data import Dataset, DataLoader
from albumentations.pytorch import ToTensorV2
import glob

In [3]:
torch.cuda.empty_cache()

In [4]:
import random

class MSCOCODataset(Dataset):
    def __init__(self, args=None, basic_trans=None):
        self.args = args
        self.images = sorted(glob.glob(args.data_dir + '/*'))
        self.ann_file = '{}/annotations/instances_{}.json'.format(args.ann_dir, args.mode)
        self.coco = COCO(self.ann_file)
        self.basic_trans = basic_trans
        self.img_size = args.input_size
        self.center = (self.img_size/2, self.img_size/2)
        
    def __len__(self):
        return len(self.images)//4

    def __getitem__(self, idx):
        # 363 triplets, 1089 images
    
        bg = self.getImage(self.images[idx*3])
        fg = self.getImage(self.images[idx*3+1])
        mask = self.getMask(self.images[idx*3+2])
      
        ##########

        bg = self.basic_trans(image=bg)['image']
        items = self.basic_trans(image=fg, mask=mask)
        fg = items['image']
        mask = items['mask'].unsqueeze(0)
        
        return bg, fg, mask>0.5
    
    def getClassName(self, classID, cats):
        for i in range(len(cats)):
            if cats[i]['id'] == classID:
                return cats[i]['name']
        return "None"
    
    def getImage(self, file_name):
        img = cv2.imread(file_name, cv2.IMREAD_COLOR)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return img
    
    def getMask(self, file_name):
        mask = cv2.imread(file_name, cv2.IMREAD_GRAYSCALE)
        return mask

In [5]:
import torch.nn as nn
import torch.nn.functional as F
import math

class GenCompModel(nn.Module):
    def __init__(self, args):
        super(GenCompModel, self).__init__()
        self.args = args
        self.img_size = args.input_size
        self.batch_size = args.batch_size
        self.stn = STN(7) # fg+bg+mask channels
        self.colornet = LinearWithChannel(self.batch_size, self.img_size, self.img_size, 3) 
        self.refinenet = TransformNetwork(3, 3)
    
    def forward(self, fg, bg, mask):
        HI, AI, trans_mat = self.stn(fg, bg, mask)
        FI = self.colornet(HI) # changed to inputs as I
        AI = AI > 0.5
        R_in = torch.multiply(FI, AI) + torch.multiply(bg, ~AI)
        R_out = self.refinenet(R_in)
        return AI, HI, FI, trans_mat, R_in, R_out
    
class Discriminator(nn.Module):    
    def __init__(self):        
        super(Discriminator, self).__init__()
        self.imgdisc = ImageDiscriminator(3, norm='spec')
        self.segnet = TransformNetwork(3, 1)
        
    def forward(self, mask, R_out):
        img_out = self.imgdisc(R_out)
        fg_seg_out = self.segnet(torch.multiply(mask, R_out), last='sigmoid')
        bg_seg_out = self.segnet(torch.multiply(~mask, R_out), last='sigmoid')

        return img_out, fg_seg_out, bg_seg_out
    
class TransformNetwork(nn.Module):
    def __init__(self, in_ch, out_ch):        
        super(TransformNetwork, self).__init__()        
        
        self.layers = nn.Sequential(            
            ConvLayer(in_ch, 32, 9, 1),
            ConvLayer(32, 64, 3, 2),
            ConvLayer(64, 128, 3, 2),
            
            ResidualLayer(128, 128, 3, 1),
            ResidualLayer(128, 128, 3, 1),
            ResidualLayer(128, 128, 3, 1),
            ResidualLayer(128, 128, 3, 1),
            ResidualLayer(128, 128, 3, 1),
            
            DeconvLayer(128, 64, 3, 1),
            DeconvLayer(64, 32, 3, 1),
            ConvLayer(32, out_ch, 9, 1, activation='linear'))
        
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, last=None):
        x = self.layers(x)
        if last:
            x = self.sigmoid(x)
        return x
    
class LinearWithChannel(nn.Module):
    def __init__(self, batch_size, input_size, output_size, channel_size):
        super(LinearWithChannel, self).__init__()
        self.w = torch.nn.Parameter(torch.empty(channel_size))
        self.b = torch.nn.Parameter(torch.zeros(channel_size))
    
    def forward(self, x):
        return x * self.w.view(1, 3, 1, 1) + self.b.view(1, 3, 1, 1)
    
class STN(nn.Module):
    def __init__(self, in_ch):
        super(STN, self).__init__()
        self.in_ch = in_ch
        
        # localization-network for STN
        self.localization = nn.Sequential(
            ConvLayer(self.in_ch, 16, 3, 1), # 256
            ConvLayer(16, 16, 3, 1),
            ConvLayer(16, 16*2, 3, 2), # 128
            ConvLayer(16*2, 16*2, 3, 1),
            ConvLayer(16*2, 16*4, 3, 2), # 64
            ConvLayer(16*4, 16*4, 3, 1),
            ConvLayer(16*4, 16*8, 3, 2), # 32
            ConvLayer(16*8, 16*8, 3, 1), 
            ConvLayer(16*8, 16*16, 3, 2), # 16
            ConvLayer(16*16, 16*16, 3, 1), 
            ConvLayer(16*16, 16*32, 3, 2), # 8
            ConvLayer(16*32, 16*32, 3, 1), 
        )

        # [3 * 2] 크기의 아핀(affine) 행렬에 대해 예측
        self.fc_loc = nn.Sequential(
            nn.Linear(16*32*8*8, 32*8*8),
            nn.ReLU(True),
            nn.Linear(32*8*8, 8*8),
            nn.ReLU(True),
            nn.Linear(8*8, 16),
            nn.ReLU(True),
            nn.Linear(16, 2*3),
        )

        # 항등 변환(identity transformation)으로 가중치/바이어스 초기화
        self.fc_loc[6].weight.data.zero_()
        self.fc_loc[6].bias.data.copy_(torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float))

    # STN의 forward 함수
    def stn(self, fg, bg, mask):
        # x = x.type(torch.cuda.FloatTensor)
        mask = mask.float()
        inputs = torch.cat([fg, bg, mask], dim=1) #[B, (3+3+1), 256, 256]
        xs = self.localization(inputs)
        # print('xs shape:', xs.shape) #torch.Size([4, 10, 60, 60]) # [1, 128, 32, 32])
        xs = xs.view(-1, 16*32*8*8) #xs = xs.view(-1, 10 * 3 * 3)
        theta = self.fc_loc(xs)
        theta = theta.view(-1, 2, 3)

        grid = F.affine_grid(theta, fg.size(), align_corners=False)
        fg = F.grid_sample(fg, grid, align_corners=False)

        grid = F.affine_grid(theta, mask.size(), align_corners=False)
        mask = F.grid_sample(mask, grid, align_corners=False)
        
        # return fg, mask>0.5, theta
        return fg, mask, theta

    def forward(self, fg, bg, mask):
        # 입력을 변환
        fg, mask, trans_mat = self.stn(fg, bg, mask)
        return fg, mask, trans_mat
    
class ConvLayer(nn.Module):    
    def __init__(self, in_ch, out_ch, kernel_size, stride, pad='reflect', activation='leaky', normalization='batch'):        
        super(ConvLayer, self).__init__()
        
        # padding
        if pad == 'reflect':            
            self.pad = nn.ReflectionPad2d(kernel_size//2)
        elif pad == 'zero':
            self.pad = nn.ZeroPad2d(kernel_size//2)
        else:
            raise NotImplementedError("Not expected pad flag !!!")
    
            
        # convolution
        self.conv_layer = nn.Conv2d(in_ch, out_ch, 
                                    kernel_size=kernel_size,
                                    stride=stride)
        if normalization == 'spec':
            self.conv_layer = nn.utils.spectral_norm(self.conv_layer)
           
        
        # activation
        if activation == 'relu':
            self.activation = nn.ReLU()     
        elif activation == 'leaky':
            self.activation = nn.LeakyReLU(0.2)
        elif activation == 'linear':
            self.activation = lambda x : x

        else:
            raise NotImplementedError("Not expected activation flag !!!")

        # normalization 
        if normalization == 'instance':            
            self.normalization = nn.InstanceNorm2d(out_ch, affine=True)
        elif normalization == 'batch':
            self.normalization = nn.BatchNorm2d(out_ch, affine=True)
        elif normalization == 'spec':
            self.normalization = None
        else:
            raise NotImplementedError("Not expected normalization flag !!!")

    def forward(self, x):
        x = self.pad(x)
        x = self.conv_layer(x)
        if self.normalization:
            x = self.normalization(x)
        x = self.activation(x)        
        return x
    
class ResidualLayer(nn.Module):    
    def __init__(self, in_ch, out_ch, kernel_size, stride, pad='reflect', normalization='batch'):        
        super(ResidualLayer, self).__init__()
        
        self.conv1 = ConvLayer(in_ch, out_ch, kernel_size, stride, pad, 
                               activation='relu', 
                               normalization=normalization)
        
        self.conv2 = ConvLayer(out_ch, out_ch, kernel_size, stride, pad, 
                               activation='linear', 
                               normalization=normalization)
        
    def forward(self, x):
        y = self.conv1(x)
        return self.conv2(y) + x
    
class DeconvLayer(nn.Module):    
    def __init__(self, in_ch, out_ch, kernel_size, stride, pad='reflect', activation='leaky', normalization='batch', upsample='nearest'):        
        super(DeconvLayer, self).__init__()
        
        # upsample
        self.upsample = upsample
        
        # pad
        if pad == 'reflect':            
            self.pad = nn.ReflectionPad2d(kernel_size//2)
        elif pad == 'zero':
            self.pad = nn.ZeroPad2d(kernel_size//2)
        else:
            raise NotImplementedError("Not expected pad flag !!!")        
        
        # conv
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size, stride)
        
        # activation
        if activation == 'relu':
            self.activation = nn.ReLU()
        elif activation == 'leaky':
            self.activation = nn.LeakyReLU(0.2)
        elif activation == 'linear':
            self.activation = lambda x : x
        else:
            raise NotImplementedError("Not expected activation flag !!!")
        
        # normalization
        if normalization == 'instance':
            self.normalization = nn.InstanceNorm2d(out_ch, affine=True)
        elif normalization == 'batch':
            self.normalization = nn.BatchNorm2d(out_ch, affine=True)
        else:
            raise NotImplementedError("Not expected normalization flag !!!")
        
    def forward(self, x):
        x = nn.functional.interpolate(x, scale_factor=2, mode=self.upsample)        
        x = self.pad(x)
        x = self.conv(x)
        x = self.normalization(x)        
        x = self.activation(x)        
        return x
    
class ImageDiscriminator(nn.Module):
    def __init__(self, in_ch, norm='spec'):
        super(ImageDiscriminator, self).__init__()
        self.layers = nn.Sequential(
            ConvLayer(in_ch, 64, 10, 4, normalization=norm, activation='leaky'), #256->64
            ConvLayer(64, 128, 10, 4, normalization=norm, activation='leaky'), #64->16
            ConvLayer(128, 256, 10, 4, normalization=norm, activation='leaky'), #16->4
            nn.Conv2d(256, 1, 5, 1), #4->1
            nn.Sigmoid()
        )
    def forward(self, input):
        return self.layers(input)

In [6]:
import argparse

args = argparse.Namespace(
    ann_dir = '/home/ubuntu/COCOdataset2017',
    data_dir = '/home/ubuntu/GCCdataset/val_bg',
    save_model_dir = '/home/ubuntu/GCC-GAN-server/models/',
    mode = 'val', # or 'test'
    batch_size = 32,
    input_size = 256,
    # epochs = 8,
    lr = 2e-5,
    beta = 0.5,
    test_interval = 50,
    device_id = 6,#[0, 1, 2, 3]
    vis_id = 'test'
)

In [95]:
import os
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
from apex.parallel import DistributedDataParallel as DDP

device = 'cuda:'+str(args.device_id) if torch.cuda.is_available() else 'cpu'

print('Device:', device)

basic_transform = A.Compose([
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],),
        ToTensorV2(),
])
affine_transform = A.IAAAffine(scale=(0.8, 1.2), translate_percent=(0.2, 0.4), rotate=(-10, 10), shear=15, mode='constant')


num_workers = 8 # 4 * len(args.device_ids)
print('# Workers:', num_workers)

dataset = MSCOCODataset(args, basic_trans=basic_transform)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)

Device: cuda:6
# Workers: 8
loading annotations into memory...
Done (t=0.88s)
creating index...
index created!


In [10]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from torch.optim import Adam
import visdom
import os
from torchvision.utils import save_image

def denorm(tensor):
    mean = torch.Tensor([0.485, 0.456, 0.406]).reshape(-1, 1, 1)
    std = torch.Tensor([0.229, 0.224, 0.225]).reshape(-1, 1, 1)
    res = torch.clamp(tensor * std + mean, 0, 1)
    return res

def tensor2im(image_tensor, imtype=np.uint8):
    image_numpy = image_tensor.numpy() #image_tensor[0].permute(1,2,0).detach().cpu().float().numpy()
    # image_numpy = image_numpy * 0.5 - 0.5
    image_numpy = image_numpy * 255
    # image_numpy = (np.transpose(image_numpy, (1, 2, 0)) + 1) / 2.0 * 255.0
    # image_numpy = (image_numpy + 1) / 2.0 * 255.0
    image_numpy = np.clip(image_numpy, 0, 255)

    return image_numpy.astype(imtype)

# def weights_init(m):
#     if isinstance(m, nn.Conv2d):
#         nn.init.normal_(m.weight.data, 0.0, 0.02)
#     elif isinstance(m, nn.BatchNorm2d):
#         nn.init.normal_(m.weight.data, 1.0, 0.02)
#         nn.init.constant_(m.bias.data, 0)
        
# def save_checkpoint(epoch, model, optimizer, filename):
#     state = {
#         'Epoch': epoch,
#         'State_dict': model.state_dict(),
#         'optimizer': optimizer.state_dict()
#     }
#     torch.save(state, filename)
    
def load_checkpoint(filename):
    state = torch.load(filename, map_location='cpu')
    return state['State_dict'], state['optimizer']

if __name__ == '__main__':
    # Visdom display initialization
    vis = visdom.Visdom(env=args.vis_id)
    vis.close(env=args.vis_id)
    
    win_ids = []

    for i in range(args.batch_size):
        win_ids.append(vis.images(np.random.rand(7, 3, 256, 256), env=args.vis_id))
    
    net_state, opt_state = load_checkpoint(args.save_model_dir + 'netHCRS-free3.pth')
    disc_state, optD_state = load_checkpoint(args.save_model_dir + 'discHCRS-free3.pth')
    
    net = GenCompModel(args) # generator
    # net.apply(weights_init)
    net.load_state_dict(net_state)

    disc = Discriminator()
    # disc.apply(weights_init)
    disc.load_state_dict(disc_state)

    net.to(device)
    disc.to(device)
    
    net.eval()
    disc.eval()

    real_label = 1.
    fake_label = 0.

    optimizer = Adam(net.parameters(), lr=args.lr, betas=(args.beta, 0.999))
    optimizerD = Adam(disc.parameters(), lr=args.lr, betas=(args.beta, 0.999))
    
    optimizer.load_state_dict(opt_state)
    optimizerD.load_state_dict(optD_state)
        

    SMOOTH = 1e-6

Setting up a new session...


In [122]:
import time

for i, items in enumerate(dataloader):
    bg = bg.unsqueeze(0).to(device); fg = fg.unsqueeze(0).to(device); mask = mask.unsqueeze(0).to(device) #bg = items[0].to(device); fg = items[1].to(device); mask = items[2].to(device)
    # print(bg.shape, fg.shape, mask.shape)
    infer_time_total = 0
    with torch.no_grad():
        inference_start_time = time.time()
        AI, HI, FI, pred_mat, R_in, R_out = net(fg, bg, mask)
        R_img_out, R_fg_seg, R_bg_seg = disc(AI, R_out)
        temp_time = time.time() - inference_start_time
        infer_time_total += temp_time
        print(temp_time/args.batch_size)
            
    print('Inference Time Taken: %.4f sec' % (infer_time_total/i/args.batch_size)) #divide by #batch and #img in batch

    inputs = tensor2im(denorm((torch.multiply(fg, mask) + torch.multiply(bg, ~mask)).detach().cpu()).float())
    FI_out = tensor2im(denorm(FI.detach().cpu()).float())
    HI_out = tensor2im(denorm(HI.detach().cpu()).float())
    AI_out = tensor2im(AI.detach().cpu())
    R_in = tensor2im(denorm(R_in.detach().cpu()).float())
    seg_out = tensor2im((R_fg_seg+R_bg_seg).detach().cpu())
    R_out = tensor2im(denorm(R_out.detach().cpu()).float())

    AI_out = np.concatenate([AI_out, AI_out, AI_out], axis=1)
    seg_out = np.concatenate([seg_out, seg_out, seg_out], axis=1)

    total = np.concatenate([inputs, FI_out, HI_out, AI_out, R_in, seg_out, R_out], axis=1)

    for i in range(args.batch_size):
        batch = total[i, :, :, :] # (17, 256, 256)
        batch_out = np.concatenate([np.expand_dims(batch[:3, :, :], axis=0), np.expand_dims(batch[3:6, :, :], axis=0), np.expand_dims(batch[6:9, :, :], axis=0), np.expand_dims(batch[9:12, :, :], axis=0),\
                                    np.expand_dims(batch[12:15, :, :], axis=0), np.expand_dims(batch[15:18, :, :], axis=0), np.expand_dims(batch[18:21, :, :], axis=0)], axis=0)
        # batch_out = np.concatenate([batch[:, :3, :, :], batch[:, 3:6, :, :], batch[:, 6:9, :, :], batch[:, 9:10, :, :], batch[:, 10:13, :, :], batch[:, 13:14, :, :], batch[:, 14:17, :, :]], axis=0)
        vis.images(batch_out, win=win_ids[i], opts=dict(title=str(i)), env=args.vis_id)

0.0005987584590911865
Inference Time Taken: 0.0006 sec


'window_3b34e2950d3c64'

In [None]:
# Single image testing

import time

bg = cv2.imread("/home/ubuntu/RealIndoors/automatic/bg.jpeg", cv2.IMREAD_COLOR)
bg = cv2.cvtColor(bg, cv2.COLOR_BGR2RGB)
bg = cv2.resize(bg, (256, 256))
fg = cv2.imread("/home/ubuntu/RealIndoors/automatic/fg.jpeg", cv2.IMREAD_COLOR)
fg = cv2.cvtColor(fg, cv2.COLOR_BGR2RGB)
fg = cv2.resize(fg, (256, 256))
mask = cv2.imread("/home/ubuntu/RealIndoors/automatic/mask.png", cv2.IMREAD_GRAYSCALE)
mask = cv2.resize(mask, (256, 256))

bg = basic_transform(image=bg)['image']
mask = mask>0.5
items = affine_transform(image=fg, mask=mask)
items = basic_transform(image=items['image'], mask=items['mask'])
fg = items['image']
mask = items['mask'].unsqueeze(0)

bg = bg.unsqueeze(0).to(device); fg = fg.unsqueeze(0).to(device); mask = mask.unsqueeze(0).to(device)

infer_time_total = 0
with torch.no_grad():
    inference_start_time = time.time()
    AI, HI, FI, pred_mat, R_in, R_out = net(fg, bg, mask)
    R_img_out, R_fg_seg, R_bg_seg = disc(AI, R_out)
    temp_time = time.time() - inference_start_time
    infer_time_total += temp_time
    print(temp_time/args.batch_size)
        
print('Inference Time Taken: %.4f sec' % (infer_time_total/i/args.batch_size)) #divide by #batch and #img in batch


inputs = tensor2im(denorm((torch.multiply(fg, mask) + torch.multiply(bg, ~mask)).detach().cpu()).float())

FI_out = tensor2im(denorm(FI.detach().cpu()).float())
HI_out = tensor2im(denorm(HI.detach().cpu()).float())
AI_out = tensor2im(AI.detach().cpu())
R_in = tensor2im(denorm(R_in.detach().cpu()).float())
seg_out = tensor2im((R_fg_seg+R_bg_seg).detach().cpu())
R_out = tensor2im(denorm(R_out.detach().cpu()).float())

AI_out = np.concatenate([AI_out, AI_out, AI_out], axis=1)
seg_out = np.concatenate([seg_out, seg_out, seg_out], axis=1)

total = np.concatenate([inputs, FI_out, HI_out, AI_out, R_in, seg_out, R_out], axis=1)

for i in range(args.batch_size):
    batch = total[i, :, :, :] # (17, 256, 256)
    batch_out = np.concatenate([np.expand_dims(batch[:3, :, :], axis=0), np.expand_dims(batch[3:6, :, :], axis=0), np.expand_dims(batch[6:9, :, :], axis=0), np.expand_dims(batch[9:12, :, :], axis=0),\
                                np.expand_dims(batch[12:15, :, :], axis=0), np.expand_dims(batch[15:18, :, :], axis=0), np.expand_dims(batch[18:21, :, :], axis=0)], axis=0)
    vis.images(batch_out, win=win_ids[i], opts=dict(title=str(i)), env=args.vis_id)