In [18]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
import os
import json
import random
import numpy as np
import pandas as pd
import pickle
import sys
import torchinfo

import torch 
from torch import nn 
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import itertools
import random
import copy
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
import cv2
import json
from glob import glob
from sklearn.model_selection import train_test_split
from functools import partial
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from itertools import product
import matplotlib.pyplot as plt
from sklearn import metrics
from tabulate import tabulate
import math
import logging
from datetime import datetime
from sklearn.metrics import accuracy_score
import argparse

In [21]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def classname_id(class_name_list):
    """
    Build the two lookup tables between integer ids and class names.
    Args:
        class_name_list: sized sequence of class names; the position of a
            name in the sequence is its id
    Return:
        (id2classname, classname2id): dict mapping id -> name and the
        inverse dict mapping name -> id (for a repeated name the highest
        id is kept)
    """
    ids = range(len(class_name_list))
    id2classname = dict(zip(ids, class_name_list))

    classname2id = {}
    for class_id, class_name in id2classname.items():
        classname2id[class_name] = class_id

    return id2classname, classname2id

def trunc(latent, mean_size, truncation):  # Truncation trick on Z
    """
    Apply the truncation trick to a batch of latent vectors, in place.

    A reference mean is estimated from `mean_size` standard-normal samples
    and every latent row is pulled towards it:
        latent <- mean + truncation * (latent - mean)

    Args:
        latent: tensor: (batch, ...): latent vectors; overwritten in place
        mean_size: number of N(0, 1) samples used to estimate the mean
        truncation: interpolation factor; 1 leaves `latent` unchanged,
            0 collapses every row onto the estimated mean
    Return:
        latent: the same tensor object, after truncation
    """
    # Sample with numpy so np.random.seed(...) still controls the draw, then
    # match the dtype/device of `latent` rather than relying on the
    # `Variable` / `FloatTensor` globals, which are not defined at this point.
    noise = np.random.normal(0, 1, (mean_size, *latent.shape[1:]))
    mean = torch.as_tensor(noise, dtype=latent.dtype, device=latent.device).mean(0, keepdim=True)

    # The (1, ...) mean broadcasts over the batch axis; slice assignment
    # keeps the in-place contract of the original per-row loop.
    latent[:] = mean + truncation * (latent - mean)

    return latent

In [22]:
import torch

In [23]:
# Settings read by the sample-generation cell further down in this notebook.
config = {
    "batch_size":10,  # samples drawn per class in each pass of the generation loop
    "latent_dim":512,  # width of each latent vector z; also passed to Generator
    "mlp_dim":8,  # forwarded to Generator as mlp_dim
    "n_classes":120,  # class ids are 0 .. n_classes-1; also passed to Generator
    "label":-1,  # -1 generates every class, otherwise only this class id
    "t_size":64,  # forwarded to Generator
    "v_size":25,  # not read by any cell shown here
    "channels":3,  # forwarded to Generator
    "dataset":"ntu",  # forwarded to Generator; 'ntu' also adds a trailing axis to the saved samples
    "model":"/content/model_saves/generator_ntu120_xsub_mlp8_2150000.pth",  # state dict loaded into the generator
    "stochastic":False,  # True: reuse one latent point for every generated sample
    "stochastic_file":"-",  # '-' samples the latent randomly, otherwise path of a .npy file of latents
    "stochastic_index":0,  # row of stochastic_file to use
    "gen_qtd":10,  # generation continues until every class has at least this many samples
    "trunc":0.95,  # truncation factor
    "trunc_mode":'w',  # 'z': trunc() is applied to z; 'w': the factor is passed to the generator call
    "mean_size":1000  # number of samples trunc() uses to estimate the mean
}

# Same device selection as in the earlier cell, re-evaluated here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [244]:
import torch 
import torch.nn as nn
from torch.nn import functional as F
import numpy as np




class BasicBlock(nn.Module):
    """
    Basic block is composed of 2 CNN layers with residual connection.
    Each CNN layer is followed by batchnorm layer and swish activation 
    function. The residual branch is always a projection (conv + batchnorm),
    so the block also works when in_channel != out_channel or k != 1.
    Args:
        in_channel: number of input channels
        out_channel: number of output channels
        k: (default = 1) kernel size of the first conv and of the shortcut
            conv; no padding is used, so k > 1 shrinks the spatial size
    """
    def __init__(self, in_channel, out_channel, k=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(
            in_channel,
            out_channel,
            kernel_size=k,
            padding=(0, 0),
            stride=(1, 1))
        self.bn1 = nn.BatchNorm2d(out_channel)

        self.conv2 = nn.Conv2d(
            out_channel,
            out_channel,
            kernel_size=1,
            padding=(0, 0),
            stride=(1, 1))
        self.bn2 = nn.BatchNorm2d(out_channel)

        # Projection shortcut, applied unconditionally (not only when the
        # channel counts differ) so that it matches the main branch's shape.
        self.shortcut = nn.Sequential()
        self.shortcut.add_module(
            'conv',
            nn.Conv2d(
                in_channel,
                out_channel,
                kernel_size=k,
                padding=(0,0),
                stride=(1,1)))
        self.shortcut.add_module('bn', nn.BatchNorm2d(out_channel))

    def swish(self,x):
        """
        We use swish in spatio-temporal encoding/decoding. We tried with 
        other activation functions such as ReLU and LeakyReLU. But we 
        achieved the best performance with swish activation function.
        Args:
            X: tensor: (batch_size, ...)
        Return:
            _: tensor: (batch, ...): applies swish 
            activation to input tensor and returns  
        """
        return x*torch.sigmoid(x)

    def forward(self, x):
        """
        Args:
            x: tensor: (batch, in_channel, H, W)
        Return:
            y: tensor: (batch, out_channel, H - k + 1, W - k + 1)
        """
        # bn1/bn2 were registered but skipped before; apply them as the class
        # docstring states and as BasicBlockTranspose.forward does.
        y = self.swish(self.bn1(self.conv1(x)))
        y = self.swish(self.bn2(self.conv2(y)))
        y = y + self.shortcut(x)
        y = self.swish(y)
        return y


class BasicBlockTranspose(nn.Module):
    """
    Transposed-convolution counterpart of BasicBlock: 2 transposed CNN
    layers with a projection residual connection. Each layer is followed by
    a batchnorm layer and swish activation function. With stride 1 and no
    padding, a kernel of size k grows the matching spatial axis by k - 1.
    Args:
        in_channel: number of input channels
        out_channel: number of output channels
        k: (default = (1,1)) kernel size of the first layer and of the
            shortcut, as an int or a (height, width) pair
    """
    def __init__(self, in_channel, out_channel, k=(1,1)):
        super(BasicBlockTranspose, self).__init__()
        self.stride = (1, 1)
        self.padding = (0, 0)
        # get_h_out / get_w_out index self.k, so an int kernel size is
        # expanded to a (height, width) pair up front.
        if isinstance(k, int):
            k = (k, k)
        self.k = tuple(k)
        self.conv1 = nn.ConvTranspose2d(
            in_channel,
            out_channel,
            kernel_size=self.k,
            padding=self.padding,
            stride=self.stride)
        self.bn1 = nn.BatchNorm2d(out_channel)

        self.conv2 = nn.ConvTranspose2d(
            out_channel,
            out_channel,
            kernel_size=1,
            padding=self.padding,
            stride=self.stride)
        self.bn2 = nn.BatchNorm2d(out_channel)

        # Projection shortcut, applied unconditionally so that its output
        # has the same (enlarged) spatial size as the main branch.
        self.shortcut = nn.Sequential()
        self.shortcut.add_module(
            'conv',
            nn.ConvTranspose2d(
                in_channel,
                out_channel,
                kernel_size=self.k,
                padding=self.padding,
                stride=self.stride))
        self.shortcut.add_module('bn', nn.BatchNorm2d(out_channel))

    def get_h_out(self,h_in):
        """Height produced by this block for an input of height h_in."""
        return (h_in - 1)*self.stride[0]-2*self.padding[0]+(self.k[0]-1)+1
    def get_w_out(self,w_in):
        """Width produced by this block for an input of width w_in."""
        return (w_in - 1)*self.stride[1]-2*self.padding[1]+(self.k[1]-1)+1

    def swish(self,x):
        """
        We use swish in spatio-temporal encoding/decoding. We tried with 
        other activation functions such as ReLU and LeakyReLU. But we 
        achieved the best performance with swish activation function.
        Args:
            X: tensor: (batch_size, ...)
        Return:
            _: tensor: (batch, ...): applies swish 
            activation to input tensor and returns  
        """
        return x*torch.sigmoid(x)

    def forward(self, x):
        """
        Args:
            x: tensor: (batch, in_channel, H, W)
        Return:
            y: tensor: (batch, out_channel, get_h_out(H), get_w_out(W))
        """
        y = self.swish(self.bn1(self.conv1(x)))
        y = self.swish(self.bn2(self.conv2(y)))
        y = y + self.shortcut(x)
        y = self.swish(y)
        return y



class Self_Attn_Seq(nn.Module):
    """
    Causal multi-head self-attention over a sequence, added to its input
    through a learnable gate.
    Args:
        in_dim: size of the last (feature) axis of the input
        n_head: (default = 3) number of attention heads; each head works on
            in_dim // n_head features
    """
    def __init__(self,in_dim, n_head=3):
        super(Self_Attn_Seq,self).__init__()
        self.n_head = n_head  # number of attention heads
        self.hidden_size_attention = in_dim // self.n_head
        proj_dim = self.n_head * self.hidden_size_attention

        self.w_q = nn.Linear(in_dim, proj_dim)
        self.w_k = nn.Linear(in_dim, proj_dim)
        self.w_v = nn.Linear(in_dim, proj_dim)
        # the three projections share one normal initialisation
        init_std = np.sqrt(2.0 / (in_dim + self.hidden_size_attention))
        for projection in (self.w_q, self.w_k, self.w_v):
            nn.init.normal_(projection.weight, mean=0, std=init_std)
        # attention scores are divided by sqrt(head size)
        self.temperature = np.power(self.hidden_size_attention, 0.5)

        self.softmax = nn.Softmax(dim=2)
        self.linear2 = nn.Linear(proj_dim, in_dim)
        self.layer_norm = nn.LayerNorm(in_dim)  # registered but not used in forward
        # gate on the attention branch; starts at zero, so the module
        # initially returns its input unchanged
        self.gamma = nn.Parameter(torch.zeros(1))

    def _split_heads(self, projected, batch, steps):
        """(batch, steps, n_head*head) -> (n_head*batch, steps, head)."""
        head_dim = self.hidden_size_attention
        per_head = projected.view(batch, steps, self.n_head, head_dim)
        return per_head.permute(2, 0, 1, 3).contiguous().view(-1, steps, head_dim)

    def forward(self, q):
        """
        Args:
            q: tensor: (batch, steps, in_dim): input sequence
        Return:
            output: tensor: (batch, steps, in_dim): gamma * attention + input
            attn_avg: tensor: (batch, steps, steps): attention weights
            averaged over the heads
        """
        batch, steps, _ = q.size()
        head_dim = self.hidden_size_attention
        skip = q

        query = self._split_heads(self.w_q(skip), batch, steps)
        key = self._split_heads(self.w_k(skip), batch, steps)
        value = self._split_heads(self.w_v(skip), batch, steps)

        # True above the diagonal: a step may not attend to later steps.
        # The (1, steps, steps) mask broadcasts over the head*batch axis.
        future = torch.triu(
            torch.ones((steps, steps), device=query.device, dtype=torch.uint8),
            diagonal=1).gt(0).unsqueeze(0)

        scores = torch.bmm(query, key.transpose(1, 2)) / self.temperature
        weights = self.softmax(scores.masked_fill(future, -np.inf))

        # merge the heads back into the feature axis
        context = torch.bmm(weights, value).view(self.n_head, batch, steps, head_dim)
        context = context.permute(1, 2, 0, 3).contiguous().view(batch, steps, -1)
        output = self.gamma * self.linear2(context) + skip

        attn_avg = torch.mean(weights.view(self.n_head, batch, steps, steps), 0)
        return output, attn_avg





class Generator(nn.Module):
    """
    Decoder that turns a flat latent vector into a sequence of feature
    vectors of shape (seq_len, input_size). It runs two stages, each made of
    temporal BasicBlocks (which change the number of time steps), one
    BasicBlockTranspose with a (3,1) kernel (which adds 2 time steps), a
    per-step BasicBlock and a Linear layer; a causal self-attention layer
    precedes stage one and separates the two stages.
    Args:
        seq_len: number of time steps in the generated sequence
        input_size: number of features per generated time step
        embedding_size: length of the latent vector; must be divisible by
            temporal_decoder_filters[0]
        temporal_decoder_filters: number of time steps after each temporal
            block; seq_len - 2 is appended as the last entry
        feat_size: width of the trailing axis the features are folded into
            in stage one and stage two
        internal_attention: one-element list with the feature size used
            between the two stages
    """
    def __init__(self,  
                 seq_len, 
                 input_size, 
                 embedding_size:int,
                 temporal_decoder_filters=[4,8,14,16],
                 feat_size = [2,4],
                 internal_attention = [168]
                 ):
        super(Generator, self).__init__()
        self.embedding_size = embedding_size

        # Work on copies: appending to the argument itself would grow the
        # shared default list (and the caller's list) on every construction,
        # leaving a stale seq_len - 2 at index 4 for later instances.
        # decode_s2 lengthens the time axis by 2, hence seq_len - 2 here.
        self.temporal_decoder_filters = list(temporal_decoder_filters) + [seq_len - 2]

        self.latent_dim_inner = self.embedding_size//self.temporal_decoder_filters[0]

        self.input_size = input_size
        self.internal_attention = list(internal_attention)
        self.feat_sizes = list(feat_size)
        self.seq_len = seq_len

        #transpose blocks
        self.decode_s1 = BasicBlockTranspose(self.latent_dim_inner//self.feat_sizes[0], self.latent_dim_inner//self.feat_sizes[0], k=(3,1))
        self.decode_s2 = BasicBlockTranspose(self.internal_attention[0]//self.feat_sizes[1], self.internal_attention[0]//self.feat_sizes[1], k=(3,1))

        # decoder 
        # conv3, conv4 and decode_t4 are not used in forward(); they stay
        # registered so the parameter layout of the module is unchanged.
        self.conv1 = BasicBlock(1,1)
        self.conv2 = BasicBlock(1,1)
        self.conv3 = BasicBlock(1,1)
        self.conv4 = BasicBlock(1,1)
        self.decode_t = BasicBlock(self.temporal_decoder_filters[0],self.temporal_decoder_filters[1])
        self.decode_t1 = BasicBlock(self.temporal_decoder_filters[1],self.temporal_decoder_filters[2])
        self.decode_t2 = BasicBlock(self.decode_s1.get_h_out(self.temporal_decoder_filters[2]),
                                    self.temporal_decoder_filters[3])
        self.decode_t3 = BasicBlock(self.temporal_decoder_filters[3],self.temporal_decoder_filters[4])
        self.decode_t4 = BasicBlock(self.decode_s2.get_h_out(self.temporal_decoder_filters[4]),
                                    self.seq_len)

        # self attention layer
        self.decoder_attn1 = Self_Attn_Seq(self.latent_dim_inner)
        self.decoder_attn2 = Self_Attn_Seq(self.internal_attention[0])
        self.decoder = nn.Linear(self.latent_dim_inner, self.internal_attention[0])
        self.decoder1 = nn.Linear(self.internal_attention[0],self.input_size)

    def forward(self, X):
        """
        Decode a batch of latent vectors: each stage first upsamples the
        temporal dimension and then decodes the per-step features.
        Args:
            X: tensor: (batch_size, embedding_size) or any shape with
            embedding_size elements per sample: latent vectors
        Return:
            x: tensor: (batch_size, seq_len, input_size): generated sequence
        """
        first_fold, second_fold = self.feat_sizes[0], self.feat_sizes[1]

        N = X.shape[0]
        X = X.reshape((N,self.temporal_decoder_filters[0],-1))
        N,T,J = X.shape
        x, _ = self.decoder_attn1(X)

        # ------------------------ block one -----------------------------
        # temporal decoding: the time steps act as conv channels
        x = x.reshape((N,T,J//first_fold,first_fold))
        x = self.decode_t(x)
        x = self.decode_t1(x)

        # transpose block: swap so that time is a spatial axis and the
        # (3,1) transposed conv can add 2 time steps
        x = x.transpose(2,1)
        x = self.decode_s1(x)
        x = x.transpose(2,1)

        # pose decoding: every time step is processed as a 1-channel map
        steps_mid = self.decode_s1.get_h_out(self.temporal_decoder_filters[2])
        x = x.reshape((N*steps_mid,1,J//first_fold,first_fold))
        x = self.conv1(x)
        x = x.reshape((N,steps_mid, -1))

        x = self.decoder(x)
        x, _ = self.decoder_attn2(x)
        # ------------------------ End of block one ---------------------

        N,T,J = x.shape
        # ------------------------ block two -----------------------------
        # temporal decoding
        x = x.reshape((N,T,J//second_fold, second_fold))
        x = self.decode_t2(x)
        x = self.decode_t3(x)

        # transpose block: seq_len - 2 time steps -> seq_len
        x = x.transpose(2,1)
        x = self.decode_s2(x)
        x = x.transpose(2,1)

        # pose decoding
        x = x.reshape((N*self.seq_len,1,J//second_fold,second_fold))
        x = self.conv2(x)
        x = x.reshape((N,self.seq_len, -1))
        x = self.decoder1(x)
        # ------------------------ End of block two ---------------------

        return x

In [245]:
# Generator for 65-step sequences of 24 features from a 1024-d latent, placed on `device`.
generator = Generator(seq_len=65, input_size=24, embedding_size=1024).to(device)

In [246]:
# Smoke test: a random batch of 16 latent vectors decodes to shape (16, 65, 24).
generator(torch.rand((16,1024)).to(device)).shape

torch.Size([16, 65, 24])

In [247]:
# Per-layer report (input/output shapes, parameter counts, kernel sizes, mult-adds) for a batch of 32 latents.
torchinfo.summary(generator, input_size=(32,1024), col_names = ("input_size", "output_size", "num_params", "kernel_size", "mult_adds"))

  action_fn=lambda data: sys.getsizeof(data.storage()),
  return super().__sizeof__() + self.nbytes()


Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds
Generator                                [32, 1024]                [32, 65, 24]              13,284                    --                        --
├─Self_Attn_Seq: 1-1                     [32, 4, 256]              [32, 4, 256]              513                       --                        --
│    └─Linear: 2-1                       [32, 4, 256]              [32, 4, 255]              65,535                    --                        2,097,120
│    └─Linear: 2-2                       [32, 4, 256]              [32, 4, 255]              65,535                    --                        2,097,120
│    └─Linear: 2-3                       [32, 4, 256]              [32, 4, 255]              65,535                    --                        2,097,120
│    └─Softmax: 2-4                      [96, 4, 4]                [96, 4, 4]       

# *The code below is worthless — don't check it*

In [None]:
# Generate config["gen_qtd"] samples per class with a pretrained generator and
# save the samples, their latent vectors and their labels under <out>/actions.
#
# NOTE(review): this cell cannot run against the cells shown in this notebook:
#   * `Variable` and `Counter` are used but never imported;
#   * Generator is called here with (latent_dim, channels, n_classes, t_size,
#     mlp_dim=..., dataset=...) and as generator(z, labels[, trunc]), which
#     does not match the Generator class defined above — presumably a
#     different Generator class was intended; confirm before reuse.
#out = general.check_runs('kinetic-gan', id=-1)
out = "/content/runs/kinetic-gan"
actions_out = os.path.join(out, 'actions')
if not os.path.exists(actions_out): os.makedirs(actions_out)

# Record the configuration used for this run next to the outputs.
config_file = open(os.path.join(out,"gen_config.txt"),"w")
config_file.write("Kinetic-GAN.ipynb" + '|' + str(config))
config_file.close()

cuda = True if torch.cuda.is_available() else False
print(cuda)

# Initialize generator 
generator     = Generator(
    config["latent_dim"], 
    config["channels"], 
    config["n_classes"], 
    config["t_size"], 
    mlp_dim=config["mlp_dim"], 
    dataset=config["dataset"])

if cuda:
    generator.cuda()

# Tensor constructors matching the device in use; trunc() also reads FloatTensor.
FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if cuda else torch.LongTensor

# Load Models
# strict=False: missing or unexpected keys in the checkpoint do not raise.
generator.load_state_dict(torch.load(config["model"], map_location=device),strict=False)
generator.eval()

# Accumulators; they start as empty lists and become arrays after the first pass.
new_imgs   = []
new_labels = []
z_s        = []

# Classes that still need samples: all of them when label == -1, else just one.
classes = np.arange(config["n_classes"]) if config["label"] == -1 else [config["label"]]
qtd = config["batch_size"]

# Optional fixed latent point taken from a file instead of being sampled.
if config["stochastic_file"]!='-':
    stoch = np.load(config["stochastic_file"]) 
    stoch = np.expand_dims(stoch[config["stochastic_index"]], 0)
    print(stoch.shape)

if config["stochastic"]:  # Generate one latent point 
    z   = Variable(FloatTensor(np.random.normal(0, 1, (1, config["latent_dim"]))  if config["stochastic_file"] == '-' else stoch ))
    z   = z.repeat(qtd*len(classes),1)

# Each pass generates qtd samples for every class still below gen_qtd.
while(len(classes)>0):

    if not config["stochastic"]: # Generate Samples if not in mode stochastic
        z         = Variable(FloatTensor(np.random.normal(0, 1, (qtd*len(classes), config["latent_dim"])))) 

    z         = trunc(z, config["mean_size"], config["trunc"]) if config["trunc_mode"]=='z' else z
    labels_np = np.array([num for _ in range(qtd) for num in classes])  # Generate labels
    labels    = Variable(LongTensor(labels_np))
    gen_imgs  = generator(z, labels, config["trunc"]) if config["trunc_mode"] == 'w' else generator(z, labels)

    # Append this pass to the accumulated results along the batch axis.
    new_imgs   = gen_imgs.data.cpu()  if len(new_imgs)==0 else np.concatenate((new_imgs, gen_imgs.data.cpu()), axis=0)
    new_labels = labels_np if len(new_labels)==0 else np.concatenate((new_labels, labels_np), axis=0)
    z_s        = z.cpu()  if len(z_s)==0 else np.concatenate((z_s, z.cpu()), axis=0)   
    

    # Keep only the classes that have fewer than gen_qtd samples so far.
    tmp     = Counter(new_labels)
    classes = [i for i in classes if tmp[i]<config["gen_qtd"]]

    print('---------------------------------------------------')
    print(tmp)
    print(len(new_labels), classes)


if config["dataset"] == 'ntu':
    new_imgs = np.expand_dims(new_imgs, axis=-1)
    

# Stack two copies of the label vector, giving shape (2, n_samples).
new_labels = np.concatenate((np.expand_dims(new_labels, 0), np.expand_dims(new_labels, 0)), axis=0)  # Remove if not needed

# The three output files share the prefix <n_classes or label>_<gen_qtd>[_trunc<value>][_stochastic].
with open(os.path.join(actions_out, str(config["n_classes"] if config["label"] == -1 else config["label"])+'_'+str(config["gen_qtd"])+('_trunc' + str(config["trunc"]) if config["trunc_mode"]!='-' else '')+('_stochastic' if config["stochastic"] else '')+'_gen_data.npy'), 'wb') as npf:
    np.save(npf, new_imgs)


with open(os.path.join(actions_out, str(config["n_classes"] if config["label"] == -1 else config["label"])+'_'+str(config["gen_qtd"])+('_trunc' + str(config["trunc"]) if config["trunc_mode"]!='-' else '')+('_stochastic' if config["stochastic"] else '')+'_gen_z.npy'), 'wb') as npf:
    np.save(npf, z_s)


with open(os.path.join(actions_out, str(config["n_classes"] if config["label"] == -1 else config["label"])+'_'+str(config["gen_qtd"])+('_trunc' + str(config["trunc"]) if config["trunc_mode"]!='-' else '')+('_stochastic' if config["stochastic"] else '')+'_gen_label.pkl'), 'wb') as f:
    pickle.dump(new_labels, f)

In [None]:
# Reload the generated samples and labels that the generation cell saved
# for n_classes=120, gen_qtd=10, trunc=0.95.
data = np.load("/content/runs/kinetic-gan/actions/120_10_trunc0.95_gen_data.npy")

# NOTE(review): pickle.load can execute code stored in the file; only load
# label files produced by this notebook.
with open("/content/runs/kinetic-gan/actions/120_10_trunc0.95_gen_label.pkl", 'rb') as f:
    labels = pickle.load(f)

In [None]:
# Drop the singleton axes (the generation cell appends a trailing axis for 'ntu').
# NOTE(review): both names are rebound in place, so re-running this cell
# applies the operations to the already-reduced arrays.
data = np.squeeze(data)
# The label file holds two stacked copies of the label vector; keep the first.
labels = labels[0].squeeze()

In [None]:
data.shape,labels.shape

((1200, 3, 64, 25), (1200,))

In [None]:
#!rm -r runs/synthetic

In [None]:
#!python visualization/action_ntu.py --path "/content/runs/kinetic-gan/actions/120_10_trunc0.95_gen_data.npy" --indexes 26 86 146 

In [None]:
def gen_skeleton(frame, 
                 height,
                 width,
                 mapping_list = [(0, 1), (1, 3), (3, 5), 
                                 (0, 2), (2, 4), (0, 6), 
                                 (1, 7), (6, 7), (6, 8), 
                                 (7, 9), (8, 10), (9, 11)]):
    """
    Draw one skeleton frame on a white canvas.
    Args:
        frame: array: (n_joints, >=2): per-joint coordinates; column 0 is
            multiplied by width and column 1 by height, so they are
            presumably normalised to [0, 1] — TODO confirm
        height: canvas height in pixels
        width: canvas width in pixels
        mapping_list: (joint_i, joint_j) index pairs to connect with a line
    Return:
        _: uint8 array: (height, width, 3): canvas with a dot per joint and
        a black line per pair
    """
    def to_pixel(point):
        # scale a joint's first two coordinates to integer pixel positions
        return int(width*point[0]), int(height*point[1])

    canvas = np.zeros([height, width,3],dtype=np.uint8)
    canvas[...] = 255  # white background

    # one dot per joint
    for joint in frame:
        canvas = cv2.circle(canvas, center=to_pixel(joint), radius=1, color=(255, 0, 0), thickness=6)

    # one line per connected joint pair; cv2.line draws onto canvas in place
    for first, second in mapping_list:
        start_point = to_pixel(frame[first, :])
        end_point = to_pixel(frame[second, :])
        cv2.line(canvas, start_point, end_point, color=(0, 0, 0), thickness=3)

    return canvas

def gen_video(points, 
              save_file, 
              frame_h, 
              frame_w, 
              is_3d=True,
              mapping_list = [(0, 1), (1, 3), (3, 5), 
                                 (0, 2), (2, 4), (0, 6), 
                                 (1, 7), (6, 7), (6, 8), 
                                 (7, 9), (8, 10), (9, 11)]):
    """
    Render a sequence of skeleton frames to a video file (MJPG, 10 fps).
    Args:
        points: array: (n_frames, n_joints, n_coords), or flattened as
            (n_frames, n_joints * n_coords) with the coordinates of a joint
            stored next to each other (x, y, z, x, y, z, ...)
        save_file: path of the video file to write
        frame_h: frame height in pixels
        frame_w: frame width in pixels
        is_3d: only used for flattened input: True means 3 coordinates per
            joint, False means 2
        mapping_list: (joint_i, joint_j) index pairs passed to gen_skeleton
    """
    # make 3D if points are flatten
    if len(points.shape) == 2:
        n_coords = 3 if is_3d else 2
        fts = points.shape[1]
        # one (n_frames, n_joints) slice per coordinate, then move the
        # coordinate axis last -> (n_frames, n_joints, n_coords)
        per_coord = [points[:, list(range(c, fts, n_coords))] for c in range(n_coords)]
        points = np.transpose(np.array(per_coord), (1,2,0))

    size = (frame_w, frame_h)
    result = cv2.VideoWriter(save_file,
                         cv2.VideoWriter_fourcc(*'MJPG'),
                         10, size)

    # Release the writer even when drawing a frame raises, so the file
    # handle is not leaked and the frames written so far are flushed.
    try:
        for frame in points:
            skel_image = gen_skeleton(frame, frame_h, frame_w,mapping_list=mapping_list)
            result.write(skel_image)
    finally:
        result.release()

In [None]:
# Skeleton edges as (joint_a, joint_b) index pairs over joints 0-24; passed to
# gen_video as mapping_list.
# presumably the NTU RGB+D 25-joint layout — TODO confirm
joint_map = [(3,2),(2,20),(20,4),(4,5),(5,6),(6,7),(7,21),(7,22),(20,8),(8,9),(9,10),(10,11),(11,23),(11,24),
            (20,1),(1,0),(0,12),(12,13),(13,14),(14,15),(0,16),(16,17),(17,18),(18,19),(8,16),(4,12),(8,4),(16,12)]

In [None]:
# Write one skeleton video per generated sample under save_vids_dir/<index>/.
save_vids_dir = "checking_vids/init"
# enumerate() yields (index, (sample, label)), so the zipped pair is unpacked
# in its own tuple. The sample keeps its own name instead of rebinding `data`,
# which is the array being iterated over.
for file_id, (adata, target) in enumerate(tqdm(zip(data,labels), total=min(len(data), len(labels)))):
  # NOTE(review): `selected_ind` is not defined in any cell shown here, and this
  # indexing expects each sample to carry the video size at position 3 —
  # confirm what the samples should look like before running this cell.
  vid_size = [int(adata[3][0][selected_ind]),int(adata[3][1][selected_ind])]
  video_path = f"{save_vids_dir}/{file_id}/dataloader_out_cls_{target}.mp4"

  try:
    # skip samples whose video already exists
    if not os.path.exists(video_path):
      os.makedirs(f"{save_vids_dir}/{file_id}",exist_ok=True)
      gen_video(adata, 
                video_path,
                vid_size[0], 
                vid_size[1],
                is_3d=False,
                mapping_list=joint_map
                )
  except ValueError:
    # best effort: a sample that cannot be rendered is skipped
    continue

torch.Size([32, 2, 60, 25])