In [9]:
import random
import pytorch_lightning
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
import pytorchvideo.data
import pathlib
import cv2
import torch.nn.functional as F
from argparse import ArgumentParser
import torch.nn as nn
from torchvision import transforms
import torch
import os
import time
import numpy
import pytorchvideo.models.resnet
from os.path import exists
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    UniformTemporalSubsample,
    RandomShortSideScale,
    Normalize
)
from typing import Any, Callable, List, Optional
from pytorchvideo.data.encoded_video import EncodedVideo

In [2]:
IMAGENET_STATS = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
FRAME_SIZE = (int(720/2), int(1280/2))

In [13]:
transform = transforms.Compose(
      [
        ApplyTransformToKey(
          key="video",
          transform= transforms.Compose(
            [
              UniformTemporalSubsample(8),
              transforms.Lambda(lambda x: x / 255.0),
              Normalize(*IMAGENET_STATS),
            ]
          ),
        ),
      ]
    )

device = 'cpu'

In [4]:
class DashcamStopTimeModel(pytorch_lightning.LightningModule):
  def __init__(self):
    super(DashcamStopTimeModel, self).__init__()
    self.model = pytorchvideo.models.resnet.create_resnet(
      input_channel=3, # RGB input from Kinetics
      model_depth=50, # For the tutorial let's just use a 50 layer network
      model_num_class=1, # Kinetics has 400 classes so we need out final head to align
      norm=nn.BatchNorm3d
    )
    print(self.model)
    #self.model.fc = nn.Linear(in_features=2048, out_features=1)
    #self.model.classifier = nn.Linear(in_features=1024, out_features=1)
    #freeze_layers(self.model)

  def forward(self, x):
    out = self.model(x)
    return out

  def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=3e-3)
    return optimizer

  def loss_function(self, logits, labels):
    return F.l1_loss(logits, labels)
    #return F.mse_loss(logits, labels).float()

  def training_step(self, train_batch, batch_idx):
    y_hat = self.model(train_batch["video"])
    loss = self.loss_function(y_hat, train_batch["label"])
    self.log('train_loss', loss)
    return loss

  def validation_step(self, val_batch, batch_idx):
    y_hat = self.model(val_batch['video'])
    loss = self.loss_function(y_hat, val_batch['label'])
    self.log('val_loss', loss)

In [6]:
model = DashcamStopTimeModel.load_from_checkpoint('/home/scott/Documents/fh/project_coursework/lightning_logs/version_1/checkpoints/epoch=69-step=24640.ckpt')

Net(
  (blocks): ModuleList(
    (0): ResNetBasicStem(
      (conv): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
      (norm): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation): ReLU()
      (pool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=[0, 1, 1], dilation=1, ceil_mode=False)
    )
    (1): ResStage(
      (res_blocks): ModuleList(
        (0): ResBlock(
          (branch1_conv): Conv3d(64, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
          (branch1_norm): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (branch2): BottleneckBlock(
            (conv_a): Conv3d(64, 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
            (norm_a): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (act_a): ReLU()
            (conv_b): Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 

In [15]:
def predict(video_filename):
    video = EncodedVideo.from_path(video_filename)


    # Select the duration of the clip to load by specifying the start and end duration
    # The start_sec should correspond to where the action occurs in the video
    start_sec = 0
    clip_duration = int(video.duration)
    end_sec = start_sec + clip_duration    

    # Load the desired clip
    video_data = video.get_clip(start_sec=start_sec, end_sec=2)

    # Apply a transform to normalize the video input
    video_data = transform(video_data)

    # Move the inputs to the desired device
    inputs = video_data["video"]
    inputs = inputs.to(device)

    # Pass the input clip through the model
    preds_pre_act = model(inputs[None, ...])
    print(preds_pre_act)

In [31]:
import matplotlib.pyplot as plt
import matplotlib
def show_img(im, ax=None, figsize=(8,8), bgr=True):
  '''Function taken from lecture material. Added in support for showing RGB images in addition to BGR, set the bgr parameter to be False (defaults to True) to show a RGB image'''
  if not ax: _,ax = plt.subplots(1,1,figsize=figsize)
  if len(im.shape)==2: im = numpy.tile(im[:,:,None], 3)
  show_im = im
  ax.imshow(im[:,:,::-1]);
  ax.xaxis.set_visible(False)
  ax.yaxis.set_visible(False)
  return ax

In [18]:
predict('/mnt/ssd/scott/temp/video-shorts/35b5a5cc-461a1da6.mov')

tensor([[6132.1841]], grad_fn=<ViewBackward0>)


In [19]:
DIR = pathlib.Path('/mnt/ssd/scott/temp/video-shorts')
MAX_FILES = 50

file_counter = 0

for file in DIR.iterdir():
    file_name = file.resolve()
    result = predict(file)
    
    print(f'{file.name} = {result}')
    
    file_counter += 1
    if file_counter >= MAX_FILES:
        break
        
print('done')

tensor([[6145.9692]], grad_fn=<ViewBackward0>)
02bb67ae-8c3d61f8.mov = None
tensor([[6143.0522]], grad_fn=<ViewBackward0>)
02c01e62-c53e91a0.mov = None
tensor([[6138.3535]], grad_fn=<ViewBackward0>)
02cdc06d-5502f174.mov = None
tensor([[6143.6201]], grad_fn=<ViewBackward0>)
02ce41e9-147b2dd1.mov = None
tensor([[6145.6538]], grad_fn=<ViewBackward0>)
02d478d1-42ca3cfb.mov = None
tensor([[6130.8828]], grad_fn=<ViewBackward0>)
02d478d1-84df3ff0.mov = None
tensor([[6136.7627]], grad_fn=<ViewBackward0>)
02d478d1-cfb83eb8.mov = None
tensor([[6154.3423]], grad_fn=<ViewBackward0>)
02d5e1be-3827db43.mov = None
tensor([[6150.4219]], grad_fn=<ViewBackward0>)
02d5e1be-c0e9cba4.mov = None
tensor([[6140.0625]], grad_fn=<ViewBackward0>)
02d5e1be-cb00355e.mov = None
tensor([[6137.3833]], grad_fn=<ViewBackward0>)
02ddd956-1bee234c.mov = None
tensor([[6144.5752]], grad_fn=<ViewBackward0>)
02ddd956-8089f55d.mov = None
tensor([[6142.3120]], grad_fn=<ViewBackward0>)
02e948bb-53fcbba0.mov = None
tensor([[613

In [25]:
image