In [1]:
import cv2
import threading 
import time
import torch
import torchvision
from torchvision import transforms
import numpy as np

In [2]:
if(torch.cuda.is_available()):
    device = torch.device("cuda")
    print(device, torch.cuda.get_device_name(0))
else:
    device= torch.device("cpu")
    print(device)

cuda NVIDIA GeForce RTX 3060


### Load Slow Fast

In [3]:
torch.hub._validate_not_a_forked_repo=lambda a,b,c: True
SlowFastNet = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)

Using cache found in C:\Users\ADMIN/.cache\torch\hub\facebookresearch_pytorchvideo_main


In [4]:
SlowFastNet = SlowFastNet.eval()
SlowFastNet = SlowFastNet.to(device)

In [5]:
import urllib
import json

json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
try: urllib.URLopener().retrieve(json_url, json_filename)
except: urllib.request.urlretrieve(json_url, json_filename)

In [6]:
with open(json_filename, "r") as f:
    kinetics_classnames = json.load(f)

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")

### Read img frames from web cam using Threading

In [7]:
#啟動 web cam, 以 global variables 紀錄, 
__cap = cv2.VideoCapture(0) 
__ret, __frame = __cap.read()

In [8]:
def Read_web_cam():
    global __ret, __frame
    t = threading.currentThread()
    while getattr(t, "do_run", True):
        __ret, __frame = __cap.read()

In [9]:
t = threading.Thread(target = Read_web_cam, args = ())
t.start()

In [10]:
num_frames = 32
sampling_rate = 2
SlowFastFrames = num_frames * sampling_rate

In [11]:
# collect SlowFastFrames = num_frames * sampling_rate
imgLst =[]
frame_count = 1
cv2.namedWindow("window", cv2.WINDOW_NORMAL)
while (frame_count<= SlowFastFrames):
    if __ret == True:
        cv2.imshow("window", __frame) # np array (H, W, 3) 
        
        transform = transforms.Compose([transforms.ToTensor()]) 
        img = transform(__frame) # torch (3, H, W)
        imgLst.append(img)
        frame_count +=1
t.do_run = False  #stop thread
t.join() #when thread finishes comes here
__cap.release()
cv2.destroyAllWindows()

In [12]:
print(len(imgLst), imgLst[0].shape)

64 torch.Size([3, 480, 640])


In [13]:
# torch.stack stacks a list of tensors
#a = [torch.FloatTensor([1, 2, 3]), torch.FloatTensor([3, 4, 5])]
#b = torch.stack(a)  
#tensor([[1., 2., 3.],
#        [3., 4., 5.]])

ImgsTensor = torch.stack(imgLst)  #stach a list of tensors 
print(ImgsTensor.shape)

torch.Size([64, 3, 480, 640])


In [14]:
# convert tensor shape from [64, 3, 256, 256] to [3, 64, 256, 256]
ImgsTensor = ImgsTensor.permute(1, 0, 2, 3) #swap axis 0 and 1
print(ImgsTensor.shape)

torch.Size([3, 64, 480, 640])


### Sent image frame tensor to SlowFast

In [15]:
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
slowfast_alpha = 4

In [16]:
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import NormalizeVideo
from pytorchvideo.transforms import UniformTemporalSubsample



In [17]:
transform=Compose(
    [
        UniformTemporalSubsample(num_frames),
        Lambda(lambda x: x/255.0),
        NormalizeVideo(mean, std),
    ])

In [18]:
fast_pathway = transform(ImgsTensor)
print(fast_pathway.shape)

torch.Size([3, 32, 480, 640])


In [19]:
slow_pathway = torch.index_select(
            fast_pathway,
            1,
            torch.linspace(
                0, fast_pathway.shape[1] - 1, fast_pathway.shape[1] // slowfast_alpha
            ).long(),
        )
print(slow_pathway.shape)

torch.Size([3, 8, 480, 640])


In [20]:
SlowFastInputFrames = [slow_pathway, fast_pathway]

In [21]:
SlowFastInputFrames = [i.to(device)[None, ...] for i in SlowFastInputFrames]

In [22]:
# Pass the input clip through the model
preds = SlowFastNet(SlowFastInputFrames)

In [23]:
# Get the predicted classes
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)

In [24]:
preds[0].topk(k=5)

torch.return_types.topk(
values=tensor([0.5935, 0.3248, 0.0363, 0.0232, 0.0047], device='cuda:0',
       grad_fn=<TopkBackward0>),
indices=tensor([304,  84,  93, 326,  40], device='cuda:0'))

In [25]:
print(preds[0].topk(k=5)[0], "\n", preds[0].topk(k=5)[1])

tensor([0.5935, 0.3248, 0.0363, 0.0232, 0.0047], device='cuda:0',
       grad_fn=<TopkBackward0>) 
 tensor([304,  84,  93, 326,  40], device='cuda:0')


In [26]:
pred_classes = preds[0].topk(k=5)[1]
print(pred_classes)

tensor([304,  84,  93, 326,  40], device='cuda:0')


In [44]:
# Map the predicted classes to the label names
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))

Top 5 predicted labels: singing, dancing ballet, diving cliff, spinning poi, bungee jumping


In [34]:
# get answer with prob>0.1
result = []
for i, score in enumerate(preds[0].topk(k=5)[0].tolist()):
    if(score >= 0.1):
        ActionID = preds[0].topk(k=5)[1][i] 
        ActionLabel = kinetics_id_to_classname[int(ActionID)]
        print(ActionLabel, round(score, 2))
        result.append([ActionLabel, round(score, 2)])

singing 0.59
dancing ballet 0.32


In [35]:
result

[['singing', 0.59], ['dancing ballet', 0.32]]