In [1]:
import cv2
import threading 
import time
import torch
import torchvision
from torchvision import transforms

In [None]:
if(torch.cuda.is_available()):
    device = torch.device("cuda")
    print(device, torch.cuda.get_device_name(0))
else:
    device= torch.device("cpu")
    print(device)

In [2]:
# if CUDA Out of Memory, use CPU
device = torch.device("cpu")

### Load Slow Fast

In [3]:
torch.hub._validate_not_a_forked_repo=lambda a,b,c: True
SlowFastNet = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)
SlowFastNet = SlowFastNet.eval()
SlowFastNet = SlowFastNet.to(device)

Using cache found in C:\Users\ADMIN/.cache\torch\hub\facebookresearch_pytorchvideo_main


In [4]:
import urllib
import json

json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
try: urllib.URLopener().retrieve(json_url, json_filename)
except: urllib.request.urlretrieve(json_url, json_filename)
    
with open(json_filename, "r") as f:
    kinetics_classnames = json.load(f)

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")

In [5]:
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import NormalizeVideo
from pytorchvideo.transforms import UniformTemporalSubsample



In [6]:
# Slow Fast parameters
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
slowfast_alpha = 4
num_frames = 32
sampling_rate = 2

In [7]:
SlowFastTransform=Compose(
    [
        UniformTemporalSubsample(num_frames),
        Lambda(lambda x: x/255.0),
        NormalizeVideo(mean, std),
    ])

### Read img frames from web cam using Threading

In [8]:
# read web cam thread
def Read_web_cam():
    global __ret, __frame
    t1 = threading.currentThread()
    while getattr(t1, "do_run", True):
        __ret, __frame = __cap.read()

In [9]:
# used by Predict_action thread
def Collect_img_frames_and_predict_action (NoFrames):
    #collect No. of image frames from web cam
    #return a list [img1, img2, ...], img = torch (3, H, W)
    global __ret, __frame
    imgLst =[]
    frame_count = 1
    while (frame_count<= NoFrames):
        if __ret == True:
            transform = transforms.Compose([transforms.ToTensor()]) 
            img = transform(__frame) # torch (3, H, W)
            imgLst.append(img)
            frame_count +=1
    
    # pass image frames to SlowFast
    ImgsTensor = torch.stack(imgLst)  #stach a list of tensors 
    ImgsTensor = ImgsTensor.permute(1, 0, 2, 3)
    fast_pathway = SlowFastTransform(ImgsTensor)
    slow_pathway = torch.index_select(
        fast_pathway,
        1,
        torch.linspace(
            0, fast_pathway.shape[1] - 1, fast_pathway.shape[1] // slowfast_alpha
        ).long(),
    )
    SlowFastInputFrames = [slow_pathway, fast_pathway]
    SlowFastInputFrames = [i.to(device)[None, ...] for i in SlowFastInputFrames]
    preds = SlowFastNet(SlowFastInputFrames)
    post_act = torch.nn.Softmax(dim=1)
    preds = post_act(preds)
    del SlowFastInputFrames
    torch.cuda.empty_cache() #release GPU memory cache
    return preds

In [10]:
# calculate No of frames need to be sent to SlowFast
SlowFastFrames = num_frames * sampling_rate

In [11]:
# thread to predict action
def Predict_action():
    #every 30 seconds, collect image frames and send to SlowFast
    t2 = threading.currentThread()
    while getattr(t2, "do_run", True):
        start = time.time()
        time_to_start = time.strftime("%m/%d/%Y, %H:%M:%S", time.localtime())
        print(time_to_start, end ="==>")
        preds= Collect_img_frames_and_predict_action (SlowFastFrames)
        for i, score in enumerate(preds[0].topk(k=5)[0].tolist()):
            if(score >= 0.1):
                ActionID = preds[0].topk(k=5)[1][i] 
                ActionLabel = kinetics_id_to_classname[int(ActionID)]
                print(ActionLabel, round(score, 2), end=",  ")
        print("\n")
        timePassed = time.time()-start
        #time.sleep(30-timePassed)
        if(timePassed<10):
            time.sleep(10-timePassed)

In [12]:
#啟動 web cam, 以 global variables 紀錄, 
__cap = cv2.VideoCapture(0) 
__ret, __frame = __cap.read()

# seperate thread to read image frames from web cam
t1 = threading.Thread(target = Read_web_cam, args = ())
t1.start()

# seperate thread to collect image frames and send to SlowFast
t2 = threading.Thread(target = Predict_action, args = ())
t2.start()

# main thread to display
cv2.namedWindow("window", cv2.WINDOW_NORMAL)
while (True):
    if __ret == True:
        cv2.imshow("window", __frame)
    # Press Q on keyboard to exit
    if (cv2.waitKey(25) & 0xFF == ord('q')):
        t1.do_run = t2.do_run = False  #stop thread
        break #break from while loop

t1.join() #when thread finishes comes here
t2.join()
__cap.release()
cv2.destroyAllWindows()

01/08/2022, 15:46:19==>singing 0.51,  dancing ballet 0.38,  

01/08/2022, 15:46:29==>singing 0.45,  dancing ballet 0.44,  

01/08/2022, 15:46:39==>singing 0.45,  dancing ballet 0.45,  

01/08/2022, 15:46:49==>singing 0.47,  dancing ballet 0.41,  

01/08/2022, 15:46:59==>dancing ballet 0.44,  singing 0.44,  

01/08/2022, 15:47:09==>singing 0.69,  dancing ballet 0.26,  

01/08/2022, 15:47:19==>singing 0.63,  dancing ballet 0.31,  

01/08/2022, 15:47:29==>singing 0.59,  dancing ballet 0.36,  

01/08/2022, 15:47:39==>singing 0.61,  dancing ballet 0.33,  

01/08/2022, 15:47:49==>singing 0.56,  dancing ballet 0.36,  

01/08/2022, 15:47:59==>singing 0.65,  dancing ballet 0.3,  

01/08/2022, 15:48:09==>singing 0.62,  dancing ballet 0.3,  

01/08/2022, 15:48:19==>singing 0.57,  dancing ballet 0.38,  

01/08/2022, 15:48:29==>singing 0.55,  dancing ballet 0.39,  

01/08/2022, 15:48:39==>singing 0.57,  dancing ballet 0.37,  

01/08/2022, 15:48:49==>singing 0.67,  dancing ballet 0.27,  

01/08/2022