# UCF Encoding

In this notebook, we will try to encode the whole UCF-Crime dataset, and form a dataset of embeddings.

For this, we will encode each frame and text with CLIP.

### Imports

In [1]:
# General
import numpy as np
import os

# UCA Dataset
import pandas as pd

# CLIP encoding
from transformers import CLIPModel, AutoProcessor
import torch
import cv2

# My code
from utils import read_uca_as_df

ModuleNotFoundError: No module named 'numpy'

### Paths

In [2]:
ucf_crime_path = '/media/pablo/358690d7-e500-45fb-b8f8-bc48c6be13e3/UCF-Crimes/Videos'

uca_path = '/media/pablo/358690d7-e500-45fb-b8f8-bc48c6be13e3/Surveillance-Video-Understanding/UCF Annotation/json'

save_path = 'clip_embs'

### Read UCA dataset

In [3]:
uca_df = read_uca_as_df(uca_path=uca_path)

In [4]:
uca_df.head()

Unnamed: 0,video_duration,timestamp,sentence,video,dataset,clip_duration,class_name,anomaly,sentence_length
0,91.0,"[0.0, 5.3]","A woman with short hair, slightly fat, wearing...",Abuse001_x264,train,5.3,Abuse,True,158
1,91.0,"[7.0, 8.5]",A man wearing a white shirt and black pants en...,Abuse001_x264,train,1.5,Abuse,True,144
2,91.0,"[7.2, 8.5]",A man wearing a black shirt and black pants en...,Abuse001_x264,train,1.3,Abuse,True,144
3,91.0,"[8.2, 8.9]",A man wearing a white shirt and black pants ap...,Abuse001_x264,train,0.7,Abuse,True,275
4,91.0,"[8.9, 11.2]",A man in black clothes approached a short-hair...,Abuse001_x264,train,2.3,Abuse,True,185


### Initialize CLIP

In [5]:
model = CLIPModel.from_pretrained('openai/clip-vit-large-patch14')
processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")

In [6]:
def get_img_embedding(model:CLIPModel, processor:AutoProcessor, img:np.array) -> torch.FloatTensor:
    inputs = processor(images=img, return_tensors='pt')

    image_features = model.get_image_features(**inputs)

    return image_features

### Get frames

In [7]:
uca_df.head()

Unnamed: 0,video_duration,timestamp,sentence,video,dataset,clip_duration,class_name,anomaly,sentence_length
0,91.0,"[0.0, 5.3]","A woman with short hair, slightly fat, wearing...",Abuse001_x264,train,5.3,Abuse,True,158
1,91.0,"[7.0, 8.5]",A man wearing a white shirt and black pants en...,Abuse001_x264,train,1.5,Abuse,True,144
2,91.0,"[7.2, 8.5]",A man wearing a black shirt and black pants en...,Abuse001_x264,train,1.3,Abuse,True,144
3,91.0,"[8.2, 8.9]",A man wearing a white shirt and black pants ap...,Abuse001_x264,train,0.7,Abuse,True,275
4,91.0,"[8.9, 11.2]",A man in black clothes approached a short-hair...,Abuse001_x264,train,2.3,Abuse,True,185


In [8]:
uca_df[uca_df['video'] == 'Assault013_x264']

Unnamed: 0,video_duration,timestamp,sentence,video,dataset,clip_duration,class_name,anomaly,sentence_length
1174,79.34,"[1.3, 18.5]","In the restaurant, two women wanted to leave a...",Assault013_x264,train,17.2,Assault,True,132
1175,79.34,"[19.0, 26.8]",Two thugs with sticks came in and beat people ...,Assault013_x264,train,7.8,Assault,True,103
1176,79.34,"[19.5, 26.0]",Several other women were also kicked out of th...,Assault013_x264,train,6.5,Assault,True,91
1177,79.34,"[41.8, 79.3]","Two or three people came in from outside, lift...",Assault013_x264,train,37.5,Assault,True,115


In [9]:
for root, dirs, files in os.walk(ucf_crime_path):
    if not dirs: # It is a category's folder
        for video in files:
            
            # Get all annotations belonging to this video
            sub_df = uca_df[uca_df['video'] == video.split('.')[0]]

            # Read video
            video_path = os.path.join(root, video)
            cap = cv2.VideoCapture(video_path)
            fps = cap.get(cv2.CAP_PROP_FPS)

            # Get timestamps
            timestamps = sub_df['timestamp'].tolist()

            sorted_timestamps = sorted(timestamps, key=lambda x: x[0])
            
            for timestamp in (sorted_timestamps):
                
                start_frame, end_frame = int(timestamp[0]*fps), int(timestamp[1]*fps)

                clip_array = np.zeros((end_frame-start_frame, 768))

                cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

                for i in range(start_frame, end_frame):
                    ret, frame = cap.read()
                    if ret:
                        print('Getting emb...')
                        emb = get_img_embedding(model=model, processor=processor, img=frame)
                        clip_array[i-start_frame] = emb.detach().numpy()
                    if i >= start_frame+5:
                        break
                
                np_file_name = video.split('.')[0] + '_' + str(start_frame) + '-' + str(end_frame) + '.npy'
                print('Saving to ' + os.path.join(save_path, np_file_name))
                np.save(os.path.join(save_path, np_file_name), clip_array)

            cap.release()
        break







Getting emb...
Getting emb...
Getting emb...
Getting emb...
Getting emb...
Getting emb...
Saving to clip_embs/Assault013_x264_39-555.npy
Getting emb...
Getting emb...
Getting emb...
Getting emb...
Getting emb...
Getting emb...
Saving to clip_embs/Assault013_x264_570-804.npy
Getting emb...
Getting emb...
Getting emb...
Getting emb...
Getting emb...
Getting emb...
Saving to clip_embs/Assault013_x264_585-780.npy
Getting emb...
Getting emb...
Getting emb...
Getting emb...
Getting emb...
Getting emb...
Saving to clip_embs/Assault013_x264_1254-2379.npy
Getting emb...
Getting emb...
Getting emb...
Getting emb...


KeyboardInterrupt: 

In [13]:
np.load('clip_embs/Assault013_x264_570-804.npy')[0]

array([ 1.77332401e-01,  1.10363793e+00,  4.30029482e-01,  9.51468527e-01,
       -7.02912956e-02, -5.57404757e-02,  3.70428592e-01,  1.84616685e-01,
       -4.06397104e-01, -4.25457597e-01, -3.02474052e-01,  3.71142805e-01,
        1.02982402e-01,  4.94437546e-01,  3.68305087e-01, -3.08489144e-01,
       -6.71047449e-01, -7.09713817e-01, -3.21384639e-01, -5.09767532e-01,
       -3.45718592e-01, -3.01636189e-01, -1.49021745e-02, -3.47284555e-01,
        1.87883779e-01,  5.61046481e-01,  1.07402876e-01, -2.06557512e-02,
       -5.47952503e-02, -5.40044785e-01,  1.05459504e-01, -8.14974308e-02,
       -2.91178256e-01, -6.75406098e-01, -1.09499300e+00,  3.00662398e-01,
        7.41113573e-02,  3.57261077e-02, -5.07738829e-01,  3.91648591e-01,
       -7.17744529e-02, -3.39519948e-01,  3.87987435e-01,  1.02559768e-01,
       -4.83796537e-01, -7.34097242e-01,  3.38603199e-01,  3.17896247e-01,
        1.75777227e-01, -1.09225130e+00, -7.26605058e-02,  1.82681888e-01,
        2.02016830e-01,  