# X-CLIP : Video to Text Retrieval

## XCLIPModel

In [1]:
# ---------------------------------------------------------------------------
# Checkpoint selection: keep exactly one `model_name` line uncommented.
# The note at the end of each line is the clip_len annotated for that checkpoint.
# ---------------------------------------------------------------------------

## clip_len = 8
# model_name = "microsoft/xclip-base-patch32" # clip_len = 8
# model_name = "microsoft/xclip-base-patch16" # clip_len = 8
# model_name = "microsoft/xclip-large-patch14" # clip_len = 8

## clip_len = 16
# model_name = "microsoft/xclip-base-patch32-16-frames" # clip_len = 16
model_name = "microsoft/xclip-base-patch16-16-frames" # clip_len = 16
# model_name = "microsoft/xclip-large-patch14-16-frames" #  clip_len = 16

## HMDB-51
# model_name = "microsoft/xclip-base-patch16-hmdb-2-shot" # clip_len = 32
# model_name = "microsoft/xclip-base-patch16-hmdb-4-shot" # clip_len = 32
# model_name = "microsoft/xclip-base-patch16-hmdb-8-shot" # clip_len = 32
# model_name = "microsoft/xclip-base-patch16-hmdb-16-shot" # clip_len = 32

## UCF-101
# model_name = "microsoft/xclip-base-patch16-ucf-2-shot" # clip_len = 32
# model_name = "microsoft/xclip-base-patch16-ucf-4-shot" # clip_len = 32
# model_name = "microsoft/xclip-base-patch16-ucf-8-shot" # clip_len = 32
# model_name = "microsoft/xclip-base-patch16-ucf-16-shot" # clip_len = 32

## Kinetics-400
# model_name = "microsoft/xclip-base-patch16-zero-shot" # clip_len = 8

## Kinetics-600
# model_name = "microsoft/xclip-base-patch16-kinetics-600" # clip_len = 8
# model_name = "microsoft/xclip-large-patch14-kinetics-600" # clip_len = 8
# model_name = "microsoft/xclip-base-patch16-kinetics-600-16-frames" # clip_len = 16

# Number of frames sampled per video; keep in sync with the selected checkpoint.
clip_len = 16
# Seed passed to np.random.seed at the top of every inference cell.
seed = 826

# Text prompts scored against each video. The position of a prompt in this
# list is the class index printed as "true"/"pred" by the inference cells.
label = [
    f"A video of action, {class_name}"
    for class_name in (
        "abuse", "arrest", "arson", "assault", "burglary",
        "explosion", "fighting", "road accident", "robbery", "shooting",
        "shoplifting", "stealing", "vandalism",
    )
]

label

['A video of action, abuse',
 'A video of action, arrest',
 'A video of action, arson',
 'A video of action, assault',
 'A video of action, burglary',
 'A video of action, explosion',
 'A video of action, fighting',
 'A video of action, road accident',
 'A video of action, robbery',
 'A video of action, shooting',
 'A video of action, shoplifting',
 'A video of action, stealing',
 'A video of action, vandalism']

In [2]:
import av
import torch
import numpy as np

from transformers import AutoProcessor, AutoModel
from huggingface_hub import hf_hub_download

def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): Frame indices to decode, in ascending order
            (the first and last entries bound the range that is decoded).
            An index may appear more than once; its frame is then repeated.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3),
            one entry per requested index, in the order of `indices`.
            Indices that lie beyond the last decodable frame are skipped.
    Raises:
        ValueError: If none of the requested frames could be decoded.
    '''
    requested = np.asarray(indices).tolist()
    # Set lookup instead of scanning the index array for every decoded frame.
    wanted = set(requested)
    start_index = requested[0]
    end_index = requested[-1]

    decoded = {}
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            # Everything requested has been seen; stop decoding early.
            break
        if i >= start_index and i in wanted:
            decoded[i] = frame.to_ndarray(format="rgb24")

    # Build the result from the request list so repeated indices yield repeated
    # frames and the output length matches the number of decodable requests.
    frames = [decoded[i] for i in requested if i in decoded]
    if not frames:
        raise ValueError(
            f"no frames decoded for indices in [{start_index}, {end_index}]; "
            "the video may be shorter than the requested range"
        )
    return np.stack(frames)


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from the video.

    A window of `clip_len * frame_sample_rate` frames is placed at a random
    position (drawn from NumPy's global random state, so seed it with
    `np.random.seed` for reproducible results) and `clip_len` evenly spaced
    indices are taken from that window.
    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Maximum allowed index of sample's last frame.
    Returns:
        indices (`np.ndarray`): int64 array of `clip_len` sampled frame indices
    Raises:
        ValueError: If the sampling window does not fit inside `seg_len`.
    '''
    converted_len = int(clip_len * frame_sample_rate)
    if converted_len >= seg_len:
        # np.random.randint would fail here with an unhelpful "low >= high".
        raise ValueError(
            f"sampling window of {converted_len} frames "
            f"(clip_len={clip_len} * frame_sample_rate={frame_sample_rate}) "
            f"does not fit in seg_len={seg_len}"
        )
    # The window end is drawn from [converted_len, seg_len).
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # linspace includes end_idx itself; cap at end_idx - 1 and truncate to ints.
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

### Abuse

In [3]:
np.random.seed(seed)  # re-seed so the sampled frame window is reproducible

# UCF-Crime sample for the "abuse" class. Its length is read from the
# container below rather than assumed.
file_path = "data/UCF-Crime/Abuse001_x264.mp4"
# file_path = "data/UCF-Crime_TH/Abuse001_x264_5s-15s.mp4"
# file_path = "data/UCF-Crime_TH/Abuse001_x264_5s-15s_crop.mp4"

# The context manager closes the file once the frames have been decoded.
with av.open(file_path) as container:
    # NOTE(review): `frames` is stream metadata and may be 0 for some files — confirm.
    seg_len = container.streams.video[0].frames

    # Stride chosen so that clip_len samples span (almost) the whole video.
    frame_sample_rate = int((seg_len - 1) / clip_len)
    print(f"frame_sample_rate : {frame_sample_rate}")
    print(f"seg_len : {seg_len}")

    # sample clip_len frames
    indices = sample_frame_indices(
        clip_len=clip_len, frame_sample_rate=frame_sample_rate,
        seg_len=seg_len
    )
    video = read_video_pyav(container, indices)

# Re-created here so the cell can run on its own.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# `padding=True` is for the text prompts, which differ in length.
inputs = processor(
    text=label,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

# Indices into `label`: ground truth vs. highest-probability prompt.
print("abuse true : 0")
print(f"abuse pred : {torch.argmax(probs).item()}")

frame_sample_rate : 170
seg_len : 2729


  return self.fget.__get__(instance, owner)()
Unused or unrecognized kwargs: padding.
  return torch.tensor(value)


tensor([[0.2072, 0.1117, 0.0448, 0.0172, 0.1716, 0.0355, 0.0190, 0.0550, 0.0914,
         0.0627, 0.0495, 0.0668, 0.0678]])
abuse true : 0
abuse pred : 0


### Arrest

In [4]:
np.random.seed(seed)  # re-seed so the sampled frame window is reproducible

# UCF-Crime sample for the "arrest" class. Its length is read from the
# container below rather than assumed.
file_path = "data/UCF-Crime/Arrest023_x264.mp4"

# The context manager closes the file once the frames have been decoded.
with av.open(file_path) as container:
    # NOTE(review): `frames` is stream metadata and may be 0 for some files — confirm.
    seg_len = container.streams.video[0].frames

    # Stride chosen so that clip_len samples span (almost) the whole video.
    frame_sample_rate = int((seg_len - 1) / clip_len)
    print(f"frame_sample_rate : {frame_sample_rate}")
    print(f"seg_len : {seg_len}")

    # sample clip_len frames
    indices = sample_frame_indices(
        clip_len=clip_len, frame_sample_rate=frame_sample_rate,
        seg_len=seg_len
    )
    video = read_video_pyav(container, indices)

# Re-created here so the cell can run on its own.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# `padding=True` is for the text prompts, which differ in length.
inputs = processor(
    text=label,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

# Indices into `label`: ground truth vs. highest-probability prompt.
print("arrest true : 1")
print(f"arrest pred : {torch.argmax(probs).item()}")

frame_sample_rate : 112
seg_len : 1807


Unused or unrecognized kwargs: padding.


tensor([[0.0485, 0.1312, 0.0076, 0.0392, 0.0486, 0.0084, 0.0246, 0.0868, 0.1934,
         0.0469, 0.3162, 0.0074, 0.0412]])
arrest true : 1
arrest pred : 10


### Arson

In [5]:
np.random.seed(seed)  # re-seed so the sampled frame window is reproducible

# UCF-Crime sample for the "arson" class. Its length is read from the
# container below rather than assumed.
file_path = "data/UCF-Crime/Arson002_x264.mp4"

# The context manager closes the file once the frames have been decoded.
with av.open(file_path) as container:
    # NOTE(review): `frames` is stream metadata and may be 0 for some files — confirm.
    seg_len = container.streams.video[0].frames

    # Stride chosen so that clip_len samples span (almost) the whole video.
    frame_sample_rate = int((seg_len - 1) / clip_len)
    print(f"frame_sample_rate : {frame_sample_rate}")
    print(f"seg_len : {seg_len}")

    # sample clip_len frames
    indices = sample_frame_indices(
        clip_len=clip_len, frame_sample_rate=frame_sample_rate,
        seg_len=seg_len
    )
    video = read_video_pyav(container, indices)

# Re-created here so the cell can run on its own.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# `padding=True` is for the text prompts, which differ in length.
inputs = processor(
    text=label,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

# Indices into `label`: ground truth vs. highest-probability prompt.
print("arson true : 2")
print(f"arson pred : {torch.argmax(probs).item()}")

frame_sample_rate : 277
seg_len : 4439


Unused or unrecognized kwargs: padding.


tensor([[0.0051, 0.1894, 0.1708, 0.0180, 0.1055, 0.0225, 0.0155, 0.0359, 0.0956,
         0.0181, 0.0931, 0.0118, 0.2187]])
arson true : 2
arson pred : 12


### Assault

In [6]:
np.random.seed(seed)  # re-seed so the sampled frame window is reproducible

# UCF-Crime sample for the "assault" class. Its length is read from the
# container below rather than assumed.
file_path = "data/UCF-Crime/Assault002_x264.mp4"

# The context manager closes the file once the frames have been decoded.
with av.open(file_path) as container:
    # NOTE(review): `frames` is stream metadata and may be 0 for some files — confirm.
    seg_len = container.streams.video[0].frames

    # Stride chosen so that clip_len samples span (almost) the whole video.
    frame_sample_rate = int((seg_len - 1) / clip_len)
    print(f"frame_sample_rate : {frame_sample_rate}")
    print(f"seg_len : {seg_len}")

    # sample clip_len frames
    indices = sample_frame_indices(
        clip_len=clip_len, frame_sample_rate=frame_sample_rate,
        seg_len=seg_len
    )
    video = read_video_pyav(container, indices)

# Re-created here so the cell can run on its own.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# `padding=True` is for the text prompts, which differ in length.
inputs = processor(
    text=label,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

# Indices into `label`: ground truth vs. highest-probability prompt.
# NOTE(review): "assualt" is a misspelling of "assault" in the printed text;
# left unchanged so the emitted output stays stable.
print("assualt true : 3")
print(f"assualt pred : {torch.argmax(probs).item()}")

frame_sample_rate : 157
seg_len : 2523


Unused or unrecognized kwargs: padding.


tensor([[0.0862, 0.5320, 0.0077, 0.0564, 0.0108, 0.0014, 0.0069, 0.0102, 0.0744,
         0.0052, 0.1991, 0.0041, 0.0056]])
assualt true : 3
assualt pred : 1


### Burglary

In [7]:
np.random.seed(seed)  # re-seed so the sampled frame window is reproducible

# UCF-Crime sample for the "burglary" class. Its length is read from the
# container below rather than assumed.
file_path = "data/UCF-Crime/Burglary003_x264.mp4"

# The context manager closes the file once the frames have been decoded.
with av.open(file_path) as container:
    # NOTE(review): `frames` is stream metadata and may be 0 for some files — confirm.
    seg_len = container.streams.video[0].frames

    # Stride chosen so that clip_len samples span (almost) the whole video.
    frame_sample_rate = int((seg_len - 1) / clip_len)
    print(f"frame_sample_rate : {frame_sample_rate}")
    print(f"seg_len : {seg_len}")

    # sample clip_len frames
    indices = sample_frame_indices(
        clip_len=clip_len, frame_sample_rate=frame_sample_rate,
        seg_len=seg_len
    )
    video = read_video_pyav(container, indices)

# Re-created here so the cell can run on its own.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# `padding=True` is for the text prompts, which differ in length.
inputs = processor(
    text=label,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

# Indices into `label`: ground truth vs. highest-probability prompt.
print("burglary true : 4")
print(f"burglary pred : {torch.argmax(probs).item()}")

frame_sample_rate : 73
seg_len : 1173


Unused or unrecognized kwargs: padding.


tensor([[0.0217, 0.1391, 0.0221, 0.0143, 0.0749, 0.2172, 0.0723, 0.0747, 0.0420,
         0.0612, 0.0381, 0.1763, 0.0462]])
burglary true : 4
burglary pred : 5


### Explosion

In [8]:
np.random.seed(seed)  # re-seed so the sampled frame window is reproducible

# UCF-Crime sample for the "explosion" class. Its length is read from the
# container below rather than assumed.
file_path = "data/UCF-Crime/Explosion003_x264.mp4"

# The context manager closes the file once the frames have been decoded.
with av.open(file_path) as container:
    # NOTE(review): `frames` is stream metadata and may be 0 for some files — confirm.
    seg_len = container.streams.video[0].frames

    # Stride chosen so that clip_len samples span (almost) the whole video.
    frame_sample_rate = int((seg_len - 1) / clip_len)
    print(f"frame_sample_rate : {frame_sample_rate}")
    print(f"seg_len : {seg_len}")

    # sample clip_len frames
    indices = sample_frame_indices(
        clip_len=clip_len, frame_sample_rate=frame_sample_rate,
        seg_len=seg_len
    )
    video = read_video_pyav(container, indices)

# Re-created here so the cell can run on its own.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# `padding=True` is for the text prompts, which differ in length.
inputs = processor(
    text=label,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

# Indices into `label`: ground truth vs. highest-probability prompt.
print("explosion true : 5")
print(f"explosion pred : {torch.argmax(probs).item()}")

frame_sample_rate : 35
seg_len : 576


Unused or unrecognized kwargs: padding.


tensor([[0.0098, 0.0266, 0.0051, 0.0045, 0.0035, 0.0067, 0.0123, 0.8523, 0.0114,
         0.0154, 0.0345, 0.0056, 0.0123]])
explosion true : 5
explosion pred : 7


### Fighting

In [9]:
np.random.seed(seed)  # re-seed so the sampled frame window is reproducible

# UCF-Crime sample for the "fighting" class. Its length is read from the
# container below rather than assumed.
file_path = "data/UCF-Crime/Fighting004_x264.mp4"

# The context manager closes the file once the frames have been decoded.
with av.open(file_path) as container:
    # NOTE(review): `frames` is stream metadata and may be 0 for some files — confirm.
    seg_len = container.streams.video[0].frames

    # Stride chosen so that clip_len samples span (almost) the whole video.
    frame_sample_rate = int((seg_len - 1) / clip_len)
    print(f"frame_sample_rate : {frame_sample_rate}")
    print(f"seg_len : {seg_len}")

    # sample clip_len frames
    indices = sample_frame_indices(
        clip_len=clip_len, frame_sample_rate=frame_sample_rate,
        seg_len=seg_len
    )
    video = read_video_pyav(container, indices)

# Re-created here so the cell can run on its own.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# `padding=True` is for the text prompts, which differ in length.
inputs = processor(
    text=label,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

# Indices into `label`: ground truth vs. highest-probability prompt.
print("fighting true : 6")
print(f"fighting pred : {torch.argmax(probs).item()}")

frame_sample_rate : 1048
seg_len : 16777


Unused or unrecognized kwargs: padding.


tensor([[0.1926, 0.3968, 0.0158, 0.1336, 0.0446, 0.0075, 0.0157, 0.0064, 0.0325,
         0.0293, 0.1014, 0.0167, 0.0071]])
fighting true : 6
fighting pred : 1


### Road Accident

In [10]:
np.random.seed(seed)  # re-seed so the sampled frame window is reproducible

# UCF-Crime sample for the "road accident" class. Its length is read from the
# container below rather than assumed.
file_path = "data/UCF-Crime/RoadAccidents009_x264.mp4"

# The context manager closes the file once the frames have been decoded.
with av.open(file_path) as container:
    # NOTE(review): `frames` is stream metadata and may be 0 for some files — confirm.
    seg_len = container.streams.video[0].frames

    # Stride chosen so that clip_len samples span (almost) the whole video.
    frame_sample_rate = int((seg_len - 1) / clip_len)
    print(f"frame_sample_rate : {frame_sample_rate}")
    print(f"seg_len : {seg_len}")

    # sample clip_len frames
    indices = sample_frame_indices(
        clip_len=clip_len, frame_sample_rate=frame_sample_rate,
        seg_len=seg_len
    )
    video = read_video_pyav(container, indices)

# Re-created here so the cell can run on its own.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# `padding=True` is for the text prompts, which differ in length.
inputs = processor(
    text=label,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

# Indices into `label`: ground truth vs. highest-probability prompt.
print("road accident true : 7")
print(f"road accident pred : {torch.argmax(probs).item()}")

frame_sample_rate : 57
seg_len : 918


Unused or unrecognized kwargs: padding.


tensor([[0.0766, 0.0292, 0.0179, 0.0348, 0.0336, 0.0792, 0.0272, 0.2805, 0.0070,
         0.0104, 0.0187, 0.0147, 0.3701]])
road accident true : 7
road accident pred : 12


### Robbery

In [11]:
np.random.seed(seed)  # re-seed so the sampled frame window is reproducible

# UCF-Crime sample for the "robbery" class. Its length is read from the
# container below rather than assumed.
file_path = "data/UCF-Crime/Robbery004_x264.mp4"

# The context manager closes the file once the frames have been decoded.
with av.open(file_path) as container:
    # NOTE(review): `frames` is stream metadata and may be 0 for some files — confirm.
    seg_len = container.streams.video[0].frames

    # Stride chosen so that clip_len samples span (almost) the whole video.
    frame_sample_rate = int((seg_len - 1) / clip_len)
    print(f"frame_sample_rate : {frame_sample_rate}")
    print(f"seg_len : {seg_len}")

    # sample clip_len frames
    indices = sample_frame_indices(
        clip_len=clip_len, frame_sample_rate=frame_sample_rate,
        seg_len=seg_len
    )
    video = read_video_pyav(container, indices)

# Re-created here so the cell can run on its own.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# `padding=True` is for the text prompts, which differ in length.
inputs = processor(
    text=label,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

# Indices into `label`: ground truth vs. highest-probability prompt.
print("robbery true : 8")
print(f"robbery pred : {torch.argmax(probs).item()}")

frame_sample_rate : 95
seg_len : 1529


Unused or unrecognized kwargs: padding.


tensor([[1.9307e-03, 2.3960e-02, 8.9805e-04, 1.6033e-02, 2.0826e-02, 4.7560e-04,
         1.5415e-03, 6.3453e-04, 5.0097e-01, 7.7569e-04, 4.1898e-01, 8.5359e-03,
         4.4376e-03]])
robbery true : 8
robbery pred : 8


### Shooting

In [12]:
np.random.seed(seed)  # re-seed so the sampled frame window is reproducible

# UCF-Crime sample for the "shooting" class. Its length is read from the
# container below rather than assumed.
file_path = "data/UCF-Crime/Shooting001_x264.mp4"

# The context manager closes the file once the frames have been decoded.
with av.open(file_path) as container:
    # NOTE(review): `frames` is stream metadata and may be 0 for some files — confirm.
    seg_len = container.streams.video[0].frames

    # Stride chosen so that clip_len samples span (almost) the whole video.
    frame_sample_rate = int((seg_len - 1) / clip_len)
    print(f"frame_sample_rate : {frame_sample_rate}")
    print(f"seg_len : {seg_len}")

    # sample clip_len frames
    indices = sample_frame_indices(
        clip_len=clip_len, frame_sample_rate=frame_sample_rate,
        seg_len=seg_len
    )
    video = read_video_pyav(container, indices)

# Re-created here so the cell can run on its own.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# `padding=True` is for the text prompts, which differ in length.
inputs = processor(
    text=label,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

# Indices into `label`: ground truth vs. highest-probability prompt.
print("shooting true : 9")
print(f"shooting pred : {torch.argmax(probs).item()}")

frame_sample_rate : 15
seg_len : 253


Unused or unrecognized kwargs: padding.


tensor([[0.0397, 0.0452, 0.0048, 0.0844, 0.0203, 0.0041, 0.0151, 0.0104, 0.0505,
         0.0048, 0.7141, 0.0014, 0.0052]])
shooting true : 9
shooting pred : 10


### Shoplifting

In [13]:
np.random.seed(seed)  # re-seed so the sampled frame window is reproducible

# UCF-Crime sample for the "shoplifting" class. Its length is read from the
# container below rather than assumed.
file_path = "data/UCF-Crime/Shoplifting001_x264.mp4"

# The context manager closes the file once the frames have been decoded.
with av.open(file_path) as container:
    # NOTE(review): `frames` is stream metadata and may be 0 for some files — confirm.
    seg_len = container.streams.video[0].frames

    # Stride chosen so that clip_len samples span (almost) the whole video.
    frame_sample_rate = int((seg_len - 1) / clip_len)
    print(f"frame_sample_rate : {frame_sample_rate}")
    print(f"seg_len : {seg_len}")

    # sample clip_len frames
    indices = sample_frame_indices(
        clip_len=clip_len, frame_sample_rate=frame_sample_rate,
        seg_len=seg_len
    )
    video = read_video_pyav(container, indices)

# Re-created here so the cell can run on its own.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# `padding=True` is for the text prompts, which differ in length.
inputs = processor(
    text=label,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

# Indices into `label`: ground truth vs. highest-probability prompt.
print("shoplifting true : 10")
print(f"shoplifting pred : {torch.argmax(probs).item()}")

frame_sample_rate : 271
seg_len : 4344


Unused or unrecognized kwargs: padding.


tensor([[0.0089, 0.0119, 0.0029, 0.0050, 0.0328, 0.0058, 0.0030, 0.0057, 0.0587,
         0.0150, 0.8334, 0.0148, 0.0019]])
shoplifting true : 10
shoplifting pred : 10


### Stealing

In [14]:
np.random.seed(seed)  # re-seed so the sampled frame window is reproducible

# UCF-Crime sample for the "stealing" class. Its length is read from the
# container below rather than assumed.
file_path = "data/UCF-Crime/Stealing006_x264.mp4"

# The context manager closes the file once the frames have been decoded.
with av.open(file_path) as container:
    # NOTE(review): `frames` is stream metadata and may be 0 for some files — confirm.
    seg_len = container.streams.video[0].frames

    # Stride chosen so that clip_len samples span (almost) the whole video.
    frame_sample_rate = int((seg_len - 1) / clip_len)
    print(f"frame_sample_rate : {frame_sample_rate}")
    print(f"seg_len : {seg_len}")

    # sample clip_len frames
    indices = sample_frame_indices(
        clip_len=clip_len, frame_sample_rate=frame_sample_rate,
        seg_len=seg_len
    )
    video = read_video_pyav(container, indices)

# Re-created here so the cell can run on its own.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# `padding=True` is for the text prompts, which differ in length.
inputs = processor(
    text=label,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

# Indices into `label`: ground truth vs. highest-probability prompt.
print("stealing true : 11")
print(f"stealing pred : {torch.argmax(probs).item()}")

frame_sample_rate : 201
seg_len : 3223


Unused or unrecognized kwargs: padding.


tensor([[0.0422, 0.0324, 0.0026, 0.0094, 0.0848, 0.0014, 0.0146, 0.6209, 0.0260,
         0.0168, 0.0803, 0.0090, 0.0598]])
stealing true : 11
stealing pred : 7


### Vandalism

In [15]:
np.random.seed(seed)  # re-seed so the sampled frame window is reproducible

# UCF-Crime sample for the "vandalism" class. Its length is read from the
# container below rather than assumed.
file_path = "data/UCF-Crime/Vandalism004_x264.mp4"

# The context manager closes the file once the frames have been decoded.
with av.open(file_path) as container:
    # NOTE(review): `frames` is stream metadata and may be 0 for some files — confirm.
    seg_len = container.streams.video[0].frames

    # Stride chosen so that clip_len samples span (almost) the whole video.
    frame_sample_rate = int((seg_len - 1) / clip_len)
    print(f"frame_sample_rate : {frame_sample_rate}")
    print(f"seg_len : {seg_len}")

    # sample clip_len frames
    indices = sample_frame_indices(
        clip_len=clip_len, frame_sample_rate=frame_sample_rate,
        seg_len=seg_len
    )
    video = read_video_pyav(container, indices)

# Re-created here so the cell can run on its own.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# `padding=True` is for the text prompts, which differ in length.
inputs = processor(
    text=label,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

# Indices into `label`: ground truth vs. highest-probability prompt.
print("vandalism true : 12")
print(f"vandalism pred : {torch.argmax(probs).item()}")

frame_sample_rate : 176
seg_len : 2820


Unused or unrecognized kwargs: padding.


tensor([[0.0113, 0.0233, 0.0034, 0.0066, 0.0362, 0.0077, 0.0220, 0.3574, 0.1210,
         0.0249, 0.0708, 0.0649, 0.2503]])
vandalism true : 12
vandalism pred : 7
