# Cài đặt thư viện

In [2]:
# !pip install ffmpeg-python
# !pip install ftfy regex tqdm
# !pip install git+https://github.com/openai/CLIP.git
# !pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[K     |████████████████████████████████| 27.0 MB 9.5 MB/s eta 0:00:01
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0


# Import thư viện

In [3]:
import os
import cv2
import json
import clip
import ffmpeg
import torch
import faiss
import numpy as np
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
from os.path import join as osp
import glob
import shutil

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
INPUT_PATH = "/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/caption"
OUTPUT_PATH = "/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/retrieval/caption"
DATA_TYPE = ["train", "val", "test"]
SUBFOLDER = "*"

for data_type in DATA_TYPE:
    for video in glob.glob(f"{INPUT_PATH}/{data_type}/*/*/*.json"):
        print(video)
        video_name = video.split("/")[-1]
        os.makedirs(f"{OUTPUT_PATH}/{data_type}", exist_ok=True)
        if not os.path.isfile(f"{OUTPUT_PATH}/{data_type}/{video_name}"):
            shutil.copy(video, f"{OUTPUT_PATH}/{data_type}/{video_name}")


/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/caption/train/20231006_13_SY13_T1/vehicle_view/20231006_13_SY13_T1_caption.json
/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/caption/train/20231006_13_SY13_T1/overhead_view/20231006_13_SY13_T1_caption.json
/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/caption/train/20230922_45_CN4_T1/vehicle_view/20230922_45_CN4_T1_caption.json
/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/caption/train/20230922_45_CN4_T1/overhead_view/20230922_45_CN4_T1_caption.json
/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/caption/train/20231006_26_CN15_T1/vehicle_view/20231006_26_CN15_T1_caption.json
/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/caption/train/20231006_26_CN15_T1/overhead_view/20231006_26_CN15_T1_caption.json
/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/caption/train/202

# Định nghĩa 1 số tham số cần thiết

In [4]:
root = '/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/retrieval'
#bbox_root = osp(root, 'annotations/bbox_annotated')
caption_root = osp(root, 'caption')
video_root = osp(root, 'videos')

# EDA data và trích xuất embedding

In [5]:
# Định nghĩa hàm plot data
def full_frame(width=None, height=None):
    import matplotlib as mpl
    mpl.rcParams['savefig.pad_inches'] = 0
    figsize = None if width is None else (width, height)
    fig = plt.figure(figsize=figsize)
    ax = plt.axes([0,0,1,1], frameon=False)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    plt.autoscale(tight=True)

In [6]:
# Định nghĩa dict để map giữa label số và label tên của từng phase
mapper = {
    "0":"prerecognition",
    "1":"recognition",
    "2":"judgement",
    "3":"action",
    "4":"avoidance"
    }

In [7]:
# Load model CLIP để trích xuất embedding
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-L/14@336px", device=device)

In [8]:
# Xây dựng faiss index để thực hiện truy vấn
# Ta sẽ thực hiện faiss cho từng segment của từng video
feature_shape = 768

indexes = [faiss.IndexFlatIP(feature_shape) for _ in range(5)]
id2video_caption = [[] for _ in range(5)]

In [9]:
# Visualize 1 số dữ liệu của tập train

eda_or_embed = 'embed' #@param ['eda', 'embed'] {'type':'string'}
data_type = ['train', 'val']


for typo in data_type:
  video_paths = os.listdir(osp(video_root, typo))
  for video_path in tqdm(video_paths):
      video_name = video_path[:-4]

      # Load thông tin bbox đã được đánh nhãn
      # with open(osp(bbox_root, data_type, f'{video_name}_bbox.json'), 'r') as f:
      #   bbox_annotation = json.load(f)['annotations']

      # Load thông tin caption đã được đánh nhãn
      with open(osp(caption_root, typo, f'{video_name}_caption.json'), 'r') as f:
        caption_annotation = json.load(f)

      # bounding_box_dict = dict()
      # for bb in bbox_annotation:
      #     phase_number = bb['phase_number']
      #     phase_name = mapper[str(phase_number)]
      #     bounding_box_dict[phase_name] = bb

      # Process thông tin từng phase
      phase_captions = dict()
      for e in caption_annotation['event_phase']:
          phase_name = e['labels'][0]
          phase_captions[phase_name] = dict(pedes=e['caption_pedestrian'],
                                            vehicle=e['caption_vehicle'],
                                            start_time=float(e['start_time']),
                                            end_time=float(e['end_time']))

      for phase in range(5):
          phase_name = str(phase)
          phase_anno = phase_captions[phase_name]

          # Load video của từng phase
          video_path = osp(video_root, typo, video_path)
          fps = float(cv2.VideoCapture(video_path).get(cv2.CAP_PROP_FPS))

          vcap = cv2.VideoCapture(video_path) # 0=camera
          width  = int(vcap.get(cv2.CAP_PROP_FRAME_WIDTH))   # float `width`
          height = int(vcap.get(cv2.CAP_PROP_FRAME_HEIGHT))  # float `height`

          
          try:
            # Trích xuất frame chứa thông tin bbox
            out, _ = (
                ffmpeg
                .input(video_path, ss=(phase_anno['start_time'] +  phase_anno['end_time'])/2.0)
                .output('pipe:', format='rawvideo', pix_fmt='rgb24', vframes=1, loglevel="quiet")
                .run(capture_stdout=True)
            )
            frame = np.frombuffer(out, np.uint8)
            #print(frame.shape)
            frame = np.copy(frame.reshape([height, width, 3]))
          except:
             print(video_name)
             continue

          if eda_or_embed == 'eda':
            x, y, w, h = phase_anno['bbox']
            cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)

            print(f'Phase name: {phase_name}')
            print(f"Caption Pedestrian: {phase_anno['pedes']}")
            print(f"Caption Vehicle: {phase_anno['vehicle']}")

            full_frame()
            plt.axis('off')
            plt.imshow(frame)
            plt.pause(0.1)
          else:
            pil_frame = Image.fromarray(frame)
            pil_frame = preprocess(pil_frame).unsqueeze(0).to(device)
            with torch.no_grad():
                image_feature = model.encode_image(pil_frame)

            indexes[phase].add(image_feature.cpu().numpy())
            id2video_caption[phase].append(osp(caption_root, typo, f'{video_name}_caption.json'))

      if eda_or_embed == 'eda':
        break

id2video_caption = [dict(enumerate(e)) for e in id2video_caption]

 19%|█▉        | 560/2969 [07:41<38:19,  1.05it/s] 

20230728_36_SN27_T1_vehicle_view


 22%|██▏       | 649/2969 [08:51<24:43,  1.56it/s]

video4645


 26%|██▌       | 762/2969 [10:17<29:33,  1.24it/s]

20230922_45_CN4_T1_vehicle_view


 26%|██▌       | 771/2969 [10:25<32:38,  1.12it/s]

20230728_25_SY22_T1_vehicle_view


 28%|██▊       | 829/2969 [11:11<28:03,  1.27it/s]

video1045


 34%|███▍      | 1018/2969 [13:43<26:48,  1.21it/s]

20230728_61_SN25_T1_vehicle_view


 65%|██████▍   | 1916/2969 [25:48<17:59,  1.03s/it]

20230707_31_SY20_T1_vehicle_view


 71%|███████▏  | 2116/2969 [28:29<09:24,  1.51it/s]

video7474


 82%|████████▏ | 2441/2969 [32:50<07:26,  1.18it/s]

20230728_39_SN30_T1_vehicle_view


100%|██████████| 2969/2969 [39:43<00:00,  1.25it/s]
  4%|▍         | 47/1243 [00:36<15:13,  1.31it/s][mov,mp4,m4a,3gp,3g2,mj2 @ 0x7e07140] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x1e8a5ac0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0xa24b9cc0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x91bbff40] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0xa24d2f40] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0xa24b9cc0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x91bbff40] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0xa24d2f40] moov atom not found


video8192
video8192
video8192
video8192


[mov,mp4,m4a,3gp,3g2,mj2 @ 0xa24b9cc0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x91bbff40] moov atom not found
  4%|▍         | 48/1243 [00:37<12:19,  1.62it/s]

video8192


 86%|████████▌ | 1071/1243 [14:52<02:29,  1.15it/s]

video6639


100%|██████████| 1243/1243 [17:08<00:00,  1.21it/s]


In [30]:
import pickle 
with open('retrieval.pickle', 'wb') as handle:
    pickle.dump(indexes, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('retrieval_json.pickle', 'wb') as handle:
    pickle.dump(id2video_caption, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [1]:
import pickle 

with open('retrieval.pickle', 'rb') as handle:
    id2video_caption = pickle.load(handle)

In [2]:
id2video_caption

[{0: '/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/retrieval/caption/train/video3672_caption.json',
  1: '/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/retrieval/caption/train/video5214_caption.json',
  2: '/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/retrieval/caption/train/video2639_caption.json',
  3: '/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/retrieval/caption/train/video8777_caption.json',
  4: '/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/retrieval/caption/train/video1193_caption.json',
  5: '/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/retrieval/caption/train/video1188_caption.json',
  6: '/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/retrieval/caption/train/video5843_caption.json',
  7: '/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/retrieval/caption/train/video2105

# Thực hiện Retrieval trên tập test để tạo file submit
File submission có cấu trúc như sau:


```
{
"video3334": [  ##scneario index for multiple view situations OR video name for single view "BDD_PC_5K".
        {
            "labels": [  ##segment number, this is known information will be given
                "4"
            ],
            "caption_pedestrian": "",  ##caption regarding pedestrian
            "caption_vehicle": ""      ##caption regarding vehicle
        },
        {
            "labels": [
                "3"
            ],
            "caption_pedestrian": "",
            "caption_vehicle": ""
        },
        ...
],
...
}
```


In [18]:
data_type = 'test'
video_paths = os.listdir(osp(video_root, data_type))

final_result = dict()
for video_path in tqdm(video_paths):
    video_name = video_path[:-4]

    # Load thông tin bbox đã được đánh nhãn
    # with open(osp(bbox_root, data_type, f'{video_name}_bbox.json'), 'r') as f:
    #   bbox_annotation = json.load(f)['annotations']

    # Load thông tin caption đã được đánh nhãn
    with open(osp(caption_root, data_type, f'{video_name}_caption.json'), 'r') as f:
      caption_annotation = json.load(f)

    # bounding_box_dict = dict()
    # for bb in bbox_annotation:
    #     phase_number = bb['phase_number']
    #     phase_name = mapper[str(phase_number)]
    #     bounding_box_dict[phase_name] = bb

    # Process thông tin từng phase
    phase_captions = dict()
    for e in caption_annotation['event_phase']:
        phase_name = e['labels'][0]
        phase_captions[phase_name] = dict(
                                          start_time=float(e['start_time']),
                                          end_time=float(e['end_time']))

    video_result = []
    for phase in range(5):
        phase_name = str(phase)
        try:
            phase_anno = phase_captions[phase_name]
        except:
           continue

        # Load video của từng phase
        video_path = osp(video_root, data_type, video_path)
        fps = float(cv2.VideoCapture(video_path).get(cv2.CAP_PROP_FPS))

        vcap = cv2.VideoCapture(video_path) # 0=camera
        width  = int(vcap.get(cv2.CAP_PROP_FRAME_WIDTH))   # float `width`
        height = int(vcap.get(cv2.CAP_PROP_FRAME_HEIGHT))  # float `height`

        # Trích xuất frame chứa thông tin bbox
        try:
            out, _ = (
                ffmpeg
                .input(video_path, ss=(phase_anno['start_time'] +  phase_anno['end_time'])//2.0)
                .output('pipe:', format='rawvideo', pix_fmt='rgb24', vframes=1, loglevel="quiet")
                .run(capture_stdout=True)
            )

            frame = np.frombuffer(out, np.uint8)
            frame = np.copy(frame.reshape([height, width, 3]))
            pil_frame = Image.fromarray(frame)
            pil_frame = preprocess(pil_frame).unsqueeze(0).to(device)
            with torch.no_grad():
                image_feature = model.encode_image(pil_frame)

            _, idx = indexes[phase].search(image_feature.cpu().numpy(), k=1)
            idx = int(idx[0][0])

            with open(id2video_caption[phase][idx]) as f:
                cap_searched = json.load(f)['event_phase']
            for e in cap_searched:
                if e['labels'][0] == phase_name:
                    cap1 = e['caption_pedestrian']
                    cap2 = e['caption_vehicle']
                    break
            
            phase_result = {
                "labels": [str(phase)],
                "caption_pedestrian": cap1,
                "caption_vehicle": cap2
            }
        except:
            print(video_path)
            phase_result = {
                "labels": [str(phase)],
                "caption_pedestrian": "",
                "caption_vehicle": ""
            }
        
        video_result.append(phase_result)

    final_result[video_name] = video_result

 22%|██▏       | 145/664 [02:29<08:00,  1.08it/s]

/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/retrieval/videos/test/20230728_35_SN26_T1_vehicle_view.mp4


 22%|██▏       | 147/664 [02:31<08:23,  1.03it/s]

/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/retrieval/videos/test/video9485.mp4


 70%|██████▉   | 462/664 [07:37<03:10,  1.06it/s]

/home/totuanan/Workplace/AICityChallenge2024_Track2/feat_extractor/data/retrieval/videos/test/20230728_44_CY20_T1_vehicle_view.mp4


100%|██████████| 664/664 [10:59<00:00,  1.01it/s]


In [19]:
with open('full_submission.json', "w") as outfile:
    json.dump(final_result, outfile, indent=4)

In [1]:
import json
f = open("label.json", "r")

data = json.load(f)

In [3]:
cnt = 0
label = {}
for key in data:
    if cnt == 101:
        break
    label[key] = data[key]
    cnt += 1

with open('my_label.json', "w") as outfile:
    json.dump(label, outfile, indent=4)
