In [1]:
import os
import torch
import numpy as np
from torch.utils.data import DataLoader
import random
import shutil

from algorithms.feature_extraction_loading import FeatureDataset, extract_diffusion_features, feature_collate_fn, concatenate_video_features
from evaluation.visualization import safe_heatmap_as_gif, place_marker_in_frames

from evaluation.evaluation_datasets import compute_tapvid_metrics

from algorithms.heatmap_generator import HeatmapGenerator
from algorithms.zero_shot_tracker import ZeroShotTracker

heatmap_generator = HeatmapGenerator()
zero_shot_tracker = ZeroShotTracker()

  from .autonotebook import tqdm as notebook_tqdm
2024-06-05 00:35:03.300186: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-05 00:35:03.340698: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#extract_diffusion_features(input_dataset_paths={'tapvid_davis': '../tapvid_davis/tapvid_davis.pkl'}, diffusion_model_path='../text-to-video-ms-1.7b/', restrict_frame_size=False, max_frame_size=2**18)

In [3]:
mode = "pca"

feature_dataset = FeatureDataset(feature_dataset_path='output/features/davis')
feature_loader = DataLoader(feature_dataset, batch_size=1, collate_fn=feature_collate_fn)

video_idx = 0

tracking_accuracy = []

for batch in feature_loader:
    batch_query_points = []
    batch_gt_occluded = []
    batch_gt_tracks = []
    batch_pred_tracks = []

    sample = batch[0]
        
    if mode == "pca":
        #concat_downblock = concatenate_video_features(
        #    {
        #        'down_block': sample['features']['down_block'][:]
        #    },
        #    perform_pca = True,
        #    n_components = 10
        #)
        #concat_midblock = concatenate_video_features(
        #    {
        #        'mid_block': sample['features']['mid_block'][:]
        #    },
        #    perform_pca = True,
        #    n_components = 10
        #)
        concat_upblock = concatenate_video_features(
            {
                'up_block': sample['features']['up_block'][0:3]
            },
            perform_pca = True,
            n_components = 20
        )
        concat_decoderblock = concatenate_video_features(
            {
                'decoder_block': sample['features']['decoder_block'][1:2]
            },
            perform_pca = True,
            n_components = 10
        )
    
    elif mode == "pooling":
        concat_downblock = concatenate_video_features(
            {
                'down_block': sample['features']['down_block'][:]
            },
            perform_pooling = True
        )
        concat_midblock = concatenate_video_features(
            {
                'mid_block': sample['features']['mid_block'][:]
            },
            perform_pooling = True
        )
        concat_upblock = concatenate_video_features(
            {
                'up_block': sample['features']['up_block'][:]
            },
            perform_pooling = True
        )
        concat_decoderblock = concatenate_video_features(
            {
                'decoder_block': sample['features']['decoder_block'][:]
            },
            perform_pooling = True
        )

    video_features = concatenate_video_features(
        {
            #'down_block': [concat_downblock],
            #'mid_block': [concat_midblock],
            'up_block': [concat_upblock],
            'decoder_block': [concat_decoderblock],
        }
    )

    query_points = torch.cat([torch.from_numpy(p).unsqueeze(0) for p in sample['query_points'][0]], dim=0)

    batch_query_points.append(query_points)

    occluded = sample['occluded'][0]
    batch_gt_occluded.append(occluded)

    gt_track = sample['target_points'][0]
    batch_gt_tracks.append(gt_track)

    folder_path = os.path.join('output', 'video_' + str(video_idx))
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
    os.makedirs(folder_path)
    query_point_file_name = os.path.join(folder_path, 'query_point.txt')
    with open(query_point_file_name, 'w') as query_point_file:
        query_point_file.write(str(query_points))
        query_point_file.close()

    heatmaps = heatmap_generator.generate(video_features, query_points, device="cpu")
    pred_track = zero_shot_tracker.track(heatmaps)

    batch_pred_tracks.append(pred_track[..., [1,0]])
    gt_track = gt_track[..., [1, 0]]

    place_marker_in_frames(frames=sample['video'].squeeze(), tracks=pred_track[0].unsqueeze(0), occluded=occluded[0][None], ground_truth_tracks=gt_track[0], folder_path=folder_path)
    safe_heatmap_as_gif(heatmaps, True, sample['video'].squeeze(), folder_path=folder_path)

    video_idx += 1

    metrics = compute_tapvid_metrics(query_points=np.array(batch_query_points), gt_occluded=np.array(batch_gt_occluded), gt_tracks=np.array(batch_gt_tracks), pred_occluded=np.array(batch_gt_occluded), pred_tracks=np.array(batch_pred_tracks), query_mode='strided')
    print(metrics)

    metrics_file_name = os.path.join(folder_path, 'metrics.txt')
    with open(metrics_file_name, 'w') as metrics_file:
        metrics_file.write(str(metrics))

    tracking_accuracy.append(metrics['average_pts_within_thresh'])

print(np.mean(tracking_accuracy))

    

{'occlusion_accuracy': array([1.]), 'pts_within_1': array([0.0652232]), 'jaccard_1': array([0.03371097]), 'pts_within_2': array([0.20707476]), 'jaccard_2': array([0.11549548]), 'pts_within_4': array([0.45380023]), 'jaccard_4': array([0.29349392]), 'pts_within_8': array([0.65962755]), 'jaccard_8': array([0.49212258]), 'pts_within_16': array([0.78027265]), 'jaccard_16': array([0.63971072]), 'average_jaccard': array([0.31490673]), 'average_pts_within_thresh': array([0.43319968])}
0.4331996792301524


In [4]:
# Check mean and max of featuremaps before pca

featuremapssss = sample["features"]
print(featuremapssss.keys())

for (key, value) in sample["features"].items():
    concat_block = concatenate_video_features(
        {
            'x': value[:]
        }
    )
    print(key)
    print("avg:")
    print(torch.mean(concat_block))
    print("max:")
    print(torch.max(concat_block))
    
        

dict_keys(['up_block', 'down_block', 'mid_block', 'decoder_block'])
up_block
avg:
tensor(-0.1026, dtype=torch.float16)
max:
tensor(305., dtype=torch.float16)
down_block
avg:
tensor(-0.2408, dtype=torch.float16)
max:
tensor(70.6250, dtype=torch.float16)
mid_block
avg:
tensor(-0.2261, dtype=torch.float16)
max:
tensor(78.2500, dtype=torch.float16)
decoder_block
avg:
tensor(0.4607, dtype=torch.float16)
max:
tensor(6612., dtype=torch.float16)


In [5]:
## Use decoder for upsampling

feature_dataset = FeatureDataset(feature_dataset_path='output/features/davis')
feature_loader = DataLoader(feature_dataset, batch_size=1, collate_fn=feature_collate_fn)

video_idx = 0

for batch in feature_loader:
    query_points = []
    gt_occluded = []
    gt_tracks = []
    pred_tracks = []

    for sample in batch:
          
        concat_upblock = concatenate_video_features(
            {
                'up_block': sample['features']['up_block'][:]
            },
            perform_pca = True,
            n_components = 4
        )

        ###
        HIER DECODER REINLADEN UND EIN FORWARDPASS
        ###

        #idx = random.randint(0, len(sample['query_points'][0]) - 1)
        query_points = sample['query_points'][0][idx]
        query_points.append(query_points[None, :])

        occluded = sample['occluded'][0, idx]
        gt_occluded.append(occluded[None])

        gt_track = sample['target_points'][0, idx]
        gt_tracks.append(gt_track[None])

        folder_path = os.path.join('output', 'video_' + str(video_idx))
        if os.path.exists(folder_path):
            shutil.rmtree(folder_path)
        os.makedirs(folder_path)
        query_point_file_name = os.path.join(folder_path, 'query_point.txt')
        with open(query_point_file_name, 'w') as query_point_file:
            query_point_file.write(str(query_points))

        target = torch.tensor([[int(query_points[0]), query_points[1], query_points[2]]]) # Targets are now tensor in shape Nx3
        heatmaps = heatmap_generator.generate(video_features, target, device="cpu")

        pred_track = zero_shot_tracker.track(heatmaps)

        pred_tracks.append(pred_track.numpy()[None])

        gt_track_switched = np.zeros_like(gt_track)
        gt_track_switched[:, 1] = gt_track[:, 0]
        gt_track_switched[:, 0] = gt_track[:, 1]

        v.place_marker_in_frames(sample['video'].squeeze(), pred_track, ground_truth_tracks=gt_track_switched, folder_path=folder_path)
        v.safe_heatmap_as_gif(heatmaps, True, sample['video'].squeeze(), folder_path=folder_path)

        video_idx += 1

    #metrics = compute_tapvid_metrics(query_points=np.array(query_points), gt_occluded=np.array(gt_occluded), gt_tracks=np.array(gt_tracks), pred_occluded=np.array(gt_occluded), pred_tracks=np.array(pred_tracks), query_mode='strided')

    #print(metrics)

SyntaxError: invalid syntax (489999786.py, line 25)