In [1]:
import numpy as np
import pandas as pd
import random
import sys

sys.path.append("/kaggle/input/dust3r-package")
sys.path.append("/kaggle/input/module-roma")
sys.path.append("/kaggle/input/einops/einops-master")
sys.path.append("/kaggle/input/trimesh-for-kaggle")

from dust3r.inference import inference
from dust3r.model import AsymmetricCroCo3DStereo
from dust3r.utils.image import load_images
from dust3r.image_pairs import make_pairs
from dust3r.cloud_opt import global_aligner, GlobalAlignerMode

import gc



In [2]:
def arr_to_str(a):
    return ';'.join([str(x) for x in a.reshape(-1)])

In [3]:
src = '/kaggle/input/image-matching-challenge-2024'

# Get data from csv.
data_dict = {}
with open(f'{src}/sample_submission.csv', 'r') as f:
    for i, l in enumerate(f):
        # Skip header.
        if l and i > 0:
            image, dataset, scene, _, _ = l.strip().split(',')
            if dataset not in data_dict:
                data_dict[dataset] = {}
            if scene not in data_dict[dataset]:
                data_dict[dataset][scene] = []
            data_dict[dataset][scene].append(image)
                    
for dataset in data_dict:
    for scene in data_dict[dataset]:
        print(f'{dataset} / {scene} -> {len(data_dict[dataset][scene])} images')

church / church -> 41 images


In [4]:
device = 'cuda'
batch_size = 1
schedule = 'cosine'
lr = 0.01
niter = 1000
max_num_pairs = 300

def swin(dataset):
    if dataset in ["lizard", "pond"]:
        return "swin-1"
    else:
        return "swin"

model_path = "/kaggle/input/dust3r/pytorch/512_dpt/1/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth"

In [5]:
result = {}
for dataset in data_dict.keys():
    if dataset not in result.keys():
        result[dataset] = {}
    print("dataset:", dataset)
    for scene in data_dict[dataset].keys():
        if scene not in result[dataset].keys():
            result[dataset][scene] = {}
        print("scene:", scene)

        img_names = data_dict[dataset][scene]
        img_fnames = [f'{src}/{x}' for x in img_names]
        print (f"Got {len(img_fnames)} images")
        
        try:
            # you can put the path to a local checkpoint in model_name if needed
            model = AsymmetricCroCo3DStereo.from_pretrained(model_path).to(device)

            # load_images can take a list of images or a directory
            images = load_images(img_fnames, size=512)
            pairs = make_pairs(images, scene_graph=swin(dataset), prefilter=None, symmetrize=True)
            paris = random.sample(pairs, min(max_num_pairs, len(pairs)))
            output = inference(pairs, model, device, batch_size=batch_size)

            # at this stage, you have the raw dust3r predictions
            view1, pred1 = output['view1'], output['pred1']
            view2, pred2 = output['view2'], output['pred2']

            output = global_aligner(output, device=device, mode=GlobalAlignerMode.PointCloudOptimizer)
            loss = output.compute_global_alignment(init="mst", niter=niter, schedule=schedule, lr=lr)

            # retrieve useful values from scene:
            # imgs = output.imgs
            # focals = output.get_focals()
            poses = output.get_im_poses()
            # pts3d = output.get_pts3d()
            # confidence_masks = output.get_masks()

            for img_name, pose in zip(img_names, poses):
                rotation_matrix,  = pose[:3,:3].cpu().detach().numpy(), 
                translation_vector = pose[:3,3].cpu().detach().numpy()
                result[dataset][scene][img_name] = (rotation_matrix, translation_vector)

            # visualize reconstruction
            # output.show()

            del model
            del images
            del output

            gc.collect()
            
        except Exception as e:
            print(e)

dataset: church
scene: church
Got 41 images
... loading model from /kaggle/input/dust3r/pytorch/512_dpt/1/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth
instantiating : AsymmetricCroCo3DStereo(enc_depth=24, dec_depth=12, enc_embed_dim=1024, dec_embed_dim=768, enc_num_heads=16, dec_num_heads=12, pos_embed='RoPE100', patch_embed_cls='PatchEmbedDust3R', img_size=(512, 512), head_type='dpt', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), landscape_only=False)
<All keys matched successfully>
>> Loading a list of 41 images
 - adding /kaggle/input/image-matching-challenge-2024/test/church/images/00046.png with resolution 768x1024 --> 384x512
 - adding /kaggle/input/image-matching-challenge-2024/test/church/images/00090.png with resolution 768x1024 --> 384x512
 - adding /kaggle/input/image-matching-challenge-2024/test/church/images/00092.png with resolution 768x1024 --> 384x512
 - adding /kaggle/input/image-matching-challenge-2024/test/church/images/00087.png with reso

100%|██████████| 246/246 [01:35<00:00,  2.57it/s]


 init edge (23*,25*) score=348.2974853515625
 init edge (22*,25) score=101.68627166748047
 init edge (26*,25) score=53.23867416381836
 init edge (23,21*) score=40.575191497802734
 init edge (27*,25) score=17.983867645263672
 init edge (24*,25) score=17.73099708557129
 init edge (27,29*) score=13.156030654907227
 init edge (29,28*) score=10.342477798461914
 init edge (20*,22) score=8.830647468566895
 init edge (22,19*) score=7.445827484130859
 init edge (18*,21) score=6.624990940093994
 init edge (16*,18) score=3.7041454315185547
 init edge (29,30*) score=23.824005126953125
 init edge (30,33*) score=16.5194034576416
 init edge (17*,20) score=11.792767524719238
 init edge (34*,33) score=63.98694610595703
 init edge (17,15*) score=56.947811126708984
 init edge (34,37*) score=52.91121292114258
 init edge (37,35*) score=28.890911102294922
 init edge (39*,37) score=27.79238510131836
 init edge (39,40*) score=12.79019546508789
 init edge (32*,35) score=10.68260383605957
 init edge (36*,35) sc

100%|██████████| 1000/1000 [09:49<00:00,  1.70it/s, lr=1.02467e-06 loss=0.00516378]


In [6]:
with open(f'submission.csv', 'w') as f:
    f.write('image_path,dataset,scene,rotation_matrix,translation_vector\n')
    for dataset in data_dict:
        for scene in data_dict[dataset]:
            for image in data_dict[dataset][scene]:
                if image in result[dataset][scene].keys():
                    print(image)
                    rotation_matrix = result[dataset][scene][image][0]
                    translation_vector = result[dataset][scene][image][1]
                else:
                    rotation_matrix = np.eye(3)
                    translation_vector = np.zeros((3,1))
                f.write(f'{image},{dataset},{scene},{arr_to_str(rotation_matrix)},{arr_to_str(translation_vector)}\n')

test/church/images/00046.png
test/church/images/00090.png
test/church/images/00092.png
test/church/images/00087.png
test/church/images/00050.png
test/church/images/00068.png
test/church/images/00083.png
test/church/images/00096.png
test/church/images/00069.png
test/church/images/00081.png
test/church/images/00042.png
test/church/images/00018.png
test/church/images/00030.png
test/church/images/00024.png
test/church/images/00032.png
test/church/images/00026.png
test/church/images/00037.png
test/church/images/00008.png
test/church/images/00035.png
test/church/images/00021.png
test/church/images/00010.png
test/church/images/00039.png
test/church/images/00011.png
test/church/images/00013.png
test/church/images/00006.png
test/church/images/00012.png
test/church/images/00029.png
test/church/images/00001.png
test/church/images/00098.png
test/church/images/00072.png
test/church/images/00066.png
test/church/images/00104.png
test/church/images/00058.png
test/church/images/00059.png
test/church/im

In [7]:
!cat submission.csv

image_path,dataset,scene,rotation_matrix,translation_vector
test/church/images/00046.png,church,church,-0.19619188;0.38640407;0.9012218;-0.11904976;0.90289843;-0.41303957;-0.97331184;-0.18832524;-0.13113996,-0.3604628;0.09148759;0.4396099
test/church/images/00090.png,church,church,0.9738032;0.062767416;0.21855822;0.014381669;0.9422241;-0.3346746;-0.22693747;0.32905042;0.9166381,0.019109283;0.0504517;0.13774267
test/church/images/00092.png,church,church,0.9383378;0.10269065;0.33011627;-0.00021410361;0.9550392;-0.2964795;-0.34571964;0.27812722;0.8961714,0.023552695;0.031003982;0.17744933
test/church/images/00087.png,church,church,0.88997823;0.15669504;0.42823538;-0.01818049;0.9505524;-0.31003183;-0.45564058;0.26813602;0.8488197,0.006465416;0.037059776;0.18463089
test/church/images/00050.png,church,church,0.68967044;-0.04523753;0.72270894;-0.013691146;0.9970536;0.07547523;-0.7239939;-0.061947748;0.6870191,-0.0925116;0.013238394;0.19848752
test/church/images/00068.png,church,church,0