In [None]:
!pip install https://github.com/Simon-Bertrand/FastCrossCorr-PyTorch/archive/main.zip

In [84]:
import torch
import torch_crosscorr
import torch_bidimcc

In [85]:
# Spatial sizes of the two operands: `a` is the large input, `b` the small one.
H, W = 1024, 1024
h, w = 256, 256

# Prefer the GPU, but fall back to the CPU so the tensors can still be created
# on a machine without CUDA. Cells that call torch.cuda.* directly are
# unaffected by this fallback.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fixed seed so that every Restart & Run All sees the same random inputs.
torch.manual_seed(0)

# Sample directly in double precision (no float32 intermediate), scaled to [0, 5).
# `a` uses (H, W) — the original used (H, H), which only worked because H == W.
a = 5 * torch.rand(8, 1, H, W, device=device, dtype=torch.double)
b = 5 * torch.rand(8, 1, h, w, device=device, dtype=torch.double)


In [86]:
# Baseline implementation from torch_crosscorr, used as the reference in every
# comparison below. It is configured with the "corr" / "fft" options,
# padding="valid", center=False and double precision.
# NOTE(review): exact meaning of the two positional options is defined by
# torch_crosscorr — confirm against its documentation.
old_cc = torch_crosscorr.FastNormalizedCrossCorrelation(
    "corr",
    "fft",
    padding="valid",
    center=False,
    dtype=torch.double,
)

In [87]:
# Candidate implementation under test: alias of torch_bidimcc.fft_cc, invoked
# as new_cc(a, b) in the comparison cells.
new_cc = torch_bidimcc.fft_cc

In [88]:
# Run both implementations on the same inputs and require matching outputs
# (torch.allclose default tolerances). On failure, report how far apart the
# two results are instead of raising a bare AssertionError.
old_cc_ans = old_cc(a, b)
new_cc_ans = new_cc(a, b)
assert torch.allclose(old_cc_ans, new_cc_ans), (
    "old_cc and new_cc disagree: "
    f"max abs diff = {(old_cc_ans - new_cc_ans).abs().max().item():.3e}"
)

In [89]:
# --- Inputs -----------------------------------------------------------------
# Each implementation gets its own pair of leaf tensors, so the .grad buffers
# of one run can never accumulate into the other's.
a1, b1, a2, b2 = (
    src.clone().detach().requires_grad_(True) for src in (a, b, a, b)
)

# --- Forward ----------------------------------------------------------------
res_old = old_cc(a1, b1)
res_new = new_cc(a2, b2)

# Sanity check: the forward results should already agree before comparing grads.
forward_close = torch.allclose(res_old, res_new, atol=1e-4)
print(f"Forward pass close? {forward_close}")

# --- Backward ---------------------------------------------------------------
# Reduce each output to a scalar with .sum() so backward() needs no explicit
# upstream gradient.
for result in (res_old, res_new):
    result.sum().backward()

# --- Gradient comparison ----------------------------------------------------
# NOTE(review): atol=1e-2 may be loose relative to the actual gradient
# magnitudes — confirm the check is not trivially satisfied.
grads_a_close = torch.allclose(a1.grad, a2.grad, atol=1e-2)
grads_b_close = torch.allclose(b1.grad, b2.grad, atol=1e-2)
print(f"Gradients for 'a' close? {grads_a_close}")
print(f"Gradients for 'b' close? {grads_b_close}")

# Only when a comparison fails, show the largest element-wise discrepancy.
if not grads_a_close:
    max_diff_a = (a1.grad - a2.grad).abs().max().item()
    print(f"Max diff 'a': {max_diff_a}")
if not grads_b_close:
    max_diff_b = (b1.grad - b2.grad).abs().max().item()
    print(f"Max diff 'b': {max_diff_b}")


Forward pass close? True
Gradients for 'a' close? True
Gradients for 'b' close? True


# Speed comparison for raw 2D cross-correlation

In [90]:
%timeit old_cc(a,b)

17.8 ms ± 75.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [91]:
%timeit new_cc(a,b)

13.7 ms ± 1.14 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Memory usage for raw 2D cross-correlation

In [92]:
# Measure the peak GPU memory reached while running old_cc once.
# Resetting the peak counter makes it start from whatever is already allocated
# (inputs and tensors kept alive by earlier cells), so the baseline is recorded
# too and the increment caused by the call itself is reported alongside it.
torch.cuda.reset_peak_memory_stats()
baseline = torch.cuda.memory_allocated()
old_cc(a, b)
peak = torch.cuda.max_memory_allocated()

print(f"Peak GPU memory usage: {peak / 1024**2:.2f} MB")
print(f"Peak above pre-call baseline: {(peak - baseline) / 1024**2:.2f} MB")

Peak GPU memory usage: 1236.22 MB


In [93]:
# Measure the peak GPU memory reached while running new_cc once.
# Resetting the peak counter makes it start from whatever is already allocated
# (inputs and tensors kept alive by earlier cells), so the baseline is recorded
# too and the increment caused by the call itself is reported alongside it.
torch.cuda.reset_peak_memory_stats()
baseline = torch.cuda.memory_allocated()
new_cc(a, b)
peak = torch.cuda.max_memory_allocated()

print(f"Peak GPU memory usage: {peak / 1024**2:.2f} MB")
print(f"Peak above pre-call baseline: {(peak - baseline) / 1024**2:.2f} MB")

Peak GPU memory usage: 896.08 MB


# Results
For double-precision inputs, `new_cc` is slightly faster and uses less GPU memory than the implementation from https://github.com/Simon-Bertrand/FastCrossCorr-PyTorch

The speed increase is smaller than expected because, in both cases, the bottleneck is the same cuFFT algorithm under the hood. The `new_cc` method processes ~1.5x fewer pixels (valid padding), which is consistent with the ~1.3x–1.4x speedup we see (allowing for some fixed overhead).
In short, `new_cc` is faster and less memory-hungry than `old_cc`, but the difference is not large.