diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..1436e1a
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "extension"]
+	path = extension
+	url = git@github.com:ThibaultGROUEIX/chamfer_pytorch.git
diff --git a/extension b/extension
new file mode 160000
index 0000000..719b0f1
--- /dev/null
+++ b/extension
@@ -0,0 +1 @@
+Subproject commit 719b0f1ca5ba370616cb837c03ab88d9a88173ff
diff --git a/extension/README.md b/extension/README.md
deleted file mode 100755
index 9cae8d1..0000000
--- a/extension/README.md
+++ /dev/null
@@ -1,8 +0,0 @@
-Chamfer distance timings for records
-
-| 50 epoch AE_Atlasnet_25prim      | time(s) | memory(GB) |
-| -------------------------------- | ------- | ---------- |
-| without chamfer loss (just mean) | 20.46   | 4.08       |
-| with pytorch chamfer             | 24.86   | 8.80       |
-| with CUDA chamfer                | 20.90   | 4.08       |
-
diff --git a/extension/chamfer.cu b/extension/chamfer.cu
deleted file mode 100755
index d5b886d..0000000
--- a/extension/chamfer.cu
+++ /dev/null
@@ -1,196 +0,0 @@
-
-#include <stdio.h>
-#include <ATen/ATen.h>
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-#include <vector>
-
-
-
-__global__ void NmDistanceKernel(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i){
-	const int batch=512;
-	__shared__ float buf[batch*3];
-	for (int i=blockIdx.x;ibest){
-				result[(i*n+j)]=best;
-				result_i[(i*n+j)]=best_i;
-			}
-		}
-		__syncthreads();
-		}
-	}
-}
-// int chamfer_cuda_forward(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i,float * result2,int * result2_i, cudaStream_t stream){
-int chamfer_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2){
-
-	const auto batch_size = xyz1.size(0);
-	const auto n = xyz1.size(1); //num_points point cloud A
-	const auto m = xyz2.size(1); //num_points point cloud B
-
-	NmDistanceKernel<<<dim3(32,16,1),512>>>(batch_size, n, xyz1.data<float>(), m, xyz2.data<float>(), dist1.data<float>(), idx1.data<int>());
-	NmDistanceKernel<<<dim3(32,16,1),512>>>(batch_size, m, xyz2.data<float>(), n, xyz1.data<float>(), dist2.data<float>(), idx2.data<int>());
-
-	cudaError_t err = cudaGetLastError();
-	if (err != cudaSuccess) {
-		printf("error in nnd updateOutput: %s\n", cudaGetErrorString(err));
-		//THError("aborting");
-		return 0;
-	}
-	return 1;
-
-
-}
-__global__ void NmDistanceGradKernel(int b,int n,const float * xyz1,int m,const float * xyz2,const float * grad_dist1,const int * idx1,float * grad_xyz1,float * grad_xyz2){
-	for (int i=blockIdx.x;i
-
-int chamfer_cuda_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2){
-	const auto batch_size = xyz1.size(0);
-	const auto n = xyz1.size(1);
-	const auto m = xyz2.size(1);
-
-	NmDistanceGradKernel<<<dim3(1,16,1),256>>>(batch_size,n,xyz1.data<float>(),m,xyz2.data<float>(),graddist1.data<float>(),idx1.data<int>(),gradxyz1.data<float>(),gradxyz2.data<float>());
-	NmDistanceGradKernel<<<dim3(1,16,1),256>>>(batch_size,m,xyz2.data<float>(),n,xyz1.data<float>(),graddist2.data<float>(),idx2.data<int>(),gradxyz2.data<float>(),gradxyz1.data<float>());
-
-	cudaError_t err = cudaGetLastError();
-	if (err != cudaSuccess) {
-		printf("error in nnd get grad: %s\n", cudaGetErrorString(err));
-		//THError("aborting");
-		return 0;
-	}
-	return 1;
-
-}
-
diff --git a/extension/chamfer_cuda.cpp b/extension/chamfer_cuda.cpp
deleted file mode 100755
index 67574e2..0000000
--- a/extension/chamfer_cuda.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-#include <torch/torch.h>
-#include <vector>
-
-///TMP
-//#include "common.h"
-/// NOT TMP
-
-
-int chamfer_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2);
-
-
-int chamfer_cuda_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2);
-
-
-
-
-int chamfer_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1,
-                    at::Tensor idx2) {
-    return chamfer_cuda_forward(xyz1, xyz2, dist1, dist2, idx1, idx2);
-}
-
-
-int chamfer_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1,
-                     at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2) {
-
-    return chamfer_cuda_backward(xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2);
-}
-
-
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-    m.def("forward", &chamfer_forward, "chamfer forward (CUDA)");
-    m.def("backward", &chamfer_backward, "chamfer backward (CUDA)");
-}
\ No newline at end of file
diff --git a/extension/chamfer_python.py b/extension/chamfer_python.py
deleted file mode 100644
index 3a86186..0000000
--- a/extension/chamfer_python.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import torch
-
-
-def pairwise_dist(x, y):
-    xx, yy, zz = torch.mm(x, x.t()), torch.mm(y, y.t()), torch.mm(x, y.t())
-    rx = xx.diag().unsqueeze(0).expand_as(xx)
-    ry = yy.diag().unsqueeze(0).expand_as(yy)
-    P = rx.t() + ry - 2 * zz
-    return P
-
-
-def NN_loss(x, y, dim=0):
-    dist = pairwise_dist(x, y)
-    values, indices = dist.min(dim=dim)
-    return values.mean()
-
-
-def distChamfer(a, b):
-    x, y = a, b
-    bs, num_points, points_dim = x.size()
-    xx = torch.bmm(x, x.transpose(2, 1))
-    yy = torch.bmm(y, y.transpose(2, 1))
-    zz = torch.bmm(x, y.transpose(2, 1))
-    diag_ind = torch.arange(0, num_points).type(torch.cuda.LongTensor)
-    rx = xx[:, diag_ind, diag_ind].unsqueeze(1).expand_as(xx)
-    ry = yy[:, diag_ind, diag_ind].unsqueeze(1).expand_as(yy)
-    P = rx.transpose(2, 1) + ry - 2 * zz
-    return torch.min(P, 1)[0], torch.min(P, 2)[0], torch.min(P, 1)[1], torch.min(P, 2)[1]
diff --git a/extension/dist_chamfer.py b/extension/dist_chamfer.py
deleted file mode 100755
index 3350304..0000000
--- a/extension/dist_chamfer.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import math
-from torch import nn
-from torch.autograd import Function
-import torch
-import sys
-from numbers import Number
-from collections import Set, Mapping, deque
-import chamfer
-
-
-# Chamfer's distance module @thibaultgroueix
-# GPU tensors only
-class chamferFunction(Function):
-    @staticmethod
-    def forward(ctx, xyz1, xyz2):
-        batchsize, n, _ = xyz1.size()
-        _, m, _ = xyz2.size()
-
-        dist1 = torch.zeros(batchsize, n)
-        dist2 = torch.zeros(batchsize, m)
-
-        idx1 = torch.zeros(batchsize, n).type(torch.IntTensor)
-        idx2 = torch.zeros(batchsize, m).type(torch.IntTensor)
-
-        dist1 = dist1.cuda()
-        dist2 = dist2.cuda()
-        idx1 = idx1.cuda()
-        idx2 = idx2.cuda()
-
-        chamfer.forward(xyz1, xyz2, dist1, dist2, idx1, idx2)
-        ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
-        return dist1, dist2
-
-    @staticmethod
-    def backward(ctx, graddist1, graddist2):
-        xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
-        graddist1 = graddist1.contiguous()
-        graddist2 = graddist2.contiguous()
-
-        gradxyz1 = torch.zeros(xyz1.size())
-        gradxyz2 = torch.zeros(xyz2.size())
-
-        gradxyz1 = gradxyz1.cuda()
-        gradxyz2 = gradxyz2.cuda()
-        chamfer.backward(xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2)
-        return gradxyz1, gradxyz2
-
-class chamferDist(nn.Module):
-    def __init__(self):
-        super(chamferDist, self).__init__()
-
-    def forward(self, input1, input2):
-        return chamferFunction.apply(input1, input2)
-
diff --git a/extension/dist_chamfer_idx.py b/extension/dist_chamfer_idx.py
deleted file mode 100644
index 7551bf3..0000000
--- a/extension/dist_chamfer_idx.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from torch import nn
-from torch.autograd import Function
-import torch
-import chamfer
-
-
-# Chamfer's distance module @thibaultgroueix
-# GPU tensors only
-class chamferFunction(Function):
-    @staticmethod
-    def forward(ctx, xyz1, xyz2):
-        batchsize, n, _ = xyz1.size()
-        _, m, _ = xyz2.size()
-
-        dist1 = torch.zeros(batchsize, n)
-        dist2 = torch.zeros(batchsize, m)
-
-        idx1 = torch.zeros(batchsize, n).type(torch.IntTensor)
-        idx2 = torch.zeros(batchsize, m).type(torch.IntTensor)
-
-        dist1 = dist1.cuda()
-        dist2 = dist2.cuda()
-        idx1 = idx1.cuda()
-        idx2 = idx2.cuda()
-
-        chamfer.forward(xyz1, xyz2, dist1, dist2, idx1, idx2)
-        ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
-        return dist1, dist2, idx1, idx2
-
-    @staticmethod
-    def backward(ctx, graddist1, graddist2, gradidx1, gradidx2):
-        xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
-        graddist1 = graddist1.contiguous()
-        graddist2 = graddist2.contiguous()
-
-        gradxyz1 = torch.zeros(xyz1.size())
-        gradxyz2 = torch.zeros(xyz2.size())
-
-        gradxyz1 = gradxyz1.cuda()
-        gradxyz2 = gradxyz2.cuda()
-        chamfer.backward(
-            xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2
-        )
-        return gradxyz1, gradxyz2
-
-
-class chamferDist(nn.Module):
-    def __init__(self):
-        super(chamferDist, self).__init__()
-
-    def forward(self, input1, input2):
-        return chamferFunction.apply(input1, input2)
diff --git a/extension/get_chamfer.py b/extension/get_chamfer.py
deleted file mode 100644
index 314e7da..0000000
--- a/extension/get_chamfer.py
+++ /dev/null
@@ -1,10 +0,0 @@
-def get(opt):
-    if opt.accelerated_chamfer:
-        import dist_chamfer_idx as ext
-
-        distChamfer = ext.chamferDist()
-    else:
-        import chamfer_python
-
-        distChamfer = chamfer_python.distChamfer
-    return distChamfer
diff --git a/extension/setup.py b/extension/setup.py
deleted file mode 100755
index 9055958..0000000
--- a/extension/setup.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from setuptools import setup
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension
-
-setup(
-    name='chamfer',
-    ext_modules=[
-        CUDAExtension('chamfer', [
-            'chamfer_cuda.cpp',
-            'chamfer.cu',
-        ]),
-    ],
-    cmdclass={
-        'build_ext': BuildExtension
-    })
\ No newline at end of file
diff --git a/extension/test_chamfer.py b/extension/test_chamfer.py
deleted file mode 100644
index 33f7a51..0000000
--- a/extension/test_chamfer.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import torch
-import dist_chamfer as ext
-
-distChamfer = ext.chamferDist()
-from torch.autograd import Variable
-
-
-def pairwise_dist(x, y):
-    xx, yy, zz = torch.mm(x, x.t()), torch.mm(y, y.t()), torch.mm(x, y.t())
-    rx = xx.diag().unsqueeze(0).expand_as(xx)
-    ry = yy.diag().unsqueeze(0).expand_as(yy)
-    P = rx.t() + ry - 2 * zz
-    return P
-
-
-def NN_loss(x, y, dim=0):
-    dist = pairwise_dist(x, y)
-    values, indices = dist.min(dim=dim)
-    return values.mean()
-
-
-def mydistChamfer(a, b):
-    x, y = a, b
-    bs, num_points, points_dim = x.size()
-    xx = torch.bmm(x, x.transpose(2, 1))
-    yy = torch.bmm(y, y.transpose(2, 1))
-    zz = torch.bmm(x, y.transpose(2, 1))
-    diag_ind = torch.arange(0, num_points).type(torch.cuda.LongTensor)
-    rx = xx[:, diag_ind, diag_ind].unsqueeze(1).expand_as(xx)
-    ry = yy[:, diag_ind, diag_ind].unsqueeze(1).expand_as(yy)
-    P = rx.transpose(2, 1) + ry - 2 * zz
-    return torch.min(P, 2)[0], torch.min(P, 1)[0]
-
-
-def test_chamfer():
-    distChamfer = ext.chamferDist()
-    p1 = torch.rand(4, 100, 3).cuda()
-    p2 = torch.rand(4, 100, 3).cuda()
-    points1 = Variable(p1, requires_grad=True)
-    points2 = Variable(p2)
-    dist1, dist2, = distChamfer(points1, points2)
-
-    loss = torch.sum(dist1)
-    print(loss)
-    loss.backward()
-    print(points1.grad, points2.grad)
-
-    mydist1, mydist2 = mydistChamfer(points1, points2)
-    d1 = (dist1 - mydist1) ** 2
-    d2 = (dist2 - mydist2) ** 2
-    print(d1, d2)
-    assert (
-        torch.sum(d1) + torch.sum(d2) < 0.00000001
-    ), "chamfer cuda and chamfer normal are not giving the same results"
-
-
-test_chamfer()
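
Note on the change: the files deleted above move into the `chamfer_pytorch` submodule, and the removed `get_chamfer.py` shows that the CUDA extension can always be swapped for a pure-PyTorch fallback. The sketch below restates that fallback as a self-contained function, for readers who want to check results without building the extension; the function name `dist_chamfer_python` and the support for unequal point-cloud sizes are illustrative additions here, not part of the submodule's API.

```python
import torch


def dist_chamfer_python(a, b):
    """Pure-PyTorch Chamfer distance between two batched point clouds.

    a: (bs, n, 3) tensor, b: (bs, m, 3) tensor, on any device.
    Returns per-point squared distances and nearest-neighbour indices,
    analogous to what the CUDA `chamfer` extension computes.
    """
    xx = torch.bmm(a, a.transpose(2, 1))           # (bs, n, n) Gram matrix of a
    yy = torch.bmm(b, b.transpose(2, 1))           # (bs, m, m) Gram matrix of b
    zz = torch.bmm(a, b.transpose(2, 1))           # (bs, n, m) cross inner products
    rx = xx.diagonal(dim1=1, dim2=2).unsqueeze(2)  # (bs, n, 1) squared norms |a_i|^2
    ry = yy.diagonal(dim1=1, dim2=2).unsqueeze(1)  # (bs, 1, m) squared norms |b_j|^2
    P = rx + ry - 2 * zz                           # (bs, n, m) pairwise squared distances
    dist1, idx1 = P.min(2)                         # nearest b-point for every a-point
    dist2, idx2 = P.min(1)                         # nearest a-point for every b-point
    return dist1, dist2, idx1, idx2


if __name__ == "__main__":
    p1 = torch.rand(4, 100, 3)
    p2 = torch.rand(4, 120, 3)
    d1, d2, _, _ = dist_chamfer_python(p1, p2)
    loss = d1.mean() + d2.mean()  # symmetric Chamfer loss
    print(loss)
```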