In [4]:
%pip install torch torchvision numpy matplotlib open3d transformers json5 tqdm



In [11]:
%pip install scipy



In [23]:
import os
import json
import torch
import numpy as np
import scipy.io  # Replacing h5py with scipy
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import ViTModel
from tqdm import tqdm
from PIL import Image

# Global training setup shared by the training cell below.
# cuDNN autotuning is a speed knob on its own and is independent of mixed
# precision (per the PyTorch docs it benchmarks kernels and keeps the fastest).
torch.backends.cudnn.benchmark = True  # Optimizes CUDA performance
# Gradient scaler for automatic mixed precision: the training loop uses it to
# scale the loss before backward() and to drive optimizer.step()/update().
scaler = torch.amp.GradScaler('cuda')

# Optimized Dataset
class Pix3DDataset(Dataset):
    """Image / voxel-grid pairs described by a JSON annotation file.

    The JSON file is parsed once at construction time. Each entry is indexed
    with the keys "img" and "voxel", whose values are joined onto ``root_dir``
    to locate the image file and the MATLAB ``.mat`` voxel file.
    """

    def __init__(self, json_path, root_dir, transform=None):
        """Read the annotation list and remember where the files live.

        Parameters:
            json_path: path of the JSON annotation file
            root_dir: directory the per-sample relative paths are joined onto
            transform: optional callable applied to the loaded RGB image
        """
        with open(json_path, "r") as annotations:
            self.data = json.load(annotations)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Number of annotation entries."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, voxel_tensor)`` for the entry at ``idx``."""
        entry = self.data[idx]
        img_path = os.path.join(self.root_dir, entry["img"])
        voxel_path = os.path.join(self.root_dir, entry["voxel"])

        # Decode the picture as 3-channel RGB, then run the optional transform.
        rgb = Image.open(img_path).convert("RGB")
        image = self.transform(rgb) if self.transform else rgb

        # The voxel grid is stored under the "voxel" variable of the .mat file.
        mat_contents = scipy.io.loadmat(voxel_path)
        voxel_tensor = torch.tensor(mat_contents["voxel"], dtype=torch.float32)

        return image, voxel_tensor

# Image preprocessing: resize every picture to 224x224, then convert to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ]
)

# Dataset over the Pix3D annotation file; sample paths are resolved against the root folder.
dataset = Pix3DDataset(
    json_path="/home/user/Imagin3D/pix3d.json",
    root_dir="/home/user/Imagin3D",
    transform=transform,
)

# Shuffled batches of 32, with num_workers=0 (no loader worker processes).
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=0,
)

In [24]:
# Model definition
class Simple3DModel(nn.Module):
    """Pretrained ViT encoder plus one linear layer that regresses a voxel grid.

    The first token of the encoder's last hidden state (768 features) is mapped
    by a single fully connected layer to 128*128*128 values, which are reshaped
    to one 128x128x128 grid per input image.
    """

    def __init__(self):
        super().__init__()
        # Same checkpoint the inference cell instantiates, so a state_dict saved
        # from this model loads there without key or shape mismatches.
        self.encoder = ViTModel.from_pretrained("google/vit-base-patch16-224")
        # NOTE: this layer alone holds 768 * 128**3 (about 1.6e9) weights.
        self.fc = nn.Linear(768, 128 * 128 * 128)  # Output size matches the voxel dimensions

    def forward(self, x):
        """Predict voxel grids for a batch of images.

        Parameters:
            x: image batch fed to the ViT encoder
               (assumes (batch, 3, 224, 224), as produced by the notebook's transform)

        Returns:
            Tensor of shape (batch, 128, 128, 128) with unbounded real values
            (no activation is applied after the linear layer).
        """
        x = self.encoder(x).last_hidden_state[:, 0, :]  # keep only the first token
        x = self.fc(x)
        return x.view(-1, 128, 128, 128)  # Reshape for voxel output

In [25]:
# Training loop
NUM_EPOCHS = 10  # single source of truth for the loop bound and the progress label
CHECKPOINT_PATH = "optimized_lrgt_3d_reconstruction.pth"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Simple3DModel().to(device)
# betas/eps are written out explicitly but equal torch.optim.Adam's defaults; only lr is tuned.
optimizer = optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.999), eps=1e-8)
criterion = nn.MSELoss()

# Mixed precision is only enabled when we actually run on CUDA; on CPU the
# autocast context is a no-op instead of requesting an unavailable device.
use_amp = device.type == "cuda"

best_loss = float("inf")
for epoch in range(NUM_EPOCHS):
    loop = tqdm(dataloader, leave=True)
    loop.set_description(f"Epoch [{epoch+1}/{NUM_EPOCHS}]")
    optimizer.zero_grad()
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, voxels in loop:
        images = images.to(device, non_blocking=True)
        voxels = voxels.to(device, non_blocking=True)

        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            predictions = model(images)
            loss = criterion(predictions, voxels)

        # `scaler` is the module-level GradScaler created in the setup cell.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        # Read the scalar loss once; it is reused for the running sum and the progress bar.
        batch_loss = loss.item()
        running_loss += batch_loss
        total += voxels.numel()
        # Binary voxel accuracy: prediction and target agree on which side of 0.5 they fall.
        correct += ((predictions > 0.5) == (voxels > 0.5)).sum().item()

        loop.set_postfix(loss=batch_loss, accuracy=100.0 * correct / total)

    avg_loss = running_loss / len(dataloader)
    avg_accuracy = 100.0 * correct / total
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%")

    # Keep the weights of the epoch with the lowest average *training* loss
    # (there is no validation split in this notebook).
    if avg_loss < best_loss:
        best_loss = avg_loss
        torch.save(model.state_dict(), CHECKPOINT_PATH)
        print("Model saved with lower loss.")

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch [1/10]: 100%|█| 315/315 [11:20<00:00,  2.16s/it, accuracy=93.3, loss=0.053


Epoch 1, Loss: 0.0596, Accuracy: 93.29%
Model saved with lower loss.


Epoch [2/10]: 100%|█| 315/315 [11:16<00:00,  2.15s/it, accuracy=94.7, loss=0.058


Epoch 2, Loss: 0.0441, Accuracy: 94.66%
Model saved with lower loss.


Epoch [3/10]: 100%|█| 315/315 [11:06<00:00,  2.12s/it, accuracy=95.4, loss=0.031


Epoch 3, Loss: 0.0372, Accuracy: 95.43%
Model saved with lower loss.


Epoch [4/10]: 100%|█| 315/315 [11:08<00:00,  2.12s/it, accuracy=96.3, loss=0.024


Epoch 4, Loss: 0.0313, Accuracy: 96.32%
Model saved with lower loss.


Epoch [5/10]: 100%|█| 315/315 [11:06<00:00,  2.11s/it, accuracy=96.8, loss=0.027


Epoch 5, Loss: 0.0273, Accuracy: 96.84%
Model saved with lower loss.


Epoch [6/10]: 100%|█| 315/315 [11:06<00:00,  2.11s/it, accuracy=97.3, loss=0.021


Epoch 6, Loss: 0.0237, Accuracy: 97.26%
Model saved with lower loss.


Epoch [7/10]: 100%|█| 315/315 [11:02<00:00,  2.10s/it, accuracy=97.6, loss=0.016


Epoch 7, Loss: 0.0211, Accuracy: 97.57%
Model saved with lower loss.


Epoch [8/10]: 100%|█| 315/315 [11:05<00:00,  2.11s/it, accuracy=97.8, loss=0.016


Epoch 8, Loss: 0.0192, Accuracy: 97.80%
Model saved with lower loss.


Epoch [9/10]: 100%|█| 315/315 [11:04<00:00,  2.11s/it, accuracy=98, loss=0.0133]


Epoch 9, Loss: 0.0176, Accuracy: 98.00%
Model saved with lower loss.


Epoch [10/10]: 100%|█| 315/315 [11:03<00:00,  2.11s/it, accuracy=98.1, loss=0.01


Epoch 10, Loss: 0.0165, Accuracy: 98.12%
Model saved with lower loss.


In [7]:
import torch
import numpy as np
import open3d as o3d
from PIL import Image
from torchvision import transforms
from transformers import ViTModel
import scipy.ndimage

# Model definition
class Simple3DModel(torch.nn.Module):
    """ViT encoder followed by a linear head that emits a 128x128x128 voxel grid.

    The attribute names ``encoder`` and ``fc`` are the keys used by the saved
    state_dict, so they must stay as they are for checkpoints to load.
    """

    def __init__(self):
        super().__init__()
        self.encoder = ViTModel.from_pretrained("google/vit-base-patch16-224")
        self.fc = torch.nn.Linear(768, 128 * 128 * 128)

    def forward(self, x):
        """Map an image batch to one (128, 128, 128) grid per image."""
        hidden_states = self.encoder(x).last_hidden_state
        first_token = hidden_states[:, 0, :]
        flat_voxels = self.fc(first_token)
        return flat_voxels.view(-1, 128, 128, 128)

def visualize_voxel_grid(voxel_data, threshold=0.3):
    """
    Show every voxel whose value exceeds ``threshold`` as an Open3D point cloud

    Parameters:
        voxel_data: voxel values; any array-like accepted by np.asarray
                    (expected to be 3-D so each index row is one 3-D point)
        threshold: a voxel is displayed when its value is strictly greater than this

    Returns:
        The point cloud that was displayed, or None when no voxel passes the threshold.
    """
    occupied = np.asarray(voxel_data) > threshold
    voxel_indices = np.argwhere(occupied)

    # Check if we have any points before creating point cloud
    if len(voxel_indices) == 0:
        print(f"No voxels detected with threshold {threshold}. Try lowering the threshold value.")
        return None

    pcd = o3d.geometry.PointCloud()
    # argwhere yields integer indices; Vector3dVector is documented for float64 (n, 3) arrays.
    pcd.points = o3d.utility.Vector3dVector(voxel_indices.astype(np.float64))
    o3d.visualization.draw_geometries([pcd])

    return pcd

def smooth_voxel_data(voxel_data, threshold=0.3, sigma=1.2):
    """
    Apply Gaussian smoothing to voxel data before thresholding

    Parameters:
        voxel_data: voxel values (array-like); float predictions or integer/bool occupancy
        threshold: a smoothed voxel is kept when its value is strictly greater than this
        sigma: standard deviation passed to scipy.ndimage.gaussian_filter

    Returns:
        Boolean numpy array with the same shape as ``voxel_data``.
    """
    voxel_data = np.asarray(voxel_data)

    # Print statistics about the voxel data
    print(f"Voxel statistics: min={voxel_data.min()}, max={voxel_data.max()}, mean={voxel_data.mean()}")
    print(f"Number of voxels > 0.5: {np.sum(voxel_data > 0.5)}")
    print(f"Number of voxels > 0.3: {np.sum(voxel_data > 0.3)}")
    print(f"Number of voxels > 0.1: {np.sum(voxel_data > 0.1)}")

    # gaussian_filter stores its (intermediate and final) results in the input
    # dtype, so an integer or bool grid would have its fractional values cut off
    # before the threshold is applied. Float inputs are left untouched.
    if not np.issubdtype(voxel_data.dtype, np.floating):
        voxel_data = voxel_data.astype(np.float32)

    # Apply Gaussian smoothing to the raw voxel predictions
    smoothed_data = scipy.ndimage.gaussian_filter(voxel_data, sigma=sigma)

    # Apply threshold after smoothing
    binary_voxels = smoothed_data > threshold

    return binary_voxels

def save_as_smooth_mesh(voxel_data, filename="smooth_output.obj", threshold=0.3,
                        sigma=1.2, depth=9, scale=1.1, linear_fit=False):
    """
    Convert voxel data to a smooth mesh using Poisson surface reconstruction

    Parameters:
        voxel_data: numpy array of voxel predictions
        filename: output filename
        threshold: value threshold for binary voxel decision (lowered from 0.5 to 0.3)
        sigma: smoothing factor for Gaussian filter
        depth: depth parameter for Poisson reconstruction (higher = more detail)
        scale: ratio between the diameter of the reconstruction cube and the
               diameter of the samples' bounding cube (Open3D Poisson parameter)
        linear_fit: if True, the reconstructor uses linear interpolation to
                    estimate the positions of iso-vertices (Open3D Poisson parameter)

    Returns:
        The reconstructed, smoothed mesh, or None when no voxel survives
        thresholding. The mesh is returned even if writing the file failed;
        the failure is reported on stdout.
    """
    # Apply smoothing to the voxel data
    binary_voxels = smooth_voxel_data(voxel_data, threshold, sigma)

    # Extract voxel indices where value > threshold
    voxel_indices = np.argwhere(binary_voxels)

    if voxel_indices.size == 0:
        print(f"No voxels detected after thresholding with threshold={threshold}. Try lowering the threshold value further.")
        return None

    print(f"Found {len(voxel_indices)} voxels after thresholding at {threshold}")

    # Convert voxel indices to point cloud (Vector3dVector is documented for float64 arrays)
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(voxel_indices.astype(np.float64))

    # Estimate normals with consistent orientation
    # Increased parameters for better normal estimation
    pcd.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=5, max_nn=50))
    pcd.orient_normals_consistent_tangent_plane(k=30)

    # Apply Poisson surface reconstruction (produces smoother results than ball pivoting)
    mesh, densities = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson(
        pcd, depth=depth, scale=scale, linear_fit=linear_fit)

    # Remove the lowest-density 1% of vertices, which are often outliers.
    # Convert to a numpy array first so the comparison is an explicit
    # element-wise operation rather than relying on operator fallback.
    densities = np.asarray(densities)
    if densities.size:
        vertices_to_remove = densities < np.quantile(densities, 0.01)
        mesh.remove_vertices_by_mask(vertices_to_remove)

    # Apply Laplacian smoothing, then compute normals on the *smoothed*
    # geometry so the saved normals match the final vertex positions.
    mesh = mesh.filter_smooth_laplacian(number_of_iterations=5)
    mesh.compute_vertex_normals()

    # Save the final mesh; write_triangle_mesh reports success as a bool
    if o3d.io.write_triangle_mesh(filename, mesh):
        print(f"Saved smooth 3D model as {filename}")
    else:
        print(f"Failed to write 3D model to {filename}")

    return mesh

# Function to predict 3D model from a single image with improved smoothing
def predict_smooth_3d_from_image(image_path, model, device, transform,
                               output_filename="smooth_output.obj", threshold=0.3):
    """
    Run the model on one image and turn the predicted voxels into a mesh file.

    Parameters:
        image_path: path of the input picture
        model: callable mapping an image batch tensor to a voxel tensor
        device: torch device the input tensor is moved to
        transform: callable turning the RGB image into a tensor
        output_filename: where the mesh is written
        threshold: occupancy threshold; replaced by 0.1 when nothing exceeds it

    Returns:
        Whatever save_as_smooth_mesh returns, or None when the image cannot be
        loaded, inference fails, or no voxel exceeds the fallback threshold.
    """
    # --- Read the picture -------------------------------------------------
    try:
        rgb_image = Image.open(image_path).convert("RGB")
        print(f"Successfully loaded image from {image_path}")
    except Exception as load_error:
        print(f"Error loading image: {load_error}")
        return None

    # Add the batch dimension before moving the tensor to the target device.
    batched = transform(rgb_image).unsqueeze(0)
    image_tensor = batched.to(device)
    print(f"Image tensor shape: {image_tensor.shape}")

    # --- Inference (no gradients needed) ------------------------------------
    with torch.no_grad():
        try:
            raw_output = model(image_tensor)
            predicted_voxel = raw_output.cpu().numpy().squeeze()
            print(f"Successfully generated voxel prediction with shape: {predicted_voxel.shape}")
        except Exception as inference_error:
            print(f"Error during model prediction: {inference_error}")
            return None

    # --- Diagnostics ----------------------------------------------------------
    print(f"Voxel statistics: min={predicted_voxel.min()}, max={predicted_voxel.max()}, mean={predicted_voxel.mean()}")
    print(f"Number of voxels > 0.5: {np.sum(predicted_voxel > 0.5)}")
    print(f"Number of voxels > 0.3: {np.sum(predicted_voxel > 0.3)}")
    print(f"Number of voxels > 0.1: {np.sum(predicted_voxel > 0.1)}")

    # --- Threshold fallback ---------------------------------------------------
    nothing_above_requested = not (predicted_voxel > threshold).any()
    if nothing_above_requested:
        print(f"No voxels above threshold {threshold}, trying with lower threshold 0.1")
        fallback_threshold = 0.1
        if not (predicted_voxel > fallback_threshold).any():
            print("Still no voxels detected. The model might not be generating valid predictions.")
            return None
        threshold = fallback_threshold

    # --- Mesh extraction and export -------------------------------------------
    return save_as_smooth_mesh(predicted_voxel, filename=output_filename, threshold=threshold)

# Main execution function
def generate_smooth_3d_model(image_path, model_path, output_path="smooth_output.obj", threshold=0.3):
    """
    Complete pipeline to generate a smooth 3D model from an image

    Parameters:
        image_path: path of the input picture
        model_path: path of a checkpoint holding the model's state_dict
        output_path: where the reconstructed mesh is written
        threshold: occupancy threshold forwarded to the prediction step

    Returns:
        The reconstructed mesh, or None when the model cannot be loaded or no
        valid mesh could be produced. Errors are reported on stdout rather
        than raised.
    """
    # Set up device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load the trained model
    try:
        model = Simple3DModel().to(device)
        # The checkpoint is a plain state_dict, so restrict unpickling to
        # tensors and basic containers instead of arbitrary Python objects.
        state_dict = torch.load(model_path, map_location=device, weights_only=True)
        model.load_state_dict(state_dict)
        model.eval()
        print(f"Successfully loaded model from {model_path}")
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

    # Define image transformation (same steps as the training cell: resize + ToTensor)
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    # Generate the smooth 3D model
    mesh = predict_smooth_3d_from_image(image_path, model, device, transform,
                                       output_path, threshold=threshold)

    # Visualize the result if mesh was created successfully
    if mesh is not None:
        try:
            o3d.visualization.draw_geometries([mesh])
        except Exception as e:
            # A missing display must not discard the mesh that was already saved.
            print(f"Error visualizing mesh: {e}")
    else:
        print("Failed to generate valid mesh")

    return mesh

# Example usage
if __name__ == "__main__":
    # Demo inputs and output location; adjust these to your own machine.
    image_path = r"C:\Working\Imagin3D\data\img\sofa\0158.jpeg"
    model_path = r"models\optimized_lrgt_3d_reconstruction.pth"
    output_path = r"C:\Working\Imagin3D\smooth_output.obj"

    # Run the full pipeline with threshold 0.2 instead of the function's 0.3 default.
    generate_smooth_3d_model(
        image_path=image_path,
        model_path=model_path,
        output_path=output_path,
        threshold=0.2,
    )

Using device: cpu


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded model from models\optimized_lrgt_3d_reconstruction.pth
Successfully loaded image from C:\Working\Imagin3D\data\img\sofa\0158.jpeg
Image tensor shape: torch.Size([1, 3, 224, 224])
Successfully generated voxel prediction with shape: (128, 128, 128)
Voxel statistics: min=-0.19323106110095978, max=1.2068321704864502, mean=0.10854323953390121
Number of voxels > 0.5: 221367
Number of voxels > 0.3: 233847
Number of voxels > 0.1: 277531
Voxel statistics: min=-0.19323106110095978, max=1.2068321704864502, mean=0.10854323953390121
Number of voxels > 0.5: 221367
Number of voxels > 0.3: 233847
Number of voxels > 0.1: 277531
Found 265889 voxels after thresholding at 0.2
Saved smooth 3D model as C:\Working\Imagin3D\smooth_output.obj


In [3]:
import open3d as o3d

def view_obj_file(filename="output.obj"):
    """Open a mesh file in the Open3D viewer.

    Parameters:
        filename: path of the mesh file to display

    Returns:
        None. When the file yields no geometry (missing, unreadable, or empty),
        a message is printed and no viewer window is opened.
    """
    mesh = o3d.io.read_triangle_mesh(filename)
    # read_triangle_mesh does not raise on failure; it hands back an empty mesh.
    if mesh.is_empty():
        print(f"Could not load a mesh from {filename}: the file is missing, unreadable, or contains no geometry.")
        return
    mesh.compute_vertex_normals()
    o3d.visualization.draw_geometries([mesh])

# Example usage
# NOTE(review): the reconstruction cell in this notebook writes "smooth_output.obj",
# while this call opens "output.obj" in the same folder — confirm which file is intended.
view_obj_file(r"C:\Working\Imagin3D\output.obj")  # Replace with your file path