In [1]:
import torch

def mse(img1, img2):
    return (((img1 - img2)) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True)

def psnr(img1, img2):
    mse = (((img1 - img2)) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True)
    return 20 * torch.log10(1.0 / torch.sqrt(mse))


In [58]:
import cv2
import os
import numpy as np
from PIL import Image, ImageDraw, ImageFont

# Paths
test_images_dir = r"E:\lab\test_images"
rendered_images_dir = r"E:\lab\rendered"
output_video_path = r"E:\lab\output_video.avi"
video_path = r"E:\lab\video.mp4"  # Update this with the actual video file name

# Target video size and background color (black)
output_width = 2001
output_height = 1126
background_color = (0, 0, 0)  # Black background

# Video size (fixed at 960x544) and position (shifted to the left and lower)
video_width = 960
video_height = 544
video_left_x = 150  # Move video 100px from the left
video_top_y = 395  # Adjust to control the vertical position of the video

# Image size for the right
image_size = (610, 340)  # Resize images to this size

# Border thickness for the images
border_thickness = 5
blue_border = (255, 0, 0)  # Blue border for left-side video and top-right image
green_border = (0, 255, 0)  # Green border for the bottom-right image

# Font settings for the text
font_path = "C:/Windows/Fonts/times.ttf"  # Path to Times New Roman font
font_size_large = 40  # Larger font for the two lines above the video
font_size_normal = 30  # Normal font size for other annotations

# Load fonts
font_large = ImageFont.truetype(font_path, font_size_large)
font_captial = ImageFont.truetype(font_path, 60)
font_normal = ImageFont.truetype(font_path, font_size_normal)

# Load the video to be shown
cap = cv2.VideoCapture(video_path)

# Check if video is loaded correctly
if not cap.isOpened():
    print("Error: Could not open video.")
    exit()

# Get the video frame rate
video_fps = cap.get(cv2.CAP_PROP_FPS)

# Calculate the time per frame (1 second / video_fps)
time_per_video_frame = 1 / video_fps

# Video writer (without sound)
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter(output_video_path, fourcc, video_fps, (output_width, output_height))

# Get sorted image lists
test_images = sorted([os.path.join(test_images_dir, img) for img in os.listdir(test_images_dir)])
rendered_images = sorted([os.path.join(rendered_images_dir, img) for img in os.listdir(rendered_images_dir)])

# Set the video to start from the 20-second mark
start_frame = int(0 * video_fps)
cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

# Variables to control image switching
image_change_interval = 0.2  # Change images every 0.2 seconds (5 FPS for the images)
elapsed_time = 0  # Track time for image changes
frame_index = 18  # Start from the first image

# Keep track of the last images shown
current_test_image = cv2.imread(test_images[frame_index])
current_rendered_image = cv2.imread(rendered_images[frame_index])
current_test_image = cv2.resize(current_test_image, image_size)
current_rendered_image = cv2.resize(current_rendered_image, image_size)

# Function to overlay text using Pillow
def draw_text(image, text, position, font, color):
    """Draw text on an image using Pillow."""
    # Convert OpenCV image (numpy array) to PIL image
    pil_img = Image.fromarray(image)
    draw = ImageDraw.Draw(pil_img)
    draw.text(position, text, font=font, fill=color)
    # Convert back to OpenCV image
    return np.array(pil_img)

# Main loop to process images and video
while True:
    print(frame_index)
    # Read a frame from the video
    ret, video_frame = cap.read()
    if not ret or frame_index >= (len(test_images)-18):
        break  # Stop when video or images end

    # Resize the video frame to its fixed size (960x544)
    video_frame_resized = cv2.resize(video_frame, (video_width, video_height))

    # Create a black background
    combined_frame = np.full((output_height, output_width, 3), background_color, dtype=np.uint8)

    # Place the video on the left, slightly lower
    combined_frame[video_top_y:video_top_y + video_height, video_left_x:video_left_x + video_width] = video_frame_resized

    # Add blue border around the left-side video
    combined_frame[video_top_y - border_thickness:video_top_y + video_height + border_thickness,
                   video_left_x - border_thickness:video_left_x + video_width + border_thickness] = \
        cv2.copyMakeBorder(video_frame_resized, border_thickness, border_thickness, border_thickness, border_thickness, 
                           cv2.BORDER_CONSTANT, value=blue_border)

    # Add text above the video: "train and localize with only 20 images" and "real-time localization demo"
    combined_frame = draw_text(combined_frame, "LOGS:", (video_left_x, video_top_y - 300), font_captial, (255, 255, 255))

    combined_frame = draw_text(combined_frame, "real-time localization demo", (video_left_x, video_top_y - 160), font_large, (255, 255, 255))
    combined_frame = draw_text(combined_frame, "train and localize with only 20 images", (video_left_x, video_top_y - 120), font_large, (255, 255, 255))
    combined_frame = draw_text(combined_frame, "localization speed: 5 fps", (video_left_x, video_top_y - 120), font_large, (255, 255, 255))
    
    # Add "trajectory of movement" above the video
    combined_frame = draw_text(combined_frame, "trajectory of movement", (video_left_x, video_top_y - 50), font_normal, (255, 255, 255))

    # Every 0.2 seconds, change the images
    if elapsed_time >= image_change_interval:
        # Update the current images
        current_test_image = cv2.imread(test_images[frame_index])
        current_rendered_image = cv2.imread(rendered_images[frame_index])

        # Resize images to fit the right side
        current_test_image = cv2.resize(current_test_image, image_size)
        current_rendered_image = cv2.resize(current_rendered_image, image_size)

        # Reset the elapsed time and increment the frame index for image switching
        elapsed_time = 0
        frame_index += 1
        if frame_index >= len(test_images):  # Loop back if necessary
            frame_index = 0

    # Place the current images on the right (one on top of the other)
    right_x = output_width - 150 - image_size[0]  # 200 px padding from the right
    top_image_y = 155  # Position the first image at 150 px from the top
    bottom_image_y = top_image_y + image_size[1] + 100  # 100 px gap between the two images

    # Add blue border around the top-right image (query image)
    current_test_image_with_border = cv2.copyMakeBorder(
        current_test_image, border_thickness, border_thickness, border_thickness, border_thickness,
        cv2.BORDER_CONSTANT, value=blue_border)

    # Add green border around the bottom-right image (rendered image)
    current_rendered_image_with_border = cv2.copyMakeBorder(
        current_rendered_image, border_thickness, border_thickness, border_thickness, border_thickness,
        cv2.BORDER_CONSTANT, value=green_border)

    # Place the top image on the right
    combined_frame[top_image_y:top_image_y + image_size[1] + border_thickness * 2,
                   right_x:right_x + image_size[0] + border_thickness * 2] = current_test_image_with_border

    # Add text "query image" above the first image
    combined_frame = draw_text(combined_frame, "query image", (right_x, top_image_y - 40), font_normal, (255, 255, 255))

    # Place the bottom image on the right
    combined_frame[bottom_image_y:bottom_image_y + image_size[1] + border_thickness * 2,
                   right_x:right_x + image_size[0] + border_thickness * 2] = current_rendered_image_with_border

    # Add text "rendered image from estimated pose" above the second image
    combined_frame = draw_text(combined_frame, "rendered image from estimated pose", (right_x, bottom_image_y - 40), font_normal, (255, 255, 255))

    # Add text in the top-right corner: "x1"
    combined_frame = draw_text(combined_frame, "vedio play: x1", (output_width - 285, 50), font_normal, (255, 255, 255))

    # Write the combined frame to output
    out.write(combined_frame)
    
    # Add the time of the current frame to elapsed_time
    elapsed_time += time_per_video_frame

cap.release()
out.release()

print("Video created successfully with larger font for specific text")


18
18
18
18
18
18
18
19
19
19
19
19
19
20
20
20
20
20
20
21
21
21
21
21
21
22
22
22
22
22
22
23
23
23
23
23
23
24
24
24
24
24
24
25
25
25
25
25
25
26
26
26
26
26
26
27
27
27
27
27
27
28
28
28
28
28
28
29
29
29
29
29
29
30
30
30
30
30
30
31
31
31
31
31
31
32
32
32
32
32
32
33
33
33
33
33
33
34
34
34
34
34
34
35
35
35
35
35
35
36
36
36
36
36
36
37
37
37
37
37
37
38
38
38
38
38
38
39
39
39
39
39
39
40
40
40
40
40
40
41
41
41
41
41
41
42
42
42
42
42
42
43
43
43
43
43
43
44
44
44
44
44
44
45
45
45
45
45
45
46
46
46
46
46
46
47
47
47
47
47
47
48
48
48
48
48
48
49
49
49
49
49
49
50
50
50
50
50
50
51
51
51
51
51
51
52
52
52
52
52
52
53
53
53
53
53
53
54
54
54
54
54
54
55
55
55
55
55
55
56
56
56
56
56
56
57
57
57
57
57
57
58
58
58
58
58
58
59
59
59
59
59
59
60
60
60
60
60
60
61
61
61
61
61
61
62
62
62
62
62
62
63
63
63
63
63
63
64
64
64
64
64
64
65
65
65
65
65
65
66
66
66
66
66
66
67
67
67
67
67
67
68
68
68
68
68
68
69
69
69
69
69
69
70
70
70
70
70
70
71
71
71
71
71
71
72
72
72
72
72
72
73
73
7

In [35]:
import cv2
import os
import numpy as np
from PIL import Image, ImageDraw, ImageFont

# Paths
test_images_dir = r"E:\lab\test_images"
rendered_images_dir = r"E:\lab\rendered"
output_video_path = r"E:\lab\output_video1.avi"
video_path = r"E:\lab\video.mp4"  # Update this with the actual video file name

# Target video size and background color (black)
output_width = 2001
output_height = 1126
background_color = (0, 0, 0)  # Black background

# Video size (fixed at 960x544) and position (shifted to the left and lower)
video_width = 960
video_height = 544
video_left_x = 150  # Move video 150px from the left
video_top_y = 395  # Adjust to control the vertical position of the video

# Image size for the right
image_size = (610, 340)  # Resize images to this size

# Border thickness for the images
border_thickness = 5
blue_border = (255, 0, 0)  # Blue border for left-side video and top-right image
green_border = (0, 255, 0)  # Green border for the bottom-right image

# Font settings for the text
font_path = "C:/Windows/Fonts/times.ttf"  # Path to Times New Roman font
font_size_large = 40  # Larger font for the two lines above the video
font_size_normal = 30  # Normal font size for other annotations

# Load fonts
font_large = ImageFont.truetype(font_path, font_size_large)
font_captial = ImageFont.truetype(font_path, 60)
font_normal = ImageFont.truetype(font_path, font_size_normal)

# Load the video to be shown
cap = cv2.VideoCapture(video_path)

# Check if video is loaded correctly
if not cap.isOpened():
    print("Error: Could not open video.")
    exit()

# Get the video frame rate
video_fps = cap.get(cv2.CAP_PROP_FPS)

# Calculate the time per frame (1 second / video_fps)
time_per_video_frame = 1 / video_fps

# Video writer (without sound)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, video_fps, (output_width, output_height))

# Get sorted image lists
test_images = sorted([os.path.join(test_images_dir, img) for img in os.listdir(test_images_dir)])
rendered_images = sorted([os.path.join(rendered_images_dir, img) for img in os.listdir(rendered_images_dir)])

# Set the video to start from the 20-second mark
start_frame = int(0 * video_fps)
cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

# Variables to control image switching
image_change_interval = 0.2  # Initially 5 FPS (1/5s)
fast_image_change_interval = 0.1  # 10 FPS after 30s (x2 speed for both video and image switching)
elapsed_time = 0  # Track time for image changes
frame_index = 18  # Start from the first image

# Time tracking
total_elapsed_time = 0
speed_multiplier = 1  # Initially, no speed up (x1)

# Keep track of the last images shown
current_test_image = cv2.imread(test_images[frame_index])
current_rendered_image = cv2.imread(rendered_images[frame_index])
current_test_image = cv2.resize(current_test_image, image_size)
current_rendered_image = cv2.resize(current_rendered_image, image_size)

# Function to overlay text using Pillow
def draw_text(image, text, position, font, color):
    """Draw text on an image using Pillow."""
    # Convert OpenCV image (numpy array) to PIL image
    pil_img = Image.fromarray(image)
    draw = ImageDraw.Draw(pil_img)
    draw.text(position, text, font=font, fill=color)
    # Convert back to OpenCV image
    return np.array(pil_img)

# Main loop to process images and video
while True:
    print(f"Frame Index: {frame_index}, Total Time: {total_elapsed_time:.2f}s")
    
    # Check if we have reached the 30-second mark and adjust speed accordingly
    if total_elapsed_time >= 30:
        image_change_interval = fast_image_change_interval  # 10 FPS for images (2x speed)
        speed_multiplier = 2  # Video play speed x2
    
    # Read a frame from the video
    ret, video_frame = cap.read()
    if not ret or frame_index >= (len(test_images)-19):
        break  # Stop when video or images end
    
    # Skip frames to adjust video speed (speed_multiplier)
    for _ in range(speed_multiplier - 1):
        cap.grab()  # Skip frames to make the video faster
    
    # Resize the video frame to its fixed size (960x544)
    video_frame_resized = cv2.resize(video_frame, (video_width, video_height))

    # Create a black background
    combined_frame = np.full((output_height, output_width, 3), background_color, dtype=np.uint8)

    # Place the video on the left, slightly lower
    combined_frame[video_top_y:video_top_y + video_height, video_left_x:video_left_x + video_width] = video_frame_resized

    # Add blue border around the left-side video
    combined_frame[video_top_y - border_thickness:video_top_y + video_height + border_thickness,
                   video_left_x - border_thickness:video_left_x + video_width + border_thickness] = \
        cv2.copyMakeBorder(video_frame_resized, border_thickness, border_thickness, border_thickness, border_thickness, 
                           cv2.BORDER_CONSTANT, value=blue_border)

    # Add text above the video: "LOGS", "real-time localization demo", "train and localize with only 20 images"
    combined_frame = draw_text(combined_frame, "LOGS:", (video_left_x, video_top_y - 300), font_captial, (255, 255, 255))
    if speed_multiplier == 1:
        
        combined_frame = draw_text(combined_frame, "real-time localization demo", (video_left_x, video_top_y - 220), font_large, (255, 255, 255))
        combined_frame = draw_text(combined_frame, "train and localize with only 20 images", (video_left_x, video_top_y - 180), font_large, (255, 255, 255))
        combined_frame = draw_text(combined_frame, f"localization speed: 5 fps", (video_left_x, video_top_y - 140), font_large, (255, 255, 255))
    else:
        combined_frame = draw_text(combined_frame, "small difference between", (video_left_x, video_top_y - 220), font_large, (255, 255, 255))
        combined_frame = draw_text(combined_frame, "query and rendered images due to:", (video_left_x, video_top_y - 180), font_large, (255, 255, 255))
        combined_frame = draw_text(combined_frame, "high render quality + accurate pose estimation", (video_left_x, video_top_y - 140), font_large, (255, 255, 255))
    # Add "trajectory of movement" above the video
    combined_frame = draw_text(combined_frame, "trajectory of movement", (video_left_x, video_top_y - 50), font_normal, (255, 255, 255))

    # Every 0.2 seconds (5 FPS) or 0.1 seconds (10 FPS), change the images
    if elapsed_time >= image_change_interval:
        # Update the current images
        current_test_image = cv2.imread(test_images[frame_index])
        current_rendered_image = cv2.imread(rendered_images[frame_index])

        # Resize images to fit the right side
        current_test_image = cv2.resize(current_test_image, image_size)
        current_rendered_image = cv2.resize(current_rendered_image, image_size)

        # Reset the elapsed time and increment the frame index for image switching
        elapsed_time = 0
        frame_index += 1
        if frame_index >= len(test_images):  # Loop back if necessary
            frame_index = 0

    # Place the current images on the right (one on top of the other)
    right_x = output_width - 150 - image_size[0]  # 150 px padding from the right
    top_image_y = 155  # Position the first image at 150 px from the top
    bottom_image_y = top_image_y + image_size[1] + 100  # 100 px gap between the two images

    # Add blue border around the top-right image (query image)
    current_test_image_with_border = cv2.copyMakeBorder(
        current_test_image, border_thickness, border_thickness, border_thickness, border_thickness,
        cv2.BORDER_CONSTANT, value=blue_border)

    # Add green border around the bottom-right image (rendered image)
    current_rendered_image_with_border = cv2.copyMakeBorder(
        current_rendered_image, border_thickness, border_thickness, border_thickness, border_thickness,
        cv2.BORDER_CONSTANT, value=green_border)

    # Place the top image on the right
    combined_frame[top_image_y:top_image_y + image_size[1] + border_thickness * 2,
                   right_x:right_x + image_size[0] + border_thickness * 2] = current_test_image_with_border

    # Add text "query image" above the first image
    combined_frame = draw_text(combined_frame, "query image", (right_x, top_image_y - 40), font_normal, (255, 255, 255))

    # Place the bottom image on the right
    combined_frame[bottom_image_y:bottom_image_y + image_size[1] + border_thickness * 2,
                   right_x:right_x + image_size[0] + border_thickness * 2] = current_rendered_image_with_border

    # Add text "rendered image from estimated pose" above the second image
    combined_frame = draw_text(combined_frame, "rendered image from estimated pose", (right_x, bottom_image_y - 40), font_normal, (255, 255, 255))

    # Add text in the top-right corner: "video play: x1" or "video play: x2"
    combined_frame = draw_text(combined_frame, f"video play: x{speed_multiplier}", (output_width - 320, 50), font_normal, (255, 255, 255))

    # Write the combined frame to output
    out.write(combined_frame)

    # Update the elapsed time for both video and images
    elapsed_time += time_per_video_frame
    total_elapsed_time += time_per_video_frame * speed_multiplier  # Account for speed multiplier

# Release the video capture and writer
cap.release()
out.release()

print("Video created successfully with variable speed and FPS!")


Frame Index: 18, Total Time: 0.00s
Frame Index: 18, Total Time: 0.03s
Frame Index: 18, Total Time: 0.07s
Frame Index: 18, Total Time: 0.10s
Frame Index: 18, Total Time: 0.13s
Frame Index: 18, Total Time: 0.17s
Frame Index: 18, Total Time: 0.20s
Frame Index: 19, Total Time: 0.24s
Frame Index: 19, Total Time: 0.27s
Frame Index: 19, Total Time: 0.30s
Frame Index: 19, Total Time: 0.34s
Frame Index: 19, Total Time: 0.37s
Frame Index: 19, Total Time: 0.40s
Frame Index: 20, Total Time: 0.44s
Frame Index: 20, Total Time: 0.47s
Frame Index: 20, Total Time: 0.50s
Frame Index: 20, Total Time: 0.54s
Frame Index: 20, Total Time: 0.57s
Frame Index: 20, Total Time: 0.60s
Frame Index: 21, Total Time: 0.64s
Frame Index: 21, Total Time: 0.67s
Frame Index: 21, Total Time: 0.71s
Frame Index: 21, Total Time: 0.74s
Frame Index: 21, Total Time: 0.77s
Frame Index: 21, Total Time: 0.81s
Frame Index: 22, Total Time: 0.84s
Frame Index: 22, Total Time: 0.87s
Frame Index: 22, Total Time: 0.91s
Frame Index: 22, Tot

In [19]:
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont

# Paths to the left and right videos
left_video_path = r"C:\Users\27118\Desktop\ellipse\render.mp4"  # Path to the left-side video
right_video_path = r"C:\Users\27118\Desktop\ellipse\ellipse.mp4"  # Path to the right-side video
output_video_path = r"E:\lab\output_video2.mp4"  # Output video path

# Target video size and background color (black)
output_width = 2001
output_height = 1126
background_color = (0, 0, 0)  # Black background


# Resized video dimensions (950x529)
video_width = 950
video_height = 529

# Border settings
border_thickness = 5
border_color = (0, 255, 0)  # Green border

# Positioning the videos in the output frame, shifted down by 100 pixels
left_video_x = 50  # Left padding for the left video
right_video_x = output_width - video_width - 50  # Right padding for the right video
video_y = (output_height - video_height) // 2 + 100  # Center the videos vertically and shift down by 100px

# Font settings for the text
font_path = "C:/Windows/Fonts/times.ttf"  # Path to Times New Roman font
font_size_large = 40  # Larger font for annotations
font_normal = 30  # Normal font size for the x2 label

# Load the videos
left_cap = cv2.VideoCapture(left_video_path)
right_cap = cv2.VideoCapture(right_video_path)

# Check if both videos are loaded correctly
if not left_cap.isOpened() or not right_cap.isOpened():
    print("Error: Could not open one or both videos.")
    exit()

# Get the frame rate from the left video (assuming both videos have the same frame rate)
fps = left_cap.get(cv2.CAP_PROP_FPS)

# Video writer (without sound)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (output_width, output_height))

# Function to overlay text using Pillow
def draw_text(image, text, position, font, color):
    """Draw text on an image using Pillow."""
    # Convert OpenCV image (numpy array) to PIL image
    pil_img = Image.fromarray(image)
    draw = ImageDraw.Draw(pil_img)
    draw.text(position, text, font=font, fill=color)
    # Convert back to OpenCV image
    return np.array(pil_img)

# Load fonts
font_large = ImageFont.truetype(font_path, font_size_large)
font_capital = ImageFont.truetype(font_path, 60)
font_normal_font = ImageFont.truetype(font_path, font_normal)

count = 0
# Main loop to process the frames from both videos
while True:
    print(count)
    count +=1
    # Read frames from both videos
    ret_left, left_frame = left_cap.read()
    ret_right, right_frame = right_cap.read()
    
    # Stop if any video ends
    if not ret_left or not ret_right:
        break

    # Resize both videos to fit within the target dimensions
    left_frame_resized = cv2.resize(left_frame, (video_width, video_height))
    right_frame_resized = cv2.resize(right_frame, (video_width, video_height))

    # Add green border around the left video
    left_frame_with_border = cv2.copyMakeBorder(
        left_frame_resized, border_thickness, border_thickness, border_thickness, border_thickness,
        cv2.BORDER_CONSTANT, value=border_color)

    # Add green border around the right video
    right_frame_with_border = cv2.copyMakeBorder(
        right_frame_resized, border_thickness, border_thickness, border_thickness, border_thickness,
        cv2.BORDER_CONSTANT, value=border_color)

    # Create a black background
    combined_frame = np.full((output_height, output_width, 3), background_color, dtype=np.uint8)

    # Place the left and right videos (with borders) in the combined frame
    combined_frame[video_y:video_y + video_height + border_thickness * 2, left_video_x:left_video_x + video_width + border_thickness * 2] = left_frame_with_border
    combined_frame[video_y:video_y + video_height + border_thickness * 2, right_video_x:right_video_x + video_width + border_thickness * 2] = right_frame_with_border

    # Add text "LOGS" on the left side
    combined_frame = draw_text(combined_frame, "LOGS", (150, 95), font_capital, (255, 255, 255))
    combined_frame = draw_text(combined_frame, "map render illustration", (150, 395 - 220), font_large, (255, 255, 255))
    combined_frame = draw_text(combined_frame, "train with only 20 images", (150, 395 - 180), font_large, (255, 255, 255))

    # Add text "x2" in the top-right corner
    combined_frame = draw_text(combined_frame, f"video play: x2", (2001 - 320, 50), font_normal_font, (255, 255, 255))

    # Add text "rendered image" above the left video
    combined_frame = draw_text(combined_frame, "rendered image", (left_video_x, video_y - 60), font_large, (255, 255, 255))

    # Add text "gaussian ellipsoids" above the right video
    combined_frame = draw_text(combined_frame, "gaussian ellipsoids", (right_video_x, video_y - 60), font_large, (255, 255, 255))

    # Write the combined frame to the output video
    out.write(combined_frame)

# Release the video captures and writer
left_cap.release()
right_cap.release()
out.release()

print("Video created successfully with two side-by-side videos, green borders, and correct text!")


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

KeyboardInterrupt: 

In [22]:
2560/(2001/1126)

1440.55972013993

In [19]:
import os
import h5py
import cv2
import numpy as np
import torch
from tqdm import tqdm
from PIL import Image
from pathlib import Path
import sys
import yaml
from munch import munchify
from math import atan
from collections import OrderedDict

sys.path.append("D:/gs-localization/gaussian_splatting")
sys.path.append("D:/gs-localization")
sys.path.append("D:/gs-localization/gs_localization/pipelines")


from tools.config_utils import load_config, update_recursive
from tools import read_write_model
from tools.gaussian_model import GaussianModel
from tools import render
from tools.camera_utils import Camera
from tools.descent_utils import get_loss_tracking
from tools.pose_utils import update_pose
from tools.graphics_utils import getProjectionMatrix2


def gradient_decent(viewpoint, config, initial_R, initial_T):

    viewpoint.update_RT(initial_R, initial_T)
    
    opt_params = []
    opt_params.append(
        {
            "params": [viewpoint.cam_rot_delta],
            "lr": 0.001,
            "name": "rot_{}".format(viewpoint.uid),
        }
    )
    opt_params.append(
        {
            "params": [viewpoint.cam_trans_delta],
            "lr": 0.001,
            "name": "trans_{}".format(viewpoint.uid),
        }
    )
    opt_params.append(
        {
            "params": [viewpoint.exposure_a],
            "lr": 0.001,
            "name": "exposure_a_{}".format(viewpoint.uid),
        }
    )
    opt_params.append(
        {
            "params": [viewpoint.exposure_b],
            "lr": 0.001,
            "name": "exposure_b_{}".format(viewpoint.uid),
        }
    )
    

    pose_optimizer = torch.optim.Adam(opt_params)
    
    for tracking_itr in range(50):
        
        render_pkg = render(
            viewpoint, Model, pipeline_params, background
        )
        
        image, depth, opacity = (
            render_pkg["render"],
            render_pkg["depth"],
            render_pkg["opacity"],
        )
          
        pose_optimizer.zero_grad()
        
        loss_tracking = get_loss_tracking(
            config, image, depth, opacity, viewpoint
        )
        loss_tracking.backward()
        
    
        with torch.no_grad():
            pose_optimizer.step()
            converged = update_pose(viewpoint, converged_threshold=1e-4)
    
        if converged:
            break
             
    return viewpoint.R, viewpoint.T, render_pkg


class Transformation:
    def __init__(self, R=None, T=None):
        self.R = R
        self.T = T

def quat_to_rotmat(qvec):
    qvec = np.array(qvec, dtype=float)
    w, x, y, z = qvec
    R = np.array([
        [1 - 2*y**2 - 2*z**2, 2*x*y - 2*z*w, 2*x*z + 2*y*w],
        [2*x*y + 2*z*w, 1 - 2*x**2 - 2*z**2, 2*y*z - 2*x*w],
        [2*x*z - 2*y*w, 2*y*z + 2*x*w, 1 - 2*x**2 - 2*y**2]
    ])
    return R


def focal2fov(focal, pixels):
    return 2 * atan(pixels / (2 * focal))

def load_pose(pose_txt):
    pose = []
    with open(pose_txt, 'r') as f:
        for line in f:
            row = line.strip('\n').split()
            row = [float(c) for c in row]
            pose.append(row)
    pose = np.array(pose).astype(np.float32)
    assert pose.shape == (4,4)
    return pose

def create_mask(mkpts_lst, width, height, k):
    # Initial mask as all False
    mask = np.zeros((height, width), dtype=bool)
    
    # Calculat k radius
    half_k = k // 2
    
    # Iterate through all points
    for pt in mkpts_lst:
        x, y = int(pt[0]), int(pt[1])
        
        # Calculate k*k borders
        x_min = max(0, x - half_k)
        x_max = min(width, x + half_k + 1)
        y_min = max(0, y - half_k)
        y_max = min(height, y + half_k + 1)
        
        # Set mask k*k area as True
        mask[y_min:y_max, x_min:x_max] = True
    
    # Shape: (1, height, width)
    mask = mask[np.newaxis, :, :]
    
    return mask

class BaseDataset(torch.utils.data.Dataset):
    def __init__(self, args, path, config):
        self.args = args
        self.path = path
        self.config = config
        self.device = "cuda:0"
        self.dtype = torch.float32
        self.num_imgs = 9999

    def __len__(self):
        return self.num_imgs

    def __getitem__(self, idx):
        pass

class MonocularDataset(BaseDataset):
    def __init__(self, args, path, config):
        super().__init__(args, path, config)
        calibration = config["Dataset"]["Calibration"]
        # Camera prameters
        self.fx = calibration["fx"]
        self.fy = calibration["fy"]
        self.cx = calibration["cx"]
        self.cy = calibration["cy"]
        self.width = calibration["width"]
        self.height = calibration["height"]
        self.fovx = focal2fov(self.fx, self.width)
        self.fovy = focal2fov(self.fy, self.height)
        self.K = np.array(
            [[self.fx, 0.0, self.cx], [0.0, self.fy, self.cy], [0.0, 0.0, 1.0]]
        )
        # distortion parameters
        self.disorted = calibration["distorted"]
        self.dist_coeffs = np.array(
            [
                calibration["k1"],
                calibration["k2"],
                calibration["p1"],
                calibration["p2"],
                calibration["k3"],
            ]
        )
        self.map1x, self.map1y = cv2.initUndistortRectifyMap(
            self.K,
            self.dist_coeffs,
            np.eye(3),
            self.K,
            (self.width, self.height),
            cv2.CV_32FC1,
        )
        # depth parameters
        self.has_depth = True if "depth_scale" in calibration.keys() else False
        self.depth_scale = calibration["depth_scale"] if self.has_depth else None

        # Default scene scale
        nerf_normalization_radius = 5
        self.scene_info = {
            "nerf_normalization": {
                "radius": nerf_normalization_radius,
                "translation": np.zeros(3),
            },
        }

    def __getitem__(self, idx):
        color_path = self.color_paths[idx]
        pose = self.poses[idx]

        image = np.array(Image.open(color_path))
        depth = None

        if self.disorted:
            image = cv2.remap(image, self.map1x, self.map1y, cv2.INTER_LINEAR)

        image = (
            torch.from_numpy(image / 255.0)
            .clamp(0.0, 1.0)
            .permute(2, 0, 1)
            .to(device=self.device, dtype=self.dtype)
        )
        pose = torch.from_numpy(pose).to(device=self.device)
        return image, depth, pose


class seven_scenes_Dataset(MonocularDataset):
    def __init__(self, args, path, config, data_folder, scene):
        super().__init__(args, path, config)
        self.has_depth = True
        self.seven_scenes_Parser(data_folder, scene) 
        
    def seven_scenes_Parser(self, data_folder, scene):
        self.color_paths, self.poses, self.depths = [], [],[]

        gt_dirs = Path(data_folder) / scene / "views"/ "sparse/0"
        _, images, _ = read_write_model.read_model(gt_dirs, ".txt")

        # Read the filenames from test_fewshot.txt and store them in a set.
        test_images_path = Path(data_folder) / scene / "train.txt"
        
        with open(test_images_path, 'r') as f:
            test_images = set(line.strip() for line in f)
            
        for i, image in tqdm(images.items(),"Load dataset"):
            # Execute the following operation only if image.name exists in test_images."
            if image.name in test_images:
                image_path = Path(data_folder) / scene / 'images' / image.name
                self.color_paths.append(image_path)
                R_gt, t_gt = image.qvec2rotmat(), image.tvec
                pose = np.eye(4)            
                pose[:3, :3] = R_gt         
                pose[:3, 3] = t_gt 
                self.poses.append(pose)
                self.depths.append(None)

        # Sort self.color_paths, self.poses, and self.depth_paths based on normal file name order
        sorted_data = sorted(zip(self.color_paths, self.depths, self.poses), key=lambda x: x[0].name)
        self.color_paths, self.depths, self.poses = zip(*sorted_data)
        del images

with open("D:/gs-localization/gs_localization/pipelines/configs/mono/tum/fr3_office.yaml", "r") as f:
    cfg_special = yaml.full_load(f)

inherit_from = "D:/gs-localization/gs_localization/pipelines/configs/mono/tum/base_config.yaml"

if inherit_from is not None:
    cfg = load_config(inherit_from)
else:
    cfg = dict()

# merge per dataset cfg. and main cfg.
config = update_recursive(cfg, cfg_special)
config = cfg
    
data_folder = "E:/"
config["Dataset"]["Calibration"]["fx"] = 618.8359530192555 
config["Dataset"]["Calibration"]["fy"] = 612.76165758523564
config["Dataset"]["Calibration"]["cx"] = 610
config["Dataset"]["Calibration"]["cy"] = 340
config["Dataset"]["Calibration"]["width"] = 1220
config["Dataset"]["Calibration"]["height"] = 680   
config["Dataset"]["Calibration"]['depth_scale'] = 1000.0
config["Training"]["monocular"] = True
config["Training"]["alpha"] = 0.99


In [21]:
scene = "lab"
Model = GaussianModel(3, config)
Model.load_ply(f"E:/lab/views/point_cloud/iteration_30000/point_cloud.ply")

model_params = munchify(config["model_params"])
pipeline_params = munchify(config["pipeline_params"])
data_folder = "E:/"
dataset = seven_scenes_Dataset(model_params, model_params.source_path, config, data_folder, scene)
bg_color = [0, 0, 0] 
background = torch.tensor(bg_color, dtype=torch.float32, device="cuda")

projection_matrix = getProjectionMatrix2(
    znear=0.01,
    zfar=100.0,
    fx=dataset.fx,
    fy=dataset.fy,
    cx=dataset.cx,
    cy=dataset.cy,
    W=dataset.width,
    H=dataset.height,
).transpose(0, 1)
projection_matrix = projection_matrix.to(device="cuda:0")

config["Training"]["opacity_threshold"] = 0.99
config["Training"]["edge_threshold"] = 1.1

# use OrderedDict to substitute defaultdict
test_infos = OrderedDict()

Load dataset: 100%|███████████████████████████████████████████████████████████████| 601/601 [00:00<00:00, 23819.79it/s]


In [22]:
dataset[0]

(tensor([[[0.8510, 0.8549, 0.8549,  ..., 0.7216, 0.7216, 0.7176],
          [0.8549, 0.8549, 0.8588,  ..., 0.7255, 0.7255, 0.7216],
          [0.8549, 0.8549, 0.8588,  ..., 0.7216, 0.7216, 0.7176],
          ...,
          [0.1725, 0.1765, 0.1765,  ..., 0.8784, 0.8784, 0.8784],
          [0.1765, 0.1765, 0.1725,  ..., 0.8745, 0.8784, 0.8784],
          [0.1804, 0.1804, 0.1765,  ..., 0.8745, 0.8745, 0.8745]],
 
         [[0.8745, 0.8784, 0.8784,  ..., 0.7608, 0.7569, 0.7529],
          [0.8784, 0.8784, 0.8824,  ..., 0.7608, 0.7569, 0.7529],
          [0.8784, 0.8784, 0.8824,  ..., 0.7608, 0.7608, 0.7608],
          ...,
          [0.1686, 0.1725, 0.1725,  ..., 0.9020, 0.9020, 0.9020],
          [0.1725, 0.1725, 0.1686,  ..., 0.8980, 0.9020, 0.9020],
          [0.1765, 0.1765, 0.1725,  ..., 0.8980, 0.8980, 0.8980]],
 
         [[0.8745, 0.8784, 0.8784,  ..., 0.7647, 0.7608, 0.7569],
          [0.8784, 0.8784, 0.8824,  ..., 0.7686, 0.7647, 0.7608],
          [0.8784, 0.8784, 0.8824,  ...,

In [23]:
from PIL import Image, ImageDraw
import torchvision.transforms as transforms
import numpy as np

# suppose file open and read
with open(f"E:/lab/results_sparse.txt", "r") as f:
    for line in f:
        parts = line.strip().split()
        name = parts[0]
        qvec = list(map(float, parts[1:5]))
        tvec = list(map(float, parts[5:8]))

        R = quat_to_rotmat(qvec)
        T = np.array(tvec)

        # insert directly in OrderedDict
        test_infos[name] = Transformation(R=R, T=T)

# sort OrderedDict according to name 
test_infos = OrderedDict(sorted(test_infos.items(), key=lambda item: item[0]))

for i, image in enumerate(tqdm(test_infos, desc="Localization")):
    viewpoint = Camera.init_from_dataset(dataset, i, projection_matrix)
    initial_R = torch.from_numpy(test_infos[image].R)
    initial_T = torch.from_numpy(test_infos[image].T)

    viewpoint.update_RT(initial_R, initial_T)

    render_pkg = render(
            viewpoint, Model, pipeline_params, background
        )

    localized_tensor = render_pkg["render"]
    
    # 找到每个像素中 R、G、B 三个通道的最大值
    max_vals, _ = localized_tensor.max(dim=0)  # 得到每个像素的最大值 (H, W)
    
    # 找到哪些像素的最大值超过 1
    exceeds_one_mask = max_vals > 1  # 布尔掩码，标记哪些像素的最大值超过 1
    
    # 对超过 1 的地方，将 R、G、B 值同时按最大值进行归一化
    localized_tensor[:, exceeds_one_mask] = localized_tensor[:, exceeds_one_mask] / (max_vals[exceeds_one_mask] + 0.00001)
    
    # 将 Tensor 转换为 PIL 图像
    tensor_to_pil = transforms.ToPILImage()
    
    localized_image = tensor_to_pil(localized_tensor)

    # 定义保存路径
    save_path = r'E:/lab/rendered'
    os.makedirs(save_path, exist_ok=True)  # 如果文件夹不存在，创建文件夹
    
    # 保存图像
    image_save_path = os.path.join(save_path, image)
    localized_image.save(image_save_path)


Localization: 100%|██████████████████████████████████████████████████████████████████| 600/600 [03:42<00:00,  2.70it/s]


In [24]:
from PIL import Image, ImageDraw
import torchvision.transforms as transforms
import numpy as np

# 假设 viewpoint.original_image 和 render_pkg["render"] 是 Tensor
ground_truth_tensor = viewpoint.original_image
localized_tensor = render_pkg["render"]

# 找到每个像素中 R、G、B 三个通道的最大值
max_vals, _ = localized_tensor.max(dim=0)  # 得到每个像素的最大值 (H, W)

# 找到哪些像素的最大值超过 1
exceeds_one_mask = max_vals > 1  # 布尔掩码，标记哪些像素的最大值超过 1

# 对超过 1 的地方，将 R、G、B 值同时按最大值进行归一化
localized_tensor[:, exceeds_one_mask] = localized_tensor[:, exceeds_one_mask] / (max_vals[exceeds_one_mask] + 0.00001)

# 将 Tensor 转换为 PIL 图像
tensor_to_pil = transforms.ToPILImage()

ground_truth_image = tensor_to_pil(ground_truth_tensor)
localized_image = tensor_to_pil(localized_tensor)

# 确保两张图片大小相同（可以选择调整大小）
width, height = ground_truth_image.size
localized_image = localized_image.resize((width, height))

# 创建一个新的空白图像，用来合成 ground truth 和 localized image
combined_image = Image.new('RGB', (width, height))

# 将图像转换为 NumPy 数组，方便逐像素操作
ground_truth_array = np.array(ground_truth_image)
localized_image_array = np.array(localized_image)

# 根据条件 x < ay 来合成图像
for y in range(height):
    for x in range(width):
        if x < (y * (width / height)):  # 根据比例 x < ay 来判断
            combined_image.putpixel((x, y), tuple(ground_truth_array[y, x]))  # 放置 ground truth
        else:
            combined_image.putpixel((x, y), tuple(localized_image_array[y, x]))  # 放置 localized image

# 绘制白色虚线对角线
draw = ImageDraw.Draw(combined_image)
line_length = 20  # 每个虚线段的长度
gap_length = 10   # 每段虚线之间的间隔

# 计算对角线的总长度
diagonal_length = int((width**2 + height**2)**0.5)

# 循环绘制虚线
for i in range(0, diagonal_length, line_length + gap_length):
    start_x = int(i * (width / diagonal_length))  # 起点 x
    start_y = int(i * (height / diagonal_length))  # 起点 y
    end_x = int((i + line_length) * (width / diagonal_length))  # 终点 x
    end_y = int((i + line_length) * (height / diagonal_length))  # 终点 y

    # 绘制虚线的段
    draw.line((start_x, start_y, end_x, end_y), fill="white", width=3)

# 画小框
small_box_start = (200, 150)  # 小框左上角起始点 (x, y)
small_box_width = 150         # 小框的宽度
small_box_height = 100        # 小框的高度
small_box_end = (small_box_start[0] + small_box_width, small_box_start[1] + small_box_height)

# 绘制蓝色小框
draw.rectangle([small_box_start, small_box_end], outline="green", width=2)

# 提取小框中的部分
small_box_region = combined_image.crop((small_box_start[0], small_box_start[1], small_box_end[0], small_box_end[1]))

# 放大小框中的部分
scale_factor = 1.6  # 放大倍数
large_box_region = small_box_region.resize((int(small_box_width * scale_factor), int(small_box_height * scale_factor)))

# 将放大的大框放置在小框旁边，覆盖图片部分区域
large_box_start_x = small_box_start[0]   # 小框右边再加10像素
large_box_start_y = small_box_start[1] + small_box_width - 20

# 确保大框不会超出图片边界
if large_box_start_x + large_box_region.width > width:
    large_box_start_x = width - large_box_region.width - 10
if large_box_start_y + large_box_region.height > height:
    large_box_start_y = height - large_box_region.height - 10

# 将放大的区域粘贴回原图中
combined_image.paste(large_box_region, (large_box_start_x, large_box_start_y))

# 画大框
large_box_end = (large_box_start_x + large_box_region.width, large_box_start_y + large_box_region.height)
draw.rectangle([large_box_start_x, large_box_start_y, large_box_end[0], large_box_end[1]], outline="green", width=3)

# 显示结果
combined_image.show()

# 保存图片到指定路径
combined_image.save(f"C:/Users/27118/Desktop/{scene}_compare.png")


In [73]:
from PIL import Image

# 场景顺序
scenes = ["chess", "fire", "heads", "office", "pumpkin", "redkitchen", "stairs"]

# 图片路径模板
image_paths = [f"C:/Users/27118/Desktop/{scene}_compare.png" for scene in scenes]

# 打开所有图片，并获取宽高信息
images = [Image.open(img_path) for img_path in image_paths]
width, height = images[0].size  # 假设所有图片大小相同

# 创建一个白色的图片，用作第 8 张图
white_image = Image.new('RGB', (width, height), color='white')

# 添加白色图片到 images 列表中，确保最后总共有 8 张图片
images.append(white_image)

# 创建一个新的空白图像，宽度为4张图片的宽度，高度为两行的图片高度
new_image = Image.new('RGB', (width * 4, height * 2))

# 逐一粘贴图片到新图像中
for i, img in enumerate(images):
    # 计算每张图片的位置
    x_offset = (i % 4) * width  # 每行最多放置4张图片
    y_offset = (i // 4) * height  # 放置到第几行
    new_image.paste(img, (x_offset, y_offset))

# 显示合成后的图像
new_image.show()

# 保存最终合成图像
new_image.save("C:/Users/27118/Desktop/combined_image.png")


In [10]:
import numpy as np

# 指定.npy文件的路径
f"D:/gs-localization/output/7scenes/{scene}/results_sparse.txt"
file_path = 'D:/gs-localization/output/7scenes/{scene}/rot_errors.npy'

# 加载.npy文件
data = np.load(file_path)

# 查看文件内容
print(np.median(data))


0.48435812485309393
