In [62]:
import cv2 as cv
import numpy as np
from sklearn.decomposition import PCA
import os
import re
import glob

In [63]:
def get_out_folder(file_name):
    """
    Build the frame-output directory for a video file name.

    The result has the form ``frames/<role>/vid_<number>`` where:
      * ``<number>`` is the integer formed by the digits that start the name
        and are directly followed by an underscore (``"01_query.mp4"`` -> 1),
        or the literal ``unknown`` when the name does not start that way;
      * ``<role>`` is ``query`` if the name contains "query", otherwise
        ``reference`` if it contains "reference", otherwise ``other``.
    """
    # Leading "<digits>_" prefix, e.g. "07_" -> 7 (leading zeros are dropped by int()).
    leading_digits = re.match(r"(\d+)_", file_name)
    number = int(leading_digits.group(1)) if leading_digits else "unknown"

    # First role keyword found in the name wins; "query" is checked before "reference".
    subfolder = next(
        (role for role in ("query", "reference") if role in file_name),
        "other",
    )

    return os.path.join("frames", subfolder, f"vid_{number}")

In [64]:
def extract_frames(video_path, output_folder, target_fps=15, max_pixels=120000):
    """
    Extract sub-sampled, down-scaled frames from a video into ``output_folder``.

    The folder doubles as a cache: when it already contains any entries the
    video is assumed to have been extracted by an earlier call and nothing is
    done (so changing ``target_fps`` / ``max_pixels`` has no effect until the
    folder is removed). Frames are written as ``frame_0000.jpg``,
    ``frame_0001.jpg``, ... in the order they are read.

    Args:
        video_path (str): Path to the input video file.
        output_folder (str): Path to the folder where frames will be saved.
        target_fps (int): Desired sampling rate. Every
            ``round(orig_fps / target_fps)``-th frame is kept (never fewer than
            every frame), so the rate of the saved frames is
            ``orig_fps / frame_step`` and can differ from ``target_fps``.
        max_pixels (int): Frames with more pixels than this are shrunk, keeping
            the aspect ratio, to roughly this area. Smaller frames are saved
            unchanged.

    Raises:
        IOError: If the video cannot be opened. The output folder is not
            created in that case.
    """
    # A non-empty folder counts as a finished extraction. An empty folder
    # (e.g. left behind by a run that failed) does not, otherwise that failure
    # would be masked by a silent no-op on every later call.
    if os.path.isdir(output_folder) and os.listdir(output_folder):
        return

    cap = cv.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise IOError(f"Cannot open video {video_path}")

        # Create the folder only once the video is known to be readable.
        os.makedirs(output_folder, exist_ok=True)

        # cap.get() yields a falsy value when the rate is unknown; fall back to
        # target_fps so that every frame is kept in that case.
        orig_fps = cap.get(cv.CAP_PROP_FPS) or target_fps
        frame_step = max(int(round(orig_fps / target_fps)), 1)

        count = 0  # frames read so far
        saved = 0  # frames written so far (drives the file name)
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if count % frame_step == 0:
                # Shrink to at most ~max_pixels; never enlarge.
                h, w = frame.shape[:2]
                scale = (max_pixels / float(w * h)) ** 0.5
                if scale < 1.0:
                    # Clamp to 1 so extreme aspect ratios cannot truncate a side to 0.
                    new_size = (max(int(w * scale), 1), max(int(h * scale), 1))
                    frame = cv.resize(frame, new_size, interpolation=cv.INTER_AREA)
                cv.imwrite(os.path.join(output_folder, f"frame_{saved:04d}.jpg"), frame)
                saved += 1
            count += 1
    finally:
        # Release the capture on every path, including errors.
        cap.release()

In [65]:
def match_query_to_reference(video_file, pca_model=None, target_fps=15, max_pixels=120000):
    """
    Prepare the frame folder for one video and hand the PCA model back.

    Despite its name this function performs no matching: it derives the frame
    folder from the video's base name, makes sure the frames are extracted
    there, and returns.
    Args:
        video_file (str): Path to the input video file.
        pca_model (PCA, optional): Passed through to the caller unchanged; it is
            neither used nor fitted here.
        target_fps (int): Forwarded to ``extract_frames``.
        max_pixels (int): Forwarded to ``extract_frames``.
    Returns:
        tuple: ``(frame_folder, pca_model)``.
    """
    print(f"Processing video file: {video_file}...")

    # The folder layout is keyed on the file name only, not on its directory.
    base_name = os.path.basename(video_file)
    frames_dir = get_out_folder(base_name)

    extract_frames(video_file, frames_dir, target_fps=target_fps, max_pixels=max_pixels)
    return frames_dir, pca_model

In [66]:
from tqdm import tqdm

def _sift_descriptors(frame_path, sift):
    """Read one frame as grayscale and return its SIFT descriptor matrix (None if SIFT finds none)."""
    img = cv.imread(frame_path, cv.IMREAD_GRAYSCALE)
    _, des = sift.detectAndCompute(img, None)
    return des


def load_frame_descriptors(frame_folder, sift, pca_model=None, pca_dim=64, fit_pca=False):
    """
    Turn every ``frame_*.jpg`` in a folder into one fixed-length vector.

    For each frame (in sorted file-name order) the SIFT descriptors are
    projected with the PCA model and averaged into a single vector. Frames
    without descriptors, or any frame when no PCA model is available, yield an
    all-zero vector.

    Args:
        frame_folder (str): Folder containing ``frame_*.jpg`` files.
        sift: Object exposing ``detectAndCompute(image, mask)``.
        pca_model (PCA, optional): Fitted model used to project descriptors.
            Ignored (replaced) when ``fit_pca`` is True.
        pca_dim (int): Number of PCA components to fit when ``fit_pca`` is
            True; also the length of the zero vectors when no model is given.
        fit_pca (bool): If True, fit a new whitened PCA on the descriptors of
            all frames in this folder before projecting.

    Returns:
        tuple: ``(vectors, pca_model)`` where ``vectors`` has one row per frame.

    Raises:
        ValueError: If the folder has no ``frame_*.jpg`` files, or if
            ``fit_pca`` is True and no frame produced any descriptor.
    """
    # Could multithread

    # Gather frame paths
    frame_paths = sorted(glob.glob(os.path.join(frame_folder, "frame_*.jpg")))
    if not frame_paths:
        # Fail with a clear message instead of an obscure np.vstack error later.
        raise ValueError(f"No frame_*.jpg files found in {frame_folder}")

    # Descriptors computed while fitting are kept so that the projection pass
    # below does not read every image and run SIFT a second time.
    cached_descs = None

    if fit_pca:
        cached_descs = [
            _sift_descriptors(fp, sift)
            for fp in tqdm(frame_paths, desc="Collecting descriptors for PCA")
        ]
        key_descs = [des for des in cached_descs if des is not None]
        if not key_descs:
            raise ValueError(f"No SIFT descriptors found in {frame_folder}; cannot fit PCA")
        stacked = np.vstack(key_descs)
        pca_model = PCA(n_components=pca_dim, whiten=True, svd_solver='auto')
        pca_model.fit(stacked)

        print(f"Fitted PCA model with {pca_dim} dimensions.")
        print(f"Explained variance ratio: {pca_model.explained_variance_ratio_}")
        print(f"Total variance explained: {np.sum(pca_model.explained_variance_ratio_)}")
        print(f"Number of components: {pca_model.n_components_}")

    # Zero vectors must have the same length as projected ones, which is set by
    # the model actually in use, not by the pca_dim argument.
    out_dim = pca_dim
    if pca_model is not None:
        out_dim = getattr(pca_model, "n_components_", pca_dim)

    # Per-frame vector: mean of the PCA-projected descriptors.
    all_descs = []
    for idx, fp in enumerate(tqdm(frame_paths, desc="Extracting frame vectors")):
        des = cached_descs[idx] if cached_descs is not None else _sift_descriptors(fp, sift)
        if des is not None and pca_model is not None:
            des_pca = pca_model.transform(des)
            frame_vec = np.mean(des_pca, axis=0)
        else:
            frame_vec = np.zeros(out_dim, dtype=np.float32)
        all_descs.append(frame_vec)

    print(f"Extracted {len(all_descs)} frame vectors from {len(frame_paths)} frames.")
    print(f"Shape of each frame vector: {all_descs[0].shape if all_descs else 'N/A'}")
    print(f"Total number of frames processed: {len(all_descs)}")
    print(f"Shape of all descriptors: {np.array(all_descs).shape}")
    print(f"Shape of PCA model components: {pca_model.components_.shape if pca_model is not None else 'N/A'}")
    return np.vstack(all_descs), pca_model

In [67]:
def compute_cte_correlation(query_desc, ref_desc, lambda_val=1e-3):
    """
    Score every circular temporal shift of the query against the reference.

    Both ``(frames, dim)`` sequences are zero-padded along time to the next
    power of two that holds the longer one, transformed with a real FFT, and
    combined per frequency as

        sum_d conj(Q) * R  /  (sum_d |Q|^2 + lambda_val)

    before transforming back. Entry ``k`` of the result is the score for
    aligning the start of the query with reference frame ``k``; because the
    correlation is circular, indices wrap around the padded length.

    Args:
        query_desc (np.ndarray): Query vectors, shape ``(Tq, D)``.
        ref_desc (np.ndarray): Reference vectors, shape ``(Tr, D)``.
        lambda_val (float): Regularizer added to the query power spectrum.

    Returns:
        np.ndarray: 1-D score array whose length is the padded size.
    """
    n_query, dim = query_desc.shape
    n_ref, _ = ref_desc.shape

    # Padded length: smallest power of two >= the longer sequence.
    longest = max(n_query, n_ref)
    fft_len = 2 ** int(np.ceil(np.log2(longest)))

    # Zero-padded float32 copies of both sequences.
    padded_query = np.zeros((fft_len, dim), dtype=np.float32)
    padded_query[:n_query] = query_desc
    padded_ref = np.zeros((fft_len, dim), dtype=np.float32)
    padded_ref[:n_ref] = ref_desc

    # Spectra along the time axis, one column per descriptor dimension.
    query_spec = np.fft.rfft(padded_query, n=fft_len, axis=0)
    ref_spec = np.fft.rfft(padded_ref, n=fft_len, axis=0)

    # Collapse the descriptor dimension, then apply the regularized division.
    cross_power = (np.conj(query_spec) * ref_spec).sum(axis=1)
    query_power = (np.abs(query_spec) ** 2).sum(axis=1)
    regularized = cross_power / (query_power + lambda_val)

    return np.fft.irfft(regularized, n=fft_len)

In [68]:
def find_best_offset(scores, fps=15):
    """
    Locate the highest score and express its position in frames and seconds.

    Args:
        scores: Sequence of scores, one per candidate frame offset.
        fps: Frame rate used to convert the frame index to seconds.

    Returns:
        tuple: ``(frame_index, seconds)``; on ties the first maximum wins.
    """
    peak_index = int(np.argmax(scores))
    seconds = peak_index / fps
    return peak_index, seconds

In [69]:
# Driver cell: for each query/reference pair, extract frames, build per-frame
# descriptors, correlate the two sequences and record the best offset.
sift = cv.SIFT_create()
# Fitted once on the first query video (see fit_pca below) and then reused for
# every later query and reference video.
pca_model = None
# One (pair index, best frame offset, best offset in seconds) tuple per pair.
results = []

videos_idx_limit = 1 # Last pair index to process; pairs 1..videos_idx_limit are run (original note: valid range 1 - 15)

for idx in range(1, videos_idx_limit + 1):
    q_file = f"train/task1/{idx:02d}_query.mp4"
    r_file = f"train/task1/{idx:02d}_reference.mp4"
    
    print(f"Processing video pair {idx:02d}: {q_file} (query) and {r_file} (reference)")
    
    # Process query video
    # NOTE: extract_frames skips work when the frame folder already exists, so
    # changing target_fps / max_pixels here only takes effect after deleting it.
    q_folder, pca_model = match_query_to_reference(q_file, pca_model, target_fps=25, max_pixels=180000)
    # NOTE(review): this message is printed on every iteration, but PCA is only
    # fitted when pca_model is still None (i.e. on the first pair).
    print("first time fitting PCA on query frames")
    q_desc, pca_model = load_frame_descriptors(q_folder, sift, pca_model, pca_dim=64, fit_pca=(pca_model is None))
    
    # Process reference video
    r_folder, _ = match_query_to_reference(r_file, pca_model, target_fps=25, max_pixels=180000)
    print("Extracting reference frames")
    # The reference is always projected with the PCA fitted on the query side.
    r_desc, _ = load_frame_descriptors(r_folder, sift, pca_model, pca_dim=64, fit_pca=False)
    
    
    scores = compute_cte_correlation(q_desc, r_desc, lambda_val=1e-3)
    print(f"Computed CTE correlation scores with shape: {scores.shape}")
    print(f"Scores: {scores[:10]}")  # Print first 10 scores for verification
    # NOTE(review): frames are extracted with target_fps=25 above, yet the
    # offset is converted to seconds with fps=30. The rate of the saved frames
    # is orig_fps / frame_step inside extract_frames, so 30 may be intentional
    # for 30 fps sources -- confirm against the actual videos.
    # NOTE(review): scores come from a circular correlation over a padded
    # length, so best_frame can exceed the number of reference frames;
    # presumably that denotes a wrapped (negative) offset -- verify.
    best_frame, best_time = find_best_offset(scores, fps=30)
    results.append((idx, best_frame, best_time))
    print(f"Pair {idx:02d}: offset {best_frame} frames ({best_time:.2f}s)")


Processing video pair 01: train/task1/01_query.mp4 (query) and train/task1/01_reference.mp4 (reference)
Processing video file: train/task1/01_query.mp4...
first time fitting PCA on query frames


Collecting descriptors for PCA: 100%|██████████| 45/45 [00:01<00:00, 26.10it/s]


Fitted PCA model with 64 dimensions.
Explained variance ratio: [0.13364685 0.07236713 0.04777729 0.04365035 0.03892459 0.03587165
 0.0324444  0.03233577 0.03105574 0.02435083 0.02179296 0.0212551
 0.01936354 0.01897359 0.01732421 0.01696339 0.01562415 0.01480238
 0.01422864 0.01338267 0.01279366 0.01235731 0.01181194 0.0112356
 0.01041393 0.01005563 0.00940045 0.00902721 0.0077481  0.00745637
 0.00718191 0.00689055 0.00681362 0.00675104 0.00654068 0.00627357
 0.00609372 0.00577214 0.00570838 0.0054328  0.00534008 0.00507193
 0.00480839 0.00475587 0.00456237 0.00455519 0.00441729 0.0042141
 0.00411499 0.00393376 0.00382679 0.00374506 0.00368525 0.00360587
 0.00351729 0.0033998  0.00320866 0.00314879 0.00296239 0.00287407
 0.00279593 0.00270998 0.00266765 0.00261746]
Total variance explained: 0.9224327802658081
Number of components: 64


Extracting frame vectors: 100%|██████████| 45/45 [00:02<00:00, 22.02it/s]


Extracted 45 frame vectors from 45 frames.
Shape of each frame vector: (64,)
Total number of frames processed: 45
Shape of all descriptors: (45, 64)
Shape of PCA model components: (64, 128)
Processing video file: train/task1/01_reference.mp4...
Extracting reference frames


Extracting frame vectors: 100%|██████████| 450/450 [00:17<00:00, 25.44it/s]

Extracted 450 frame vectors from 450 frames.
Shape of each frame vector: (64,)
Total number of frames processed: 450
Shape of all descriptors: (450, 64)
Shape of PCA model components: (64, 128)
Computed CTE correlation scores with shape: (512,)
Scores: [-0.39294919 -0.3646284  -0.39549076 -0.40652204 -0.3671819  -0.38240468
 -0.39858979 -0.37130247 -0.34073845 -0.40034822]
Pair 01: offset 322 frames (10.73s)





In [70]:
import pandas as pd

# Summarize the matching results as a Markdown table.
# `results` is the list of (pair, best_frame, best_time_s) tuples filled by
# the matching loop in the previous cell, for example:
#
# results = [
#     (1, 45, 3.00),
#     (2, 92, 6.13),
#     # ...
# ]

# One row per processed pair; column order follows the tuple layout.
df = pd.DataFrame(results, columns=["pair", "best_frame", "best_time_s"])
print(df.to_markdown(index=False))

|   pair |   best_frame |   best_time_s |
|-------:|-------------:|--------------:|
|      1 |          322 |       10.7333 |
