In [71]:
import cv2 as cv
import numpy as np
from sklearn.decomposition import PCA
import os
import re
import glob
import json

In [72]:
def get_out_folder(file_name):
    """
    Get the output folder path based on the input file name.
    """
    # Extract the number
    match = re.match(r"(\d+)_", file_name)
    if match:
        number = int(match.group(1))
    else:
        number = "unknown"

    # Check for 'query' or 'reference'
    if "query" in file_name:
        subfolder = "query"
    elif "reference" in file_name:
        subfolder = "reference"
    else:
        subfolder = "other"

    output_folder = os.path.join("frames", subfolder, f"vid_{number}")
    return output_folder

In [73]:
def extract_frames(video_path, output_folder, target_fps=15, max_pixels=120000):
    """
    Extract frames at target_fps and resize so each saved frame has <= max_pixels.
    Creates (or loads) a mapping JSON with:
      - orig_fps:    original capture FPS
      - frame_step:  sampling step
      - orig_indices: list of original frame numbers for each saved frame
    """
    mapping_path = os.path.join(output_folder, "frame_mapping.json")
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # If mapping already exists, load and return it
    if os.path.isfile(mapping_path):
        with open(mapping_path, "r") as f:
            data = json.load(f)
        return data["orig_fps"], data["frame_step"], data["orig_indices"]

    # Otherwise extract frames and build mapping
    cap = cv.VideoCapture(video_path)
    if not cap.isOpened():
        raise IOError(f"Cannot open video {video_path}")
    orig_fps   = cap.get(cv.CAP_PROP_FPS) or target_fps
    frame_step = max(int(round(orig_fps / target_fps)), 1)

    orig_indices = []
    count = 0   # original frame index
    saved = 0   # downsampled frame index

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if count % frame_step == 0:
            # resize to <= max_pixels
            h, w = frame.shape[:2]
            scale = (max_pixels / float(w * h)) ** 0.5
            if scale < 1.0:
                frame = cv.resize(
                    frame,
                    (int(w*scale), int(h*scale)),
                    interpolation=cv.INTER_AREA
                )
            # save frame
            frame_filename = os.path.join(output_folder, f"frame_{saved:04d}.jpg")
            cv.imwrite(frame_filename, frame)
            orig_indices.append(count)
            saved += 1
        count += 1

    cap.release()

    # Save the mapping JSON
    with open(mapping_path, "w") as f:
        json.dump({
            "orig_fps":    orig_fps,
            "frame_step":  frame_step,
            "orig_indices": orig_indices
        }, f)

    return orig_fps, frame_step, orig_indices

In [74]:
def match_query_to_reference(video_file, pca_model=None, target_fps=15, max_pixels=120000):
    """
    Match query frames to reference frames using PCA for dimensionality reduction.
    Args:
        video_file (str): Path to the input video file.
        pca_model (PCA, optional): Pre-trained PCA model. If None, a new model will be created.
        pca_dim (int): Number of dimensions for PCA.
        lambda_val (float): Regularization parameter for PCA.
        fps (int): Frames per second for the video.
    Returns:
        tuple: Output folder path and PCA model.
    """
    # Determine folders and extract frames if needed
    print(f"Processing video file: {video_file}...")
    out_folder = get_out_folder(os.path.basename(video_file))
    orig_fps, frame_step, orig_indices = extract_frames(video_file, out_folder, target_fps, max_pixels)
    return out_folder, pca_model, orig_fps, frame_step, orig_indices

In [75]:
from tqdm import tqdm

def load_frame_descriptors(frame_folder, sift, pca_model=None, pca_dim=64, fit_pca=False):
    # Could multithread
    
    """
    Load frames from folder, extract SIFT descriptors per frame, apply PCA, return frame-level vectors.
    If fit_pca=True, stack all keypoint descs for PCA fit; else apply existing pca_model.
    """
    # Gather frame paths
    frame_paths = sorted(glob.glob(os.path.join(frame_folder, "frame_*.jpg")))
    all_descs = []
    key_descs = []

    # First pass: collect for PCA if needed
    # if fit_pca:
    for fp in tqdm(frame_paths, desc="Collecting descriptors for PCA"):
        img = cv.imread(fp, cv.IMREAD_GRAYSCALE)
        _, des = sift.detectAndCompute(img, None)
        if des is not None:
            key_descs.append(des)
    stacked = np.vstack(key_descs)
    pca_model = PCA(n_components=pca_dim, whiten=True, svd_solver='auto')
    pca_model.fit(stacked)
    
    print(f"Fitted PCA model with {pca_dim} dimensions.")
    print(f"Explained variance ratio: {pca_model.explained_variance_ratio_}")
    print(f"Total variance explained: {np.sum(pca_model.explained_variance_ratio_)}")
    print(f"Number of components: {pca_model.n_components_}")

    # Second pass: extract per-frame vector
    for fp in tqdm(frame_paths, desc="Extracting frame vectors"):
        img = cv.imread(fp, cv.IMREAD_GRAYSCALE)
        _, des = sift.detectAndCompute(img, None)
        if des is not None and pca_model is not None:
            des_pca = pca_model.transform(des)
            frame_vec = np.mean(des_pca, axis=0)
        else:
            frame_vec = np.zeros(pca_dim, dtype=np.float32)
        all_descs.append(frame_vec)
        
    print(f"Extracted {len(all_descs)} frame vectors from {len(frame_paths)} frames.")
    print(f"Shape of each frame vector: {all_descs[0].shape if all_descs else 'N/A'}")
    print(f"Total number of frames processed: {len(all_descs)}")
    print(f"Shape of all descriptors: {np.array(all_descs).shape}")
    print(f"Shape of PCA model components: {pca_model.components_.shape if pca_model else 'N/A'}")
    return np.vstack(all_descs), pca_model

In [76]:
def compute_cte_correlation(query_desc, ref_desc, lambda_val=1e-3):
    Tq, D = query_desc.shape
    Tr, _ = ref_desc.shape
    N = 1 << int(np.ceil(np.log2(max(Tq, Tr))))
    Q = np.zeros((N, D), dtype=np.float32)
    R = np.zeros((N, D), dtype=np.float32)
    Q[:Tq] = query_desc; R[:Tr] = ref_desc
    Qf = np.fft.rfft(Q, n=N, axis=0)
    Rf = np.fft.rfft(R, n=N, axis=0)
    num = np.sum(np.conj(Qf) * Rf, axis=1)
    den = np.sum(np.abs(Qf) ** 2, axis=1)
    scores = np.fft.irfft(num / (den + lambda_val), n=N)
    return scores

In [77]:
def find_best_offset(scores, fps=15):
    best_frame = int(np.argmax(scores))
    print(f"Best frame offset: {best_frame} (at {best_frame / fps:.4f} seconds)")
    return best_frame, best_frame / fps

In [78]:
sift = cv.SIFT_create()
pca_model = None
results = []

videos_idx_limit = 1 # Limit to which video number we want to stop process (1 - 15)

for idx in range(1, videos_idx_limit + 1):
    idx = 1 # Set to a specific index for testing, remove this line for full loop
    q_file = f"train/task1/{idx:02d}_query.mp4"
    r_file = f"train/task1/{idx:02d}_reference.mp4"
    
    print(f"Processing video pair {idx:02d}: {q_file} (query) and {r_file} (reference)")
    
    # Process query video
    q_folder, pca_model, q_fps, q_step, q_map = match_query_to_reference(q_file, pca_model, target_fps=20, max_pixels=180000)
    print("first time fitting PCA on query frames")
    q_desc, pca_model = load_frame_descriptors(q_folder, sift, pca_model, pca_dim=64, fit_pca=(pca_model is None))
    
    # Process reference video
    r_folder, _, r_fps, r_step, r_map = match_query_to_reference(r_file, pca_model, target_fps=20, max_pixels=180000)
    print("Extracting reference frames")
    r_desc, _ = load_frame_descriptors(r_folder, sift, pca_model, pca_dim=64, fit_pca=False)
    
    
    scores = compute_cte_correlation(q_desc, r_desc, lambda_val=1e-3)
    
    # Find best *downsampled* frame delta
    N     = scores.shape[0]
    delta = int(np.argmax(scores))
    if delta > N//2:
        delta -= N

    # Map back to original reference frame
    delta = max(delta, 0)            # clamp if you only allow forward shifts
    orig_frame = r_map[delta]        # true original-frame index
    orig_time  = orig_frame / r_fps  # in seconds

    print(f"Pair {idx:02d}: matched original frame {orig_frame} "
        f"({orig_time:.2f}s)")
    results.append((idx, orig_frame, orig_time))


Processing video pair 01: train/task1/01_query.mp4 (query) and train/task1/01_reference.mp4 (reference)
Processing video file: train/task1/01_query.mp4...
first time fitting PCA on query frames


Collecting descriptors for PCA: 100%|██████████| 60/60 [00:03<00:00, 15.45it/s]


Fitted PCA model with 64 dimensions.
Explained variance ratio: [0.12943125 0.07260717 0.04968274 0.04381259 0.03843722 0.03686541
 0.03447191 0.03251746 0.03159618 0.02473073 0.0206832  0.02050975
 0.01949956 0.01876873 0.01774731 0.01685929 0.01583739 0.01538047
 0.01467589 0.01320246 0.01283477 0.01241744 0.01147002 0.01094152
 0.01028205 0.00973011 0.00937545 0.00915288 0.00823307 0.00716604
 0.00703366 0.00692178 0.00680079 0.00664198 0.00659646 0.00629149
 0.00590707 0.00566627 0.00549446 0.00543285 0.00535972 0.00509093
 0.00476356 0.00472138 0.00445179 0.00437367 0.0042983  0.00420794
 0.004095   0.00400351 0.00388165 0.0037839  0.00374683 0.00370276
 0.00338279 0.00333504 0.00321187 0.00315278 0.00308111 0.00283374
 0.00281845 0.00274101 0.00258864 0.00250044]
Total variance explained: 0.9218336939811707
Number of components: 64


Extracting frame vectors: 100%|██████████| 60/60 [00:03<00:00, 16.58it/s]


Extracted 60 frame vectors from 60 frames.
Shape of each frame vector: (64,)
Total number of frames processed: 60
Shape of all descriptors: (60, 64)
Shape of PCA model components: (64, 128)
Processing video file: train/task1/01_reference.mp4...
Extracting reference frames


Collecting descriptors for PCA: 100%|██████████| 600/600 [00:35<00:00, 16.73it/s]


Fitted PCA model with 64 dimensions.
Explained variance ratio: [0.12195007 0.07196461 0.04983318 0.04736897 0.04001319 0.03796236
 0.03645194 0.03519392 0.03283417 0.02388236 0.02276749 0.02002756
 0.01969174 0.01895109 0.0178208  0.01646458 0.01506472 0.01474224
 0.0141797  0.01303325 0.01269642 0.01218205 0.01105565 0.01044854
 0.00995714 0.00984532 0.00935247 0.00904017 0.00783704 0.00739236
 0.00724711 0.00693813 0.00661277 0.0065147  0.00612273 0.00597579
 0.00575747 0.0055827  0.00552923 0.00536137 0.00510534 0.00509681
 0.00485429 0.00456053 0.00448023 0.00442271 0.0043247  0.00413402
 0.00409416 0.00400369 0.00395451 0.0039111  0.00377584 0.00369176
 0.00348304 0.00333615 0.00319307 0.00300111 0.00287097 0.002828
 0.00275751 0.00270829 0.00260433 0.00255512]
Total variance explained: 0.9213943481445312
Number of components: 64


Extracting frame vectors: 100%|██████████| 600/600 [00:33<00:00, 17.73it/s]


Extracted 600 frame vectors from 600 frames.
Shape of each frame vector: (64,)
Total number of frames processed: 600
Shape of all descriptors: (600, 64)
Shape of PCA model components: (64, 128)
Pair 01: matched original frame 897 (14.95s)


In [79]:
# # —— Insert this block right here ——
# N = scores.shape[0]
# delta = int(np.argmax(scores))
# # wrap‐around correction
# if delta > N // 2:
#     delta = delta - N

# # map back to the original video’s frame rate
# # these must match how you extracted frames:
# #   orig_fps  = cap.get(cv2.CAP_PROP_FPS)    (e.g. 30)
# #   target_fps = 15
# #   frame_step = round(orig_fps/target_fps)  (e.g. 2)
# orig_frame = delta * frame_step
# orig_time  = orig_frame / orig_fps
# # —— end insertion ——  

# print(f"Pair {idx:02d}: match at normalized frame {delta},"
#       f" original frame {orig_frame}, time {orig_time:.2f}s")

In [80]:
import pandas as pd

# Suppose `results` is your list of (pair, best_frame, best_time_s)
# from the matching loop:

# Example:
# results = [
#     (1, 45, 3.00),
#     (2, 92, 6.13),
#     # ...
# ]

df = pd.DataFrame(results, columns=["pair", "best_frame", "best_time_s"])
print(df.to_markdown(index=False))

|   pair |   best_frame |   best_time_s |
|-------:|-------------:|--------------:|
|      1 |          897 |         14.95 |


In [81]:
# results = [
#     (1, 422, 14.0667),
#     (2, 625, 20.8333),
#     (3, 274, 9.13333),
#     (4, 418, 13.9333),
#     (5, 186, 6.2),
#     (6, 618, 20.6),
#     (7, 978, 32.6),
#     (8, 472, 15.7333),
#     (9, 649, 21.6333),
#     (10, 804, 26.8),
#     (11, 252, 8.4),
#     (12, 77, 2.56667),
#     (13, 448, 14.9333),
#     (14, 778, 25.9333),
#     (15, 223, 7.43333)
# ]
# df = pd.DataFrame(results, columns=["pair", "best_frame", "best_time_s"])

In [82]:
ground_truth = [
    585,
    # 756,
    # 650,
    # 360,
    # 340,
    # 560,
    # 590,
    # 1497,
    # 1196,
    # 1160,
    # 1120,
    # 450,
    # 636,
    # 610,
    # 1250
]

In [83]:
accuracy = np.mean([abs(gt - bf) <= 60 for gt, (_, bf, _) in zip(ground_truth, results)])
difference = [abs(gt - bf) for gt, (_, bf, _) in zip(ground_truth, results)]
print(f"Ground truth differences: {difference}")
print(f"Accuracy: {accuracy:.2%} (±60 frames tolerance)")

Ground truth differences: [312]
Accuracy: 0.00% (±60 frames tolerance)
