In [None]:
# Mount Google Drive so the paths below resolve inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Directory searched for per-video vectors named '<video_id>_bigru.npy'.
VEC_DIR   = '/content/drive/MyDrive/video_vectorizer/cnn_output'
# Input archive of auxiliary features; read below via its 'X', 'y' and 'video_id' arrays.
AUX_ALL_NPZ = '/content/drive/MyDrive/video_vectorizer/final_npz/aux_features_clean.npz'
# Output archive that receives the fused feature matrix, labels and video ids.
OUT_NPZ = '/content/drive/MyDrive/video_vectorizer/final_npz/bigru_early_fusion2.npz'


In [None]:
import pandas as pd
import numpy as np
import glob
import os
import tqdm

# Keep only the three target classes; an empty set disables label filtering
# in the pairing loop.
TARGET_CLASSES = {'Hamas', 'Fatah', 'Unaffiliated'}

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code if the archive is untrusted. Presumably needed
# because 'y' / 'video_id' were saved as object arrays — confirm, and keep
# this path pointed at a self-produced file only.
aux_all   = np.load(AUX_ALL_NPZ, allow_pickle=True)
X_aux_all = aux_all['X']             # (N, p) auxiliary feature rows
y_all     = aux_all['y']             # (N,)  str or int labels
ids_all   = aux_all['video_id']      # (N,)  video identifiers

# The three arrays are consumed row-by-row with zip(), which silently stops at
# the shortest input. Fail loudly here so misaligned features/labels/ids can
# never be paired with each other.
if not (len(X_aux_all) == len(y_all) == len(ids_all)):
    raise ValueError(
        f'Misaligned arrays in {AUX_ALL_NPZ}: '
        f'X has {len(X_aux_all)} rows, y has {len(y_all)}, '
        f'video_id has {len(ids_all)}'
    )



In [None]:
# Pair each auxiliary feature row with its per-video Bi-GRU vector.
# Rows are dropped for two reasons; both are counted so that a wrong VEC_DIR
# or an unexpected label vocabulary is visible instead of silently shrinking
# the dataset.
X_rows, y_rows, id_rows = [], [], []
n_skipped_label = 0        # label not in TARGET_CLASSES
n_skipped_missing = 0      # no '<video_id>_bigru.npy' file in VEC_DIR

for vec_aux, label, vid in tqdm.tqdm(zip(X_aux_all, y_all, ids_all),
                                     total=len(y_all),
                                     desc='Pairing', unit='vid'):
    # Normalise to plain str so set membership and the file name are stable
    # regardless of the dtype stored in the archive.
    label = str(label)
    vid   = str(vid)

    if TARGET_CLASSES and label not in TARGET_CLASSES:
        n_skipped_label += 1
        continue                      # skip unwanted label

    extension_vec_path = os.path.join(VEC_DIR, f'{vid}_bigru.npy')
    if not os.path.exists(extension_vec_path):
        n_skipped_missing += 1
        continue                      # skip if no Bi-GRU vector

    vec_video = np.load(extension_vec_path)   # expected 1-D, (768,) — TODO confirm
    # Early fusion: video vector first, auxiliary features appended after it.
    X_rows.append(np.hstack([vec_video, vec_aux]))
    y_rows.append(label)
    id_rows.append(vid)

print(f'paired {len(X_rows)} / {len(y_all)} videos  |  '
      f'skipped: {n_skipped_label} non-target label, '
      f'{n_skipped_missing} missing Bi-GRU vector')

# stack & save --------------------------------------------------------
# np.vstack raises an opaque "need at least one array" error on an empty
# list; report the actual cause (nothing was paired) before writing anything.
if not X_rows:
    raise ValueError(
        'No videos were paired: every row was filtered out by TARGET_CLASSES '
        f'or had no matching *_bigru.npy file in {VEC_DIR}'
    )

X = np.vstack(X_rows).astype(np.float32)     # (M, video_dim + p), one fused row per kept video
y = np.array(y_rows)                         # (M,)  strings
video_id = np.array(id_rows)                 # (M,)

np.savez_compressed(OUT_NPZ, X=X, y=y, video_id=video_id)
# Derive the reported file name from OUT_NPZ so the message cannot go stale
# when the output path is changed.
print(f'✓ {os.path.basename(OUT_NPZ)}  written → {OUT_NPZ}  |  shape {X.shape}')