In [None]:
import math
import os
import sys

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [None]:
# Resolve the directory two levels above the current working directory and,
# if it is not already listed, append it to the import search path.
# NOTE(review): presumably this is where the `img2vec` module lives — confirm.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

# Imported only after the sys.path adjustment above has run.
from img2vec import rgb2emb

### Read the data

In [None]:
# The three split files sit side by side in the same data directory.
data_dir = os.path.join('..', '..', 'data')

train_data = pd.read_csv(os.path.join(data_dir, 'train.csv'))
val_data = pd.read_csv(os.path.join(data_dir, 'val.csv'))
test_data = pd.read_csv(os.path.join(data_dir, 'test.csv'))

In [None]:
def construct_img_path(row):
    """Build the relative path of the face image described by one metadata row.

    Args:
        row: Mapping-like record (e.g. one DataFrame row) providing the keys
            ``user_id``, ``face_id`` and ``original_image``.

    Returns:
        ``../../data/faces/<user_id>/coarse_tilt_aligned_face.<face_id>.<original_image>``
        assembled with the platform's path separator.
    """
    # Per-user folder first, so the keys are read in the order user_id, face_id, original_image.
    user_dir = os.path.join("..", "..", "data", "faces", row['user_id'])
    # face_id is stringified explicitly; original_image is expected to be a string already.
    file_name = ".".join(["coarse_tilt_aligned_face", str(row['face_id']), row['original_image']])
    return os.path.join(user_dir, file_name)


# Add an 'img_path' column to every split, computed row by row
# (the frames are modified in place).
for split_frame in (train_data, val_data, test_data):
    split_frame['img_path'] = split_frame.apply(construct_img_path, axis=1)


In [None]:
# Flag, per row, whether the computed image path is actually present on disk
# (the frames are modified in place).
for split_frame in (train_data, val_data, test_data):
    split_frame['img_exists'] = split_frame['img_path'].apply(os.path.exists)

In [None]:
# NOTE(review): train_data_filtered, val_data_filtered and test_data_filtered are read
# here but are not assigned in any of the cells shown above; on a fresh kernel this
# cell would raise NameError. TODO confirm which cell is meant to create them.
# NOTE(review): the cell overwrites its own inputs, so running it a second time would
# merge the path columns again — presumably the upstream cells must be re-run first.
join_keys = ['user_id', 'face_id', 'original_image']
path_columns = join_keys + ['img_path', 'img_exists']

# Bring img_path / img_exists onto each filtered split by matching on the key columns.
train_data_filtered = train_data_filtered.merge(train_data[path_columns], on=join_keys)
val_data_filtered = val_data_filtered.merge(val_data[path_columns], on=join_keys)
test_data_filtered = test_data_filtered.merge(test_data[path_columns], on=join_keys)

# Now keep only the rows whose image file exists
train_data_filtered = train_data_filtered.loc[train_data_filtered['img_exists'] == True]
val_data_filtered = val_data_filtered.loc[val_data_filtered['img_exists'] == True]
test_data_filtered = test_data_filtered.loc[test_data_filtered['img_exists'] == True]

In [None]:
def _split_paths_and_labels(frame):
    """Return ``(image paths as a list, encoded labels via .values)`` for one split."""
    # Labels come from the already-encoded column, not the raw label text.
    return frame['img_path'].tolist(), frame['combined_label_encoded'].values


train_image_paths, train_labels = _split_paths_and_labels(train_data_filtered)
val_image_paths, val_labels = _split_paths_and_labels(val_data_filtered)
test_image_paths, test_labels = _split_paths_and_labels(test_data_filtered)

### Define a function that processes features in batches and stores them to avoid recomputation

In [None]:
def preprocess_and_save_features(image_paths, output_file, batch_size=64):
    """Embed images in batches and cache the stacked result on disk.

    If ``output_file`` already exists it is loaded with ``np.load`` and
    returned without touching the images. Otherwise ``image_paths`` is passed
    to ``rgb2emb`` in slices of ``batch_size``, the per-batch results are
    stacked row-wise with ``np.vstack``, written with ``np.save`` and returned.

    Args:
        image_paths: Sequence of image paths; must support ``len`` and slicing.
        output_file: Cache location. It should end in ``.npy``: ``np.save``
            appends that suffix when it is missing, and the existence check
            on ``output_file`` would then never find the cache.
        batch_size: Number of paths handed to ``rgb2emb`` per call (>= 1).

    Returns:
        The cached object if ``output_file`` exists, otherwise the newly
        computed, stacked feature array.

    Raises:
        ValueError: On a cache miss, if ``batch_size`` is smaller than 1 or
            ``image_paths`` is empty (there would be nothing to stack).
    """
    if os.path.exists(output_file):
        print(f"Loading pre-processed features from {output_file}")
        cached_features = np.load(output_file)
        # A cache written for a different set of images would silently misalign
        # features and labels, so point out a row-count mismatch.
        cached_shape = getattr(cached_features, "shape", None)
        if cached_shape is not None and cached_shape[:1] != (len(image_paths),):
            print(f"Warning: {output_file} has shape {cached_shape} but "
                  f"{len(image_paths)} image paths were given; the cache may be stale")
        return cached_features

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total_images = len(image_paths)
    if total_images == 0:
        raise ValueError("image_paths is empty; there are no features to compute")

    # Create the target folder before the expensive embedding pass, so the
    # final np.save cannot fail on a missing directory after all the work.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print(f"Processing {total_images} images and saving to {output_file}")
    total_batches = math.ceil(total_images / batch_size)
    all_features = []

    for batch_number, start in enumerate(range(0, total_images, batch_size), start=1):
        print(f"Processing batch {batch_number}/{total_batches}")
        batch_paths = image_paths[start:start + batch_size]
        all_features.append(rgb2emb(batch_paths))

    all_features = np.vstack(all_features)
    np.save(output_file, all_features)
    return all_features

### Process and save features

In [None]:
# Folder that holds the cached feature arrays for this embedding type.
model_folder = os.path.join('..', '..', 'models', "rgbemb")

# One cache file per split.
train_features_file = os.path.join(model_folder, 'train_features.npy')
val_features_file = os.path.join(model_folder, 'val_features.npy')
test_features_file = os.path.join(model_folder, 'test_features.npy')

# Each call loads the cache file if it exists, otherwise computes and saves the features.
train_features = preprocess_and_save_features(train_image_paths, train_features_file)
val_features = preprocess_and_save_features(val_image_paths, val_features_file)
test_features = preprocess_and_save_features(test_image_paths, test_features_file)


### Standardize features

In [None]:
# Fit the scaler on the training features only and keep the standardized
# output; fit_transform returns a new array and leaves train_features untouched.
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)

# Apply the statistics learned from the training split to the other splits,
# so validation/test data never influence the scaling parameters.
val_features_scaled = scaler.transform(val_features)
test_features_scaled = scaler.transform(test_features)

# Save the scaler
# NOTE(review): this writes to the current working directory rather than
# model_folder — confirm that is the intended location.
joblib.dump(scaler, 'feature_scaler.pkl')
print("Feature scaler created and saved successfully.")