In [None]:
import pandas as pd 



# Read the train and test tables from CSV, in that order.
# Paths are relative to the notebook's working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [None]:
# Print the column / dtype / non-null summary of the training table.
train_df.info()
# Last expression in the cell, so the notebook renders the first rows.
train_df.head()

# Same inspection for the test table, currently disabled:
# test_df.info()
# test_df.head()




In [None]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetV2M
import os 
import numpy as np

# Choose the GPU allocator first: the variable has to be in the environment
# before TensorFlow sets up its GPU devices, so it is set ahead of every
# TensorFlow call in this cell (policy setup and model construction).
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

# Global Keras dtype policy used by the layers created below.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Image directories and the batch size used by the tf.data pipelines.
train_images_path = './data/train_images/'
test_images_path = './data/test_images/'
batch_size = 32

# ImageNet-pretrained EfficientNetV2M used as a fixed feature extractor:
# no classification head (include_top=False), average pooling on the output
# (pooling='avg'), and frozen weights (trainable=False).
base_model = EfficientNetV2M(weights='imagenet', include_top=False, pooling='avg')
base_model.trainable = False


def load_and_preprocess_image(img_path, target_size=(480, 480)):
    """Read a JPEG file from disk and resize it for the feature extractor.

    Args:
        img_path: Path of the JPEG file. Callers in this notebook pass either
            a Python string or a string element of a ``tf.data.Dataset``.
        target_size: ``(height, width)`` to resize the image to. Defaults to
            ``(480, 480)``, the size that was previously hard-coded here.

    Returns:
        The decoded 3-channel image resized to ``target_size``.
    """
    raw_bytes = tf.io.read_file(img_path)
    img = tf.image.decode_jpeg(raw_bytes, channels=3)
    return tf.image.resize(img, target_size)

def extract_features_batch(image_paths):
    """Return the backbone's output for a collection of image files.

    Every path is loaded through ``load_and_preprocess_image``; the resulting
    images are stacked into a single array and handed to
    ``base_model.predict`` in one call.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        Whatever ``base_model.predict`` returns for the stacked images.
    """
    images = [load_and_preprocess_image(path) for path in image_paths]
    stacked = np.stack(images)
    return base_model.predict(stacked)

# Full path of every training image, in the same order as the rows of train_df.
image_paths_train = [os.path.join(train_images_path, f"{img_id}.jpeg") for img_id in train_df['id']]

# Input pipeline: path -> decoded and resized image -> batches, with prefetching.
train_dataset = tf.data.Dataset.from_tensor_slices(image_paths_train)
train_dataset = train_dataset.map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
train_dataset = train_dataset.batch(batch_size)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# Run the whole dataset through the backbone in one predict() call. predict()
# iterates the batches itself and returns the per-batch outputs concatenated in
# order, which replaces the manual "predict per batch, then np.vstack" loop.
all_features = base_model.predict(train_dataset)

# One feature vector per row is required for the column assignment below.
assert len(all_features) == len(train_df), (
    f"expected {len(train_df)} feature rows, got {len(all_features)}"
)
train_df['features'] = list(all_features)

# DataFrame.info() prints its report and returns None, so it must not be
# embedded in an f-string (that printed "Train df info: None").
print('Train df info:')
train_df.info()

train_df.to_pickle('./data/train_df_with_features.pkl')

# Full path of every test image, in the same order as the rows of test_df.
image_paths_test = [os.path.join(test_images_path, f"{img_id}.jpeg") for img_id in test_df['id']]

# Input pipeline: path -> decoded and resized image -> batches, with prefetching.
test_dataset = tf.data.Dataset.from_tensor_slices(image_paths_test)
test_dataset = test_dataset.map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(batch_size)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

# Extract features from the TEST pipeline. The previous loop iterated over
# train_dataset here, so the test frame received training features (or the
# assignment failed on a length mismatch). predict() iterates the batches
# itself and returns the per-batch outputs concatenated in order.
all_features = base_model.predict(test_dataset)

# One feature vector per row is required for the column assignment below.
assert len(all_features) == len(test_df), (
    f"expected {len(test_df)} feature rows, got {len(all_features)}"
)
test_df['features'] = list(all_features)

# DataFrame.info() prints its report and returns None, so it must not be
# embedded in an f-string (that printed "Test df info: None").
print('Test df info:')
test_df.info()

test_df.to_pickle('./data/test_df_with_features.pkl')




