# Check if in Colab and connect to Colab

In [None]:
# Detect whether the notebook runs on Google Colab by probing for the
# Colab-only `google.colab` package; IN_COLAB drives the path choices in the
# cells that follow.
try:
    from google.colab import drive
    IN_COLAB = True
    print("Running on Google Colab. ")
except ImportError:
    # Only a missing package means "not on Colab". A bare `except:` would also
    # hide unrelated failures such as KeyboardInterrupt.
    IN_COLAB = False
    print("Not running on Google Colab. ")

In [None]:
# Mount Google Drive at /content/drive/ -- only attempted when running on Colab.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive/')

# Selecting images different from those used for the test set

In [None]:
import pandas as pd
import os

# Resolve the dataset root: the shared Drive on Colab, a local folder otherwise.
path_dataset = (
    "/content/drive/Shareddrives/AI4CYBSEC/face_dataset"
    if IN_COLAB
    else "./face_dataset"
)

# Load the identity metadata CSV that lives in the dataset root.
identity_meta_NN1_name = "meta_identity_NN1.csv"
path_identity_csv = os.path.join(path_dataset, identity_meta_NN1_name)
identity_meta_NN1 = pd.read_csv(path_identity_csv)

In [None]:
# Pick the location of selected_data.csv for the current environment and load it.
path_selected = (
    "/content/drive/Shareddrives/AI4CYBSEC/face_dataset/selected_data.csv"
    if IN_COLAB
    else "./face_dataset/selected_data.csv"
)
selected_identity_meta = pd.read_csv(path_selected)

In [None]:
# Display the rows loaded from selected_data.csv.
selected_identity_meta

In [None]:
# Display the full identity metadata loaded from meta_identity_NN1.csv.
identity_meta_NN1

Elimina le righe selezionate per NN1 da quelle totali, per poi prelevare i campioni

In [None]:
# Keep only the identities whose Class_ID does not appear in the already
# selected set; the remaining rows form the pool sampled later on.
already_selected_mask = identity_meta_NN1['Class_ID'].isin(selected_identity_meta['Class_ID'])
identity_filtered = identity_meta_NN1[~already_selected_mask]

In [None]:
# Display the identities left after removing the already selected ones.
identity_filtered

# Download VGGFace2_train.tar.gz

In [None]:
import os
import random
import tarfile
import pandas as pd
import gdown

# Google Drive URL of the vggface2_train.tar.gz archive
url = "https://drive.google.com/uc?export=download&id=1K56kVYHHDfLA2Anm7ga0tQolMwIPk6R8"
file_name = "vggface2_train.tar.gz"

# Destination folder for the download: shared Drive on Colab, local otherwise
if IN_COLAB:
    download_folder = "/content/drive/Shareddrives/AI4CYBSEC/downloads"
else:
    download_folder = "./downloads"

# Create the download folder when missing; exist_ok avoids the
# check-then-create race of a separate os.path.exists test
os.makedirs(download_folder, exist_ok=True)

# Full path of the downloaded file
file_path = os.path.join(download_folder, file_name)

# Download the archive only if it is not already in the download folder
if not os.path.exists(file_path):
    print(f"Avvio del download di {file_name}")
    # NOTE(review): some gdown releases report a failed download by returning
    # None instead of raising -- do not announce success in that case.
    downloaded_path = gdown.download(url, file_path, quiet=False)
    if downloaded_path is None:
        print(f"Download di {file_name} non riuscito.")
    else:
        print("Download completato.")
else:
    print(f"Il file {file_name} è già presente.")

# Extracting 1000 rows from identity_filtered for training the detector

Salva le identità selezionate nel file selected_data_train_detector.csv

In [None]:
import pandas as pd
import random
seed = 84

# Draw 1000 random rows from the filtered identities; the fixed seed makes
# the draw repeatable.
selected_df = identity_filtered.sample(n=1000, random_state=seed)

# Write the sampled rows to a new CSV: shared Drive on Colab, local otherwise.
selected_csv = (
    "/content/drive/Shareddrives/AI4CYBSEC/face_dataset/selected_data_train_detector.csv"
    if IN_COLAB
    else "./face_dataset/selected_data_train_detector.csv"
)
selected_df.to_csv(selected_csv, index=False)

print("1000 righe casuali sono state estratte e salvate in:", selected_csv)

In [None]:
# Reload the CSV that was just written and display it to check its content.
df = pd.read_csv(selected_csv)
df

## Separazione identità 80% train e 20% validation

In [None]:
# Split the selected identities 80% / 20% into train and validation.
# Sampling by fraction ties the split to the size of df instead of a
# hard-coded 800 rows; for the 1000-row case it still draws 800 rows with the
# same seed.
train_selected_df = df.sample(frac=0.8, random_state=seed)
# Validation receives every identity whose Class_ID was not drawn for training.
validation_selected_df = df[~df['Class_ID'].isin(train_selected_df['Class_ID'])]

In [None]:
# Display the identities assigned to the training split.
train_selected_df

In [None]:
# Display the identities assigned to the validation split.
validation_selected_df

# Extracting images from archive

In [None]:
# esegui questo se vuoi lavorare con il dataset estratto
if IN_COLAB:
  !tar -xzf "/content/drive/Shareddrives/AI4CYBSEC/downloads/vggface2_train.tar.gz"
else:
  %tar -xzf "./downloads/vggface2_train.tar.gz"

# Selecting images for each selected identity

Per ognuna delle 1000 identità vengono estratte 10 immagini

In [None]:
import os
import shutil
import random
from tqdm import tqdm

# Seed handed to extract_images so the per-class image selection is repeatable.
seed = 84

def extract_images(root, classID, num_imgs, input_folder, output_folder, seed):
    """Copy a seeded random sample of one class's images to the output folder.

    Up to ``num_imgs`` regular files are picked from
    ``input_folder/<classID>`` and copied, keeping their names, into
    ``output_folder/<classID>`` (created when missing). If the class holds
    fewer files than requested, all of them are copied.

    Args:
        root: Unused; kept so existing calls keep working.
        classID: Class identifier; converted with ``str`` to build the
            folder names.
        num_imgs: Maximum number of images to copy.
        input_folder: Folder containing one sub-folder per class.
        output_folder: Destination folder; gets one sub-folder per class.
        seed: Seed of the random selection.

    Raises:
        FileNotFoundError: If ``input_folder/<classID>`` does not exist
            (raised by ``os.listdir``).
    """
    # A private generator yields the same draw as seeding the module-level
    # one, without resetting the global random state on every call.
    rng = random.Random(seed)

    input_class_folder = os.path.join(input_folder, str(classID))
    output_class_folder = os.path.join(output_folder, str(classID))

    # os.listdir returns entries in arbitrary order, so sort them: otherwise
    # the same seed could select different images on another machine.
    image_files = sorted(
        entry
        for entry in os.listdir(input_class_folder)
        if os.path.isfile(os.path.join(input_class_folder, entry))
    )

    # Create the destination only after the input listing succeeded, so a
    # missing class folder does not leave an empty output folder behind.
    os.makedirs(output_class_folder, exist_ok=True)

    # Pick at most num_imgs images (all of them when fewer are available).
    selected_images = rng.sample(image_files, min(num_imgs, len(image_files)))

    for image_name in selected_images:
        shutil.copyfile(
            os.path.join(input_class_folder, image_name),
            os.path.join(output_class_folder, image_name),
        )

# Settings shared by both extraction passes.
root = "train"
num_imgs = 10
input_folder = "./train"

# Copy the sampled images of the training identities first, then those of the
# validation identities, each pass into its own destination folder.
for identities_df, output_folder in (
    (train_selected_df, "./train_detector_set"),
    (validation_selected_df, "./validation_detector_set"),
):
    for classID in tqdm(identities_df["Class_ID"], desc='Processing classes', unit='class'):
        extract_images(root, classID, num_imgs, input_folder, output_folder, seed)

Copia delle immagini selezionate nel drive condiviso

In [None]:
!cp -r ./train_detector_set /content/drive/Shareddrives/AI4CYBSEC/face_dataset/train_detector_set

In [None]:
!cp -r ./validation_detector_set /content/drive/Shareddrives/AI4CYBSEC/face_dataset/validation_detector_set

In [None]:
# Count the entries of train_detector_set on the shared drive.
len(os.listdir("/content/drive/Shareddrives/AI4CYBSEC/face_dataset/train_detector_set"))

In [None]:
# Count the entries of validation_detector_set on the shared drive.
len(os.listdir("/content/drive/Shareddrives/AI4CYBSEC/face_dataset/validation_detector_set"))

In [None]:
# Count the entries in sub-folder "0" of train_detector_shuffle on the shared drive.
# NOTE(review): nothing visible in this notebook creates train_detector_shuffle --
# presumably produced by another step; confirm.
len(os.listdir("/content/drive/Shareddrives/AI4CYBSEC/face_dataset/train_detector_shuffle/0"))