# FG-Net

- The purpose of this notebook is to present the exploratory data analysis (EDA) carried out before working with this dataset
- Found this dataset in [this webpage](https://yanweifu.github.io/FG_NET_data/)
- We have a [papers with code entry](https://paperswithcode.com/dataset/fg-net) where we can find how some people have used this dataset
- From the *papers with code* entry, I find the **[following paper](https://arxiv.org/abs/1602.06149)**
    - They propose a new dataset, Large Age-Gap dataset (LAG dataset)
    - They talk about the **LFW dataset**: the most famous dataset with almost no constraints (lighting, pose...)
    - But they do constrain age, which is exactly what we are interested in!
    - They talk about **FG-NET dataset** as one of the most famous datasets with aging gaps
    - So I think it is **valuable to talk about this paper in my thesis**
- *Papers with code* says that [this github repo](https://github.com/Hzzone/MTLFace) has the best model for the age-invariant recognition problem
    - They have a related [paper](https://arxiv.org/abs/2103.01520)
    - They use attention mechanisms
    - They talk about age-invariant face recognition or _**AIFR**_
    - They have a table with the results of different papers in this dataset, so **it can be interesting to talk about this paper in my thesis**
    - They say that *FG-NET* is the most challenging dataset for *AIFR*
    - They **describe precisely how testing is done**

# Imports

In [None]:
import os
import requests, zipfile, io
import itertools

import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision
import torchvision.transforms as T

from typing import Union, Tuple, List

# Global parameters of the notebook

In [None]:
# Lib to define paths
import os

# - For ease of use, all global parameters of the notebook live in a single dict
# - This way, we can pass this dict directly to wandb init, so we can keep track
# of which parameters produced which output

from typing import Dict, Union
GLOBALS: Dict[str, Union[str, int, float, bool]] = dict()

# Define if we are running the notebook in our computer ("local")
# or in Google Colab ("remote")
GLOBALS['RUNNING_ENV'] = "local"

# Base path for the rest of paths defined in the notebook
GLOBALS['BASE_PATH'] = "./" if GLOBALS['RUNNING_ENV'] == "local" else "/content/drive/MyDrive/Colab Notebooks/"

# Path to our lib dir
GLOBALS['LIB_PATH'] = os.path.join(GLOBALS['BASE_PATH'], "lib")

# Path where we store training / test data
GLOBALS['DATA_PATH'] = os.path.join(GLOBALS['BASE_PATH'], "data/FG_NET")

# URL of the zipfile with the dataset
# Served over HTTPS: the archive gets extracted on disk, so it should not be
# fetched over a connection that can be tampered with
GLOBALS['DATASET_URL'] = "https://yanweifu.github.io/FG_NET_data/FGNET.zip"

# Dataset has images and metadata. Here we store the path to the img dir
GLOBALS['IMAGE_DIR_PATH'] = os.path.join(GLOBALS['DATA_PATH'], "FGNET/images")

# Auth for Google Drive

In [None]:
# Google Drive only has to be mounted when the notebook runs in Colab
if GLOBALS['RUNNING_ENV'] == "remote":
    # Imported inside the branch so local runs never need this package
    from google.colab import drive
    drive.mount('/content/drive')

# Dataset downloading 

In [None]:
def download_dataset(path: str, url: str, can_skip_download: bool = False) -> None:
    """Download the zip archive found at `url` and extract it into `path`.

    Failures while downloading are reported on stdout and make the function
    return early instead of raising, so the notebook keeps running.

    Args:
        path: Directory where the archive is extracted. It is created,
            together with any missing parent directory, when it does not exist.
        url: Location of the zip file to download.
        can_skip_download: When True and `path` is an existing, non-empty
            directory, nothing is downloaded and the function returns at once.
    """

    # Reuse what is already on disk when the caller allows it
    if can_skip_download and os.path.isdir(path) and os.listdir(path):
        print(f"Dir {path} already has data, skipping download")
        return

    # Create the dir if it does not exist
    # makedirs, because the path can be nested and parents may be missing too
    if not os.path.exists(path):
        print(f"Dir {path} does not exist, creating that dir")
        os.makedirs(path, exist_ok=True)

    # Download the dataset
    # Broad except is deliberate: any failure here is reported, never raised
    try:
        # Timeout so an unresponsive server cannot block the notebook forever
        req = requests.get(url, timeout=60)

        # Turn HTTP error codes (404, 500...) into exceptions, otherwise the
        # error page would be handed to ZipFile as if it were the archive
        req.raise_for_status()
    except Exception as e:
        print("ERROR: could not download data from url")
        print(f"ERROR: error is:\n{e}")
        return

    # Extract the archive at that path, closing it when done
    with zipfile.ZipFile(io.BytesIO(req.content)) as zip_file:
        zip_file.extractall(path)

    print("Succesful download")

    
# Fetch the dataset into the data dir defined in the global parameters
# The call passes `can_skip_download`, so `download_dataset` has to accept
# that keyword argument
download_dataset(
    GLOBALS['DATA_PATH'],
    GLOBALS['DATASET_URL'],
    can_skip_download = True
)

In [None]:
class FGDataset(torch.utils.data.Dataset):
    """PyTorch dataset over a directory of FG-NET images.

    Every file in `path` must be named `<id>A<age>.<ext>` (for example
    `001A02.JPG`), where `<id>` identifies the individual and `<age>` is the
    age label, which may carry a trailing letter (for example `43a`).

    Attributes:
        path: Directory that holds the images.
        transform: Optional callable applied to each sample dict.
        file_names: Sorted names of the files found in `path`.
        number_images: Number of files found in `path`.
        individuals: Dict mapping each individual id to the list of age
            labels of their images.

    Raises:
        ValueError: If a file name in `path` does not follow the pattern.
    """

    def __init__(self, path: str, transform = None):
        super(FGDataset, self).__init__()

        self.path = path
        self.transform = transform
        self.labels = None
        self.imgs = None
        self.individuals = None
        self.number_images: Union[int, None] = None
        self.file_names: Union[List, None] = None

        # Get the data from the dir
        self.__generate_dataset()

    def __len__(self) -> int:
        """Return the number of images of the dataset."""

        # Check that we have the number of images of the dataset
        if self.number_images is None:
            raise Exception("Dataset is not initialized, thus, number of images is unknown")

        return self.number_images

    def __getitem__(self, idx):
        """Return the sample at `idx` as a dict with keys "image", "id" and "age".

        "image" is whatever `torchvision.io.read_image` returns for the file,
        "id" and "age" are the strings parsed from the file name. When a
        transform was given, its result on that dict is returned instead.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Get the image from the index
        file_name = self.file_names[idx]
        image = torchvision.io.read_image(os.path.join(self.path, file_name))

        # Get the id and age from the file name
        person_id, age = self.__id_and_age_from_file_name(file_name)

        # Put together all the info
        sample = {
            "image": image,
            "id": person_id,
            "age": age,
        }

        if self.transform:
            sample = self.transform(sample)

        return sample

    def __generate_dataset(self):
        """Scan `self.path` and fill `file_names`, `number_images` and `individuals`."""

        # os.listdir returns the entries in arbitrary order: sort them so an
        # index always refers to the same image
        self.file_names = sorted(os.listdir(self.path))

        # Use that for computing the size
        self.number_images = len(self.file_names)

        # Group the age labels by individual
        self.individuals = dict()
        for file_name in self.file_names:
            person_id, age = self.__id_and_age_from_file_name(file_name)
            self.individuals.setdefault(person_id, []).append(age)

    def __id_and_age_from_file_name(self, file_name: str) -> Tuple[str, str]:
        """Split a file name like `001A43a.JPG` into `("001", "43a")`.

        Raises:
            ValueError: If the name without extension does not consist of
                exactly two parts separated by a single "A".
        """
        # Remove file extension, whatever its case
        stem, _extension = os.path.splitext(file_name)

        # Split into id and age
        parts = stem.split("A")
        if len(parts) != 2:
            raise ValueError(
                f"File name '{file_name}' does not follow the '<id>A<age>.<ext>' pattern"
            )

        person_id, age = parts
        return person_id, age

# Build the dataset from the image dir; no transform, so samples come back untransformed
dataset = FGDataset(path = GLOBALS['IMAGE_DIR_PATH'], transform = None)

In [None]:
# Take one sample to check that the image and its labels are read properly
sample = dataset[20]
img = sample["image"]
age = sample["age"]
# NOTE(review): this rebinds the builtin `id` at notebook level
id = sample["id"]

print(f"Id {id} at age {age}")

# Convert the sample image to a PIL image before showing it
transform = T.ToPILImage()
img = transform(img)
plt.imshow(img)

In [None]:
def plot_histogram(values: List[float], num_bins: int):
    """Show a histogram of `values` split into `num_bins` equal-width bins.

    Args:
        values: Numbers whose distribution is plotted.
        num_bins: Number of bins of the histogram.
    """

    # Make plots bigger
    plt.figure(figsize=(8, 6), dpi=80)

    # Let matplotlib bin the values itself; no need to precompute the counts
    plt.hist(values, bins=num_bins)

    # Display the figure (a bare plt.plot() call draws nothing)
    plt.show()

In [None]:
# Flatten the per-individual age lists into one list with every age of the dataset
ages = list(itertools.chain.from_iterable(dataset.individuals.values()))

# Some persons have a few photos at the same age
# In that case, ages are labeled with letters
# For example, 13a, 13b, 13c
# Get rid of that extra char
def remove_label(age_str: str) -> str:
    """Return the leading run of digits of `age_str`.

    Works for any number of digits, so "5a" gives "5", "13a" gives "13" and
    "100" stays "100". A string that does not start with a digit is returned
    unchanged.
    """
    leading_digits = "".join(itertools.takewhile(str.isdigit, age_str))

    # Nothing numeric at the front: hand the input back untouched
    return leading_digits or age_str

# Strip the letter labels and turn every age into an integer
ages = list(map(int, map(remove_label, ages)))

# With that, plot the histogram of the distribution of ages
plot_histogram(ages, num_bins = 70)

In [None]:
# Number of images available for each individual
imgs_per_user = list(map(len, dataset.individuals.values()))

# Now, plot the distribution of the number of images per individual
plot_histogram(imgs_per_user, num_bins = 15)