### Visualização de Dados

Notebook destinado à visualização de dados brutos dos datasets. Os datasets escolhidos para treinamento e validação do primeiro modelo da S-GAN são:

* ~~ETH Pedestrian~~ (Repository Offline);
* [UCY Crowds Data](https://graphics.cs.ucy.ac.cy/portfolio);
* ~~[Stanford Drone Dataset (SDD) - Original Dataset](https://cvgl.stanford.edu/projects/uav_data/)~~ (Offline);
* [Stanford Drone Dataset (SDD) - Kaggle Compressed](https://www.kaggle.com/datasets/aryashah2k/stanford-drone-dataset).



In [None]:
# Install the `rarfile` package, imported below to open the .rar archives of the UCY dataset.
!pip install rarfile # .rar extensions manipulation
# NOTE(review): the two commands below are disabled. Presumably they install the `unrar`
# system tool for environments where rarfile has no extraction backend available -
# confirm before enabling them.
#!apt-get update
#!apt-get install -y unrar

In [2]:
import os
import shutil
import socket
import subprocess
import sys

import matplotlib.pyplot as plt
import pandas as pd
import rarfile
from matplotlib.animation import FuncAnimation

# Global vars:
# Google Colab usage flag.
# NOTE(review): on Colab the `google.colab` package is normally already loaded by the kernel,
# so its presence in sys.modules is checked first; the hostname alone is not a reliable signal.
# The original hostname test is kept as a fallback so previously detected setups still match.
colab_usage = 'google.colab' in sys.modules or 'google.colab' in socket.gethostname()

# Dataset Useful Links:
eth_ucy_dataset = "https://www.dropbox.com/s/8n02xqv3l9q18r1/datasets.zip?dl=0"
ucy_dataset = "https://graphics.cs.ucy.ac.cy/research/downloads/crowd-data.zip"
# NOTE(review): this is a pre-signed storage URL. Its query string carries
# X-Goog-Date=20241005T231629Z and X-Goog-Expires=259200 (seconds), so it presumably stops
# working after that window - regenerate it from the Kaggle dataset page if the download fails.
sdd_campus_dataset = "https://storage.googleapis.com/kaggle-data-sets/1433833/2373439/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20241005%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20241005T231629Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=338abfd23b3e4bf78ba1c1c6656aee077d9d817963bdfe6af1f917922af58e96183922001c13d5049305aa2295e5577a1bc85adfb9ae44433831c360c017b06ca77ac7244e6540bf9c0e606eda2252df8ef85e62dc96ef1b721f912e5b6d54feb76a8b9e599b269593ee8ec855e4840a6f5263430745d314bc39b880ba65aca0f9601d3bb736d305eebc99b6334bf91546d27df8cf8c7dc33e4aaa129dd8288aef8df6574e979e9bad1e1e585ae1514e247498c805cc91fa6c75ed6a9e5e2124220218352025179e2ad31552f24e39dee8122149bf2efc8b257e041506888655fdeb702ff52bfbc14e60adddbc7864f62872c201701353106c218227e82d89f5"


### Data Preparation

In [None]:
# Define the data path prefix ("../content/" when running on Google Colab, "../" otherwise)
data_path_prefix = "../content/" if colab_usage else "../"

# Root of the raw data tree. It is defined in every environment (it used to exist only on
# Colab), so later cells can rely on the name regardless of where the notebook runs.
raw_data_path = f"{data_path_prefix}data/raw"

# Define paths for ETH-UCY, UCY and SDD datasets
eth_ucy_data_path = f"{raw_data_path}/ETH-UCY-AO"
ucy_data_path     = f"{raw_data_path}/UCY"
sdd_data_path     = f"{raw_data_path}/SDD"

# Create the dataset directories and the temporary working directory.
# os.makedirs creates the missing parents too, so raw_data_path needs no separate call.
for _new_dir in (eth_ucy_data_path, ucy_data_path, sdd_data_path, "./tmp"):
    os.makedirs(_new_dir, exist_ok=True)

# Print paths for verification
for _label, _data_dir in (
    ("ETH-UCY-AO", eth_ucy_data_path),
    ("UCY", ucy_data_path),
    ("SDD", sdd_data_path),
):
    print(f"{_label} data path: {_data_dir} {'created' if os.path.isdir(_data_dir) else 'not created'}.")

In [None]:
# Prepare ETH-UCY Dataset (annotations only):
eth_ucy_zip = "./tmp/eth-ucy-crowds-data.zip"
eth_ucy_unzipped = "./tmp/datasets"   # folder name the archive is expected to unpack to
eth_ucy_staging = "./tmp/ETH-UCY-AO"  # Annotation Only (AO)

# make sure the temporary directory exists even if this cell is run on its own
os.makedirs("./tmp", exist_ok=True)

# download (skipped when the archive is already cached in ./tmp)
if not os.path.exists(eth_ucy_zip):
    try:
        # check=True: a failed download must stop the cell instead of being silently ignored
        subprocess.run(["wget", "-O", eth_ucy_zip, eth_ucy_dataset], check=True)
    except subprocess.CalledProcessError:
        # Remove whatever was left behind, otherwise the exists() test above would treat
        # the failed download as a cached archive on the next run.
        if os.path.exists(eth_ucy_zip):
            os.remove(eth_ucy_zip)
        raise

# extract dataset
if os.path.exists(eth_ucy_zip):
    # Start from a clean slate: leftovers from an interrupted run would make unzip hit
    # existing files and would make the move below nest the folder inside the old staging dir.
    shutil.rmtree(eth_ucy_unzipped, ignore_errors=True)
    shutil.rmtree(eth_ucy_staging, ignore_errors=True)

    subprocess.run(["unzip", "-q", eth_ucy_zip, "-d", "./tmp/"], check=True)
    shutil.move(eth_ucy_unzipped, eth_ucy_staging)
    shutil.copytree(eth_ucy_staging, eth_ucy_data_path, dirs_exist_ok=True)

# clean ETH-UCY-AO path
shutil.rmtree(eth_ucy_staging, ignore_errors=True)

In [None]:
# Prepare UCY Dataset:
# list of interesting files
ucy_list = ["crowds_zara01","crowds_zara02","students003"] # only data + video list
ucy_list_rar = ["data_zara.rar", "data_university_students.rar"]
ucy_list_ext_copy = [".vsp", ".avi"]

ucy_zip = "./tmp/crowd-data.zip"
ucy_extract_dir = "./tmp/crowds"

# make sure the temporary directory exists even if this cell is run on its own
os.makedirs("./tmp", exist_ok=True)

# download (skipped when the archive is already cached in ./tmp)
if not os.path.exists(ucy_zip):
    try:
        # WARNING: --no-check-certificate disables TLS certificate verification for this download.
        # check=True: a failed download must stop the cell instead of being silently ignored.
        subprocess.run(["wget", "-O", ucy_zip, ucy_dataset, "--no-check-certificate"], check=True)
    except subprocess.CalledProcessError:
        # Remove whatever was left behind, otherwise the exists() test above would treat
        # the failed download as a cached archive on the next run.
        if os.path.exists(ucy_zip):
            os.remove(ucy_zip)
        raise

# unzip main compressed file
if not os.path.isdir(ucy_extract_dir):
    subprocess.run(["unzip", "-q", ucy_zip, "-d", "./tmp"], check=True)

# extract datasets
for ucy_rf in ucy_list_rar:
    with rarfile.RarFile(f'./tmp/crowds/data/{ucy_rf}') as rf:
        rf.extractall('./tmp/crowds/data/') # extract .rar in the same directory
        print(f"Extracted './tmp/crowds/data/{ucy_rf}'")

# copy to raw data path
destination_path = ucy_data_path
for ucy_data in ucy_list:
    for ext in ucy_list_ext_copy:
        source_path = f"./tmp/crowds/data/{ucy_data}{ext}"
        target_file = f"{destination_path}/{ucy_data}{ext}"

        # Move the file from source to destination, unless a previous run already did
        if os.path.exists(target_file):
            print(f"{target_file} already exists.")
        else:
            shutil.move(source_path, destination_path)
            print(f"Moved {source_path} to {destination_path}")

# clean crowds path
shutil.rmtree(ucy_extract_dir, ignore_errors=True)

In [None]:
# Prepare Stanford Drone Dataset:
# list of interesting files
sdd_path_dirs = ["video", "annotations"] # video & annotations path - each internal folder has its own correspondent in the other path

sdd_zip = "./tmp/sdd-data.zip"
sdd_staging = "./tmp/stanford"

# create auxiliary subdirectory (this also creates ./tmp when the cell is run on its own)
os.makedirs(sdd_staging, exist_ok=True)

# download (skipped when the archive is already cached in ./tmp)
if not os.path.exists(sdd_zip):
    try:
        # check=True: a failed download must stop the cell instead of being silently ignored
        subprocess.run(["wget", "-O", sdd_zip, sdd_campus_dataset], check=True)
    except subprocess.CalledProcessError:
        # Remove whatever was left behind, otherwise the exists() test above would treat
        # the failed download as a cached archive on the next run.
        if os.path.exists(sdd_zip):
            os.remove(sdd_zip)
        raise

# unzip main compressed file
# The dataset structure is only correct when BOTH subfolders are present, so extract
# whenever at least one of them is missing (a partial extraction is repaired this way).
if not all(os.path.isdir(f"{sdd_staging}/{sdd_dir}") for sdd_dir in sdd_path_dirs):
    # -o overwrites files left by a partial extraction instead of prompting for each one
    subprocess.run(["unzip", "-q", "-o", sdd_zip, "-d", sdd_staging], check=True)

# it's not needed to extract internal datasets - it has its own folder structure

# copy to raw data path
# Since the primary folders (video / annotations) mirror each other, a single loop handles
# both: only their internal contents are copied to the reference raw data path, so
# same-named secondary folders are merged there.
for primary_folder in os.listdir(sdd_staging):
    primary_path = f"{sdd_staging}/{primary_folder}"
    if not os.path.isdir(primary_path):
        continue  # stray file at the archive root - nothing to list inside it

    for secondary_folder in os.listdir(primary_path):
        relative_path = f"{primary_path}/{secondary_folder}"
        if not os.path.isdir(relative_path):
            continue  # only folders are copied to the raw data path

        print(f"Processing: {relative_path} ", end='')

        # Remove .jpeg and .jpg files in the secondary folders (unuseful files).
        # Done in Python: the former `find ... -name "*.jpeg" -or -name "*.jpg" -delete`
        # bound -delete to the "*.jpg" branch only, so .jpeg files were never removed.
        for walk_root, _walk_dirs, walk_files in os.walk(relative_path):
            for file_name in walk_files:
                if file_name.endswith((".jpeg", ".jpg")):
                    os.remove(os.path.join(walk_root, file_name))

        # Copy all the subfolders to the reference raw data path - devoted to SDD dataset
        shutil.copytree(relative_path, f"{sdd_data_path}/{secondary_folder}", dirs_exist_ok=True)
        print(f"copied to {sdd_data_path}/{secondary_folder}")


# clean stanford dataset path
shutil.rmtree(sdd_staging, ignore_errors=True)

### ETH-UCY Crowds Dataset (Annotations Only)

In [7]:
def load_dataset(file_path, delim='\t'):
    '''
    Load one trajectory annotation file and group its rows by frame.

    Args:
    - file_path: Annotation file to read (anything pandas.read_csv accepts, e.g. a path).
    The file has no header line and its rows follow the structure
    <frame_id> <pedestrian_id> <x> <y>
    - delim: Delimiter in the dataset file (tab by default)

    Returns:
    - DataFrame with one row per frame_id. The columns 'ped_id', 'x' and 'y' each hold
    a list with one entry per pedestrian annotated in that frame, aligned by position.
    '''
    # Read the file passed by the caller (the argument used to be ignored in favour of a
    # hard-coded file resolved through a global dictionary)
    df = pd.read_csv(
        file_path,
        delimiter=delim, header=None, names=['frame_id', 'ped_id', 'x', 'y']
    )

    # Convert frame_id and ped_id to integers
    df['frame_id'] = df['frame_id'].astype(int)
    df['ped_id'] = df['ped_id'].astype(int)

    # Group data by frame_id and aggregate the pedestrian data
    grouped_data = df.groupby('frame_id').agg({
        'ped_id' : list, # Collect all pedestrian IDs for the frame
        'x'      : list, # Collect all respective x positions for the frame
        'y'      : list, # Collect all respective y positions for the frame
    }).reset_index()

    return grouped_data

In [None]:
# Mapping the datasets in a dictionary structure:
# ETHUCY_data_dir[<dataset>][<split>] -> directory path of that split
ETHUCY_datasets = ["eth", "hotel", "raw", "univ", "zara1", "zara2"]
ETHUCY_splits = ("test", "train", "val")
ETHUCY_data_dir = {
    dataset: {
        split: f'{eth_ucy_data_path}/{dataset}/{split}' for split in ETHUCY_splits
    } for dataset in ETHUCY_datasets
}

# Check for existing datasets in the ETH-UCY-AO directory
for dataset in ETHUCY_datasets:
    dataset_dir = f'{eth_ucy_data_path}/{dataset}'
    print(f"{dataset}: {'found' if os.path.isdir(dataset_dir) else 'missing'}")

# Reading & plotting the dataset
# (kept as comments: a bare string literal at the end of a cell is echoed as cell output)
#
# Example of reading the dataset
# j = 0
# frame_step = 10 # frames jump 10 by 10
# frame_id   = 10 * j
# ped_at_frame = grouped_data[grouped_data['frame_id'] == frame_id]
#
# ped_id = 1
# ped_at_index = ped_at_frame['ped_id'].iloc[0].index(ped_id) # ped_id starts from 1.0
# x_pos = ped_at_frame['x'].iloc[0][ped_at_index]
# y_pos = ped_at_frame['y'].iloc[0][ped_at_index]
# print(f'{ped_id}: ({x_pos}, {y_pos})')

# TODO: add the velocity to the dataframe
# TODO: consider the interaction vector between pedestrians (vector e).

### UCY Crowds Dataset

### Stanford Drone Dataset

In [None]:
# The temporary directory must always be removed at the end, to avoid GitHub (.git) issues
tmp_cleanup_cmd = ["rm", "-rf", "./tmp"]
subprocess.run(tmp_cleanup_cmd)