### Visualização de Dados

Notebook destinado à visualização de dados brutos dos datasets. Os datasets escolhidos para treinamento e validação do primeiro modelo da S-GAN são:

* ETH Pedestrian;
* [UCY Crowds Data](https://graphics.cs.ucy.ac.cy/portfolio);
* ~~[Stanford Drone Dataset (SDD) - Original Dataset](https://cvgl.stanford.edu/projects/uav_data/)~~ (Offline);
* [Stanford Drone Dataset (SDD) - Kaggle Compressed](https://www.kaggle.com/datasets/aryashah2k/stanford-drone-dataset)



In [9]:
!pip install rarfile # .rar extensions manipulation
#!apt-get update
#!apt-get install -y unrar

Defaulting to user installation because normal site-packages is not writeable


In [10]:
import os
import subprocess
import socket
import rarfile
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML

# Global vars:
colab_usage = True if 'google.colab' in socket.gethostname() else False # Google Colab usage
frames_to_follow = 20

# Dataset Useful Links:
eth_ucy_dataset = "https://www.dropbox.com/s/8n02xqv3l9q18r1/datasets.zip?dl=0"
ucy_dataset = "https://graphics.cs.ucy.ac.cy/research/downloads/crowd-data.zip"
sdd_campus_dataset = ""


### Data Preparation

In [11]:
# Define the data path prefix
data_path_prefix = "../content/" if colab_usage else "../"

# Create the directory if it doesn't exist in Google Colab
if colab_usage:
    raw_data_path = f"{data_path_prefix}data/raw"
    os.makedirs(raw_data_path, exist_ok=True)

# Define paths for UCY and SDD datasets
eth_ucy_data_path = f"{data_path_prefix}data/raw/ETH-UCY-AO"
ucy_data_path     = f"{data_path_prefix}data/raw/UCY"
sdd_data_path     = f"{data_path_prefix}data/raw/SDD"

# Create UCY & SDD directory
os.makedirs(eth_ucy_data_path, exist_ok=True) # creating ETH-UCY-AO raw data path
#os.makedirs(ucy_data_path, exist_ok=True)     # creating UCY raw data path
#os.makedirs(sdd_data_path, exist_ok=True)     # creating SDD raw data path
    
# Create a temporary directory:
os.makedirs("./tmp", exist_ok=True) # creating temporary path

# Print paths for verification
print(f"ETH-UCY-AO data path: {eth_ucy_data_path} {'created' if os.path.isdir(eth_ucy_data_path) else 'not created'}.")
#print(f"UCY data path: {ucy_data_path} {'created' if os.path.isdir(ucy_data_path) else 'not created'}.")
#print(f"SDD data path: {sdd_data_path} {'created' if os.path.isdir(sdd_data_path) else 'not created'}.")

ETH-UCY-AO data path: ../data/raw/ETH-UCY-AO created.


In [12]:
# Prepare ETH-UCY Dataset (annotations only):
# download
if (not os.path.exists("./tmp/eth-ucy-crowds-data.zip")):
    subprocess.run(["wget", "-O", "./tmp/eth-ucy-crowds-data.zip", eth_ucy_dataset])

# extract dataset
if (os.path.exists("./tmp/eth-ucy-crowds-data.zip")):
    subprocess.run(["unzip", "-q", "./tmp/eth-ucy-crowds-data.zip", "-d", "./tmp/"])
    subprocess.run(["mv", "./tmp/datasets", "./tmp/ETH-UCY-AO"]) # Annotation Only (AO)
    shutil.copytree('./tmp/ETH-UCY-AO', eth_ucy_data_path, dirs_exist_ok=True)

# clean ETH-UCY-AO path
subprocess.run(["rm", "-rf", "./tmp/ETH-UCY-AO"])

CompletedProcess(args=['rm', '-rf', './tmp/ETH-UCY-AO'], returncode=0)

In [13]:
'''
# Prepare UCY Dataset:
# list of interesting files
ucy_list = ["crowds_zara01","crowds_zara02","students003"] # only data + video list
ucy_list_rar = ["data_zara.rar", "data_university_students.rar"]
ucy_list_ext_copy = [".vsp", ".avi"]

# download
if (not os.path.exists("./tmp/crowd-data.zip")):
    subprocess.run(["wget", "-O", "./tmp/crowd-data.zip", ucy_dataset, "--no-check-certificate"]) # warning for --no-check-certificate usage

# unzip main compressed file
if (not os.path.isdir("./tmp/crowds")):
    subprocess.run(["unzip", "-q", "./tmp/crowd-data.zip", "-d", "./tmp"])

# extract datasets
for ucy_rf in ucy_list_rar:
    with rarfile.RarFile(f'./tmp/crowds/data/{ucy_rf}') as rf:
        rf.extractall('./tmp/crowds/data/') # extract .rar in the same directory
        print(f"Extracted './tmp/crowds/data/{ucy_rf}'")

# copy to raw data path
for ucy_data in ucy_list:
    for ext in ucy_list_ext_copy:
        source_path = f"./tmp/crowds/data/{ucy_data}{ext}"
        destination_path = ucy_data_path

        # Move the file from source to destination
        if (not os.path.exists(f'{destination_path}/{ucy_data}{ext}')):
            shutil.move(source_path, destination_path)
            print(f"Moved {source_path} to {destination_path}")
        else:
            print(f"{destination_path}/{ucy_data}{ext} already exists.")

# clean crowds path
subprocess.run(["rm", "-rf", "./tmp/crowds"])
'''



In [14]:
'''
# Prepare Stanford Drone Dataset:
# list of interesting files
sdd_path_dirs = ["video", "annotations"] # video & annotations path - each internal folder has its own correspondent in the other path

# download
if (not os.path.exists("./tmp/sdd-data.zip")):
    subprocess.run(["wget", "-O", "./tmp/sdd-data.zip", sdd_campus_dataset])

# create auxiliary subdirectory
os.makedirs("./tmp/stanford", exist_ok=True)

# unzip main compressed file
if (
    not os.path.isdir("./tmp/stanford/annotations") and # to consider the two subfolders (correct structure of the dataset)
    not os.path.isdir("./tmp/stanford/video")
):
    # unzipping the main compressed file
    subprocess.run(["unzip", "-q", "./tmp/sdd-data.zip", "-d", "./tmp/stanford"])

# it's not needed to extract internal datasets - it has its own folder structure

# copy to raw data path
for primary_folder in os.listdir("./tmp/stanford/"):
    
    Since there are always two folders, we can process in one loop together. Additionally, 
    because these folders are mirrored, we only need to copy their internal contents
    to the reference raw data path, simplifying the process.
    
    for secondary_folder in os.listdir("./tmp/stanford/" + primary_folder):
        relative_path = f"./tmp/stanford/{primary_folder}/{secondary_folder}"

        print(f"Processing: {relative_path} ", end='')

        # Find and remove .jpeg and .jpg files in the secondary folders (unuseful files)
        subprocess.run(["find", relative_path, "-type", "f", "-name", "*.jpeg", "-or", "-name", "*.jpg", "-delete"])

        # Copy all the subfolders to the reference raw data path - devoted to SDD dataset
        if (os.path.isdir(relative_path)): # to avoid bugs
            shutil.copytree(relative_path, f"{sdd_data_path}/{secondary_folder}", dirs_exist_ok=True)
            print(f"copied to {sdd_data_path}/{secondary_folder}")


# clean stanford dataset path
subprocess.run(["rm", "-rf", "./tmp/stanford"])
'''

'\n# Prepare Stanford Drone Dataset:\n# list of interesting files\nsdd_path_dirs = ["video", "annotations"] # video & annotations path - each internal folder has its own correspondent in the other path\n\n# download\nif (not os.path.exists("./tmp/sdd-data.zip")):\n    subprocess.run(["wget", "-O", "./tmp/sdd-data.zip", sdd_campus_dataset])\n\n# create auxiliary subdirectory\nos.makedirs("./tmp/stanford", exist_ok=True)\n\n# unzip main compressed file\nif (\n    not os.path.isdir("./tmp/stanford/annotations") and # to consider the two subfolders (correct structure of the dataset)\n    not os.path.isdir("./tmp/stanford/video")\n):\n    # unzipping the main compressed file\n    subprocess.run(["unzip", "-q", "./tmp/sdd-data.zip", "-d", "./tmp/stanford"])\n\n# it\'s not needed to extract internal datasets - it has its own folder structure\n\n# copy to raw data path\nfor primary_folder in os.listdir("./tmp/stanford/"):\n    \n    Since there are always two folders, we can process in one loo

### ETH-UCY Crowds Dataset (Annotations Only)

In [15]:
def load_dataset(file_path):
    '''
    Args:
    - data_dir: Directory containing dataset files in the format
    <frame_id> <ped_id> <x> <y>
    - obs_len: Number of time-steps in input trajectories
    - pred_len: Number of time-steps in output trajectories
    - skip: Number of frames to skip while making the dataset
    - threshold: Minimum error to be considered for non linear traj
    when using a linear predictor
    - min_ped: Minimum number of pedestrians that should be in a seqeunce
    - delim: Delimiter in the dataset files

    The structure is: <frame_id> <pedestrian_id> <x> <y>
    '''
    delim = '\t'

    # Read the dataset
    df = pd.read_csv(
        file_path,
        delimiter=delim, header=None, names=['frame_id', 'ped_id', 'x', 'y']
    )

    # Convert frame_id and ped_id to integers
    df['frame_id'] = df['frame_id'].astype(int)
    df['ped_id'] = df['ped_id'].astype(int)

    # Group data by frame_id and aggregate the pedestrian data
    grouped_data = df.groupby('frame_id').agg({
        'ped_id' : list, # Collect all pedestrian IDs for the frame
        'x'      : list, # Collect all respective x positions for the frame
        'y'      : list, # Collect all respective y positions for the frame
    }).reset_index()

    return grouped_data

In [16]:
# Mapping the datasets in a dictionary structure
ETHUCY_datasets = ["eth", "hotel", "raw", "univ", "zara1", "zara2"]
ETHUCY_data_dir = {
    dataset: {
        'test' : f'{eth_ucy_data_path}/{dataset}/test',
        'train' : f'{eth_ucy_data_path}/{dataset}/train',
        'val' : f'{eth_ucy_data_path}/{dataset}/val'
    } for dataset in ETHUCY_datasets
}

# Reading & plotting the dataset
'''
TODO: colocar a velocidade no dataframe
TODO: considerar vetor de interação entre os pedestres vetor e.
'''

# Check for existing datasets in the ETH-UCY-AO directory
gdata = {}
for dataset in ETHUCY_datasets:
    dataset_path = f'{eth_ucy_data_path}/{dataset}'
    
    for dataset_type in os.listdir(dataset_path):
        dataset_type_path = f'{dataset_path}/{dataset_type}'
        
        for data_file in os.listdir(dataset_type_path):
            if dataset not in gdata:
                gdata[dataset] = {}
            if dataset_type not in gdata[dataset]:
                gdata[dataset][dataset_type] = {}
            
            # Load dataset into gdata
            gdata[dataset][dataset_type][data_file.replace('.txt','')] = load_dataset(f'{dataset_type_path}/{data_file}')

# Define markers and colors for up to 15 pedestrians
markers = ['o', 's', 'D', 'v', '^', '<', '>', 'p', '*', 'h', 'X', '+', '1', '2', '3']  # 15 markers
colors  = ['blue', 'green', 'red', 'orange', 'purple', 'brown', 'pink', 'gray', 'cyan', 'magenta',
        'yellow', 'black', 'lightblue', 'lime', 'teal']  # 15 colors

# Start blank figure for the animation
for dataset in gdata['eth']['train']:
    print(f"processing: {dataset}")

    fig = plt.figure()
    axis = plt.axes(xlim = (min(min(gdata['eth']['train'][dataset]['x'])), max(max(gdata['eth']['train'][dataset]['x']))), 
                    ylim = (min(min(gdata['eth']['train'][dataset]['y'])), max(max(gdata['eth']['train'][dataset]['y'])))) 

    # Define line object
    line, = axis.plot([], [], lw = 2) 

    def init(): 
        line.set_data([], []) 
        return line, 
    
    # initializing empty values 
    # for x and y co-ordinates 
    xdata, ydata = [], [] 
    
    # animation function 
    def animate(frame_num):
        f2f = frames_to_follow # frames to follow
        axis.clear()
        
        # Set the axis limits
        axis.set_xlim(min(min(gdata['eth']['train'][dataset]['x'])), max(max(gdata['eth']['train'][dataset]['x'])))
        axis.set_ylim(min(min(gdata['eth']['train'][dataset]['y'])), max(max(gdata['eth']['train'][dataset]['y'])))

        # Ensure the frame_num is in the grouped data
        if (f2f > frame_num):
            f2f = frame_num

        #print(f'{frame_num}/{f2f} - {np.arange(frame_num, frame_num - f2f, - 1)}') # debug

        shown = False

        for fn in np.arange(frame_num, frame_num - f2f, -1): # fn stands for frame number
            if fn in gdata['eth']['train'][dataset]['frame_id']:
                currFrame = gdata['eth']['train'][dataset].iloc[fn]
                nCurrFrame = currFrame['frame_id']

                # Calculate and print progress percentage
                if not shown:
                    max_frame = gdata['eth']['train'][dataset]['frame_id'].max()
                    progress = (nCurrFrame / max_frame) * 100
                    print(f"{progress:.2f}%  ({frame_num})")
                    shown = True
                
                pedId = currFrame['ped_id']
                x_data = currFrame['x']
                y_data = currFrame['y']

                # Plot each pedestrian's position with unique markers and colors, cycling through options
                for i in range(len(pedId)):
                    ped_index = pedId[i] % 15 # Use modulo to cycle through the markers and colors
                    axis.plot(x_data[i], y_data[i],
                            marker=markers[ped_index], color=colors[ped_index], linestyle='None',
                            markersize=(13 if fn == frame_num else 7))

                # Set the title to show the current frame number
                axis.set_title(f'Frame {nCurrFrame}')
    
        return line,


    # Define the range of frames to animate
    frames = np.arange(0, len(gdata['eth']['train'][dataset]['frame_id']), 1)

    # Create animation
    anim = animation.FuncAnimation(fig, animate,
                                init_func=init,
                                frames=frames,
                                interval=1000/14, # Adjust interval to match 14 FPS
                                blit=True,
                                repeat=False)

    # Save the animation as an MP4 file with 14 FPS
    anim.save(f'tmp/{dataset}_plotted.mp4', writer='ffmpeg', fps=10)
    plt.close(fig)

    # Display the animation in the notebook
    #HTML("""
    #<video width="640" height="480" controls autoplay loop>
    #<source src="biwi_hotel_train_plotted.mp4" type="video/mp4">
    #</video>
    #""")

processing: uni_examples_train
0.17%  (1)
0.34%  (2)
0.51%  (3)
0.67%  (4)
0.84%  (5)
1.01%  (6)
1.18%  (7)
1.35%  (8)
1.52%  (9)
1.69%  (10)
1.85%  (11)
2.02%  (12)
2.19%  (13)
2.36%  (14)
2.53%  (15)
2.70%  (16)
2.87%  (17)
3.04%  (18)
3.20%  (19)
3.37%  (20)
3.54%  (21)
3.71%  (22)
3.88%  (23)
4.05%  (24)
4.22%  (25)
4.38%  (26)
4.55%  (27)
4.72%  (28)
4.89%  (29)
5.06%  (30)
5.23%  (31)
5.40%  (32)
5.56%  (33)
5.73%  (34)
5.90%  (35)
6.07%  (36)
6.24%  (37)
6.41%  (38)
6.58%  (39)
6.75%  (40)
6.91%  (41)
7.08%  (42)
7.25%  (43)
7.42%  (44)
7.59%  (45)
7.76%  (46)
7.93%  (47)
8.43%  (48)
8.60%  (49)
8.77%  (50)
8.94%  (51)
9.11%  (52)
9.27%  (53)
9.44%  (54)
9.61%  (55)
9.78%  (56)
9.95%  (57)
10.12%  (58)
10.29%  (59)
10.46%  (60)
10.62%  (61)
10.79%  (62)
10.96%  (63)
11.13%  (64)
11.30%  (65)
11.47%  (66)
11.64%  (67)
11.80%  (68)
11.97%  (69)
12.14%  (70)
12.31%  (71)
12.48%  (72)
12.65%  (73)
12.82%  (74)
12.98%  (75)
13.15%  (76)
13.32%  (77)
13.49%  (78)
13.66%  (79)
13.83%  

### UCY Crowds Dataset

### Stanford Drone Dataset

In [17]:
# always must clean the temporary directory to avoid github issues (.git)
#subprocess.run(["rm", "-rf", "./tmp"])