# File description
Make the final train and validation splits. The splits are stored as two `csv` files that contain all annotation-relevant information. These dataframes are passed directly to the PyTorch `Dataset`.<br>
Some notes on how the train/validation splits were made:
    
    1.) ...
    2.) ...
<br>
<br>
The final class distribution of the splits is:
<table><tr>
<img src="../illustration_images/train_split_dist.png" width="800" /> 
<img src="../illustration_images/valid_split_dist.png" width="800" />
</tr></table>

In [None]:
import dutils as U
U.jupyter_ipython.adjust_screen_width(75)
from dutils.jupyter_ipython import show_image as show

# Normal imports
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import numpy as np
import os
import cv2
import wandb
import pandas as pd

# Settings
torch.set_printoptions(sci_mode=False)
import calendar
import matplotlib.pyplot as plt
import seaborn as sns
import shutil
import math
sns.set_style("whitegrid");

# Setup

In [None]:
# Load every annotation, then keep only the four rider classes used for the splits.
kept_classes = ['escooter_helmet', "escooter_nohelmet", 'cycle_nohelmet', 'cycle_helmet']
df = pd.read_csv("../dataset/data_final/annotations_all.csv")
is_kept_class = df["class_label_combined_name"].isin(kept_classes)
df = df[is_kept_class]

In [None]:
# Frame names (without ".png") hand-picked for the validation split, per class.
# Each class gets its own independent list; `get_index` appends to them.
all_paths = {
    class_label: []
    for class_label in (
        'escooter_helmet',
        'escooter_nohelmet',
        'cycle_nohelmet',
        'cycle_helmet',
    )
}

def get_index(path:str, frame_amout:int, categori, video_path:str):
    """Register `frame_amout` consecutive frames, starting at `path`, under `categori`.

    `path` is a frame name without the ".png" extension whose last
    "_"-separated field is the integer frame number (e.g. "..._FILE0039_87").
    The frame number is incremented `frame_amout - 1` times; every resulting
    frame name is checked against the module-level `df`, appended to the
    module-level `all_paths[categori]`, and finally previewed with `show`.

    Args:
        path: Name (no extension) of the first frame of the sequence.
        frame_amout: Number of consecutive frames to register.
        categori: Key of `all_paths` to store the frame names under.
        video_path: Folder the preview images are loaded from.

    Returns:
        The registered frame names (without ".png") in ascending frame order.

    Raises:
        AssertionError: If a frame name has no row in `df["frame_name"]`.
    """
    # Parse the name once: everything before the last "_" is the clip prefix,
    # the remainder is the number of the first frame.
    prefix, _, first_frame = path.rpartition("_")
    start = int(first_frame)

    to_return = []
    for offset in range(frame_amout):
        final = f"{prefix}_{start + offset}"
        # Only existence is required: `frame_name` values are not necessarily
        # unique in `df` (the image-copy cell de-duplicates them with .unique()).
        match = df["frame_name"] == (final + ".png")
        assert match.any(), f"No annotation row found for frame '{final}.png'"
        to_return.append(final)
        all_paths[categori].append(final)

    # Visual sanity check of the selected frames.
    show([os.path.join(video_path, frame + ".png") for frame in to_return], resize_factor=0.5)
    return to_return

In [None]:
# Stage one "TEMP_<class>" folder per class containing every image of that class,
# so the frames can be browsed by hand when picking validation sequences.
temp_folder_path = "C:/Users/JK/Desktop"
assert os.path.exists(temp_folder_path), "Bad path"

for class_label in tqdm(df["class_label_combined_name"].unique().tolist()):
    image_folder_path = os.path.join(temp_folder_path, f"TEMP_{class_label}")
    # An already existing folder is treated as fully staged and left untouched.
    if not os.path.exists(image_folder_path):
        os.mkdir(image_folder_path)
        class_rows = df[df["class_label_combined_name"] == class_label]
        for frame_name in class_rows["frame_name"].unique().tolist():
            source_path = f"../dataset/data_final/data/{frame_name}"
            target_path = os.path.join(image_folder_path, frame_name)
            shutil.copy(source_path, target_path)

# Escooter_helmet

In [None]:
# Select the class handled by the following cells and point at its staged image folder.
class_name = "escooter_helmet"
folder_path = os.path.join(temp_folder_path, f"TEMP_{class_name}")
folder_path

In [None]:
get_index("lyngbyvej_04-02-2022_09.40_FILE0022_25", 4, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_08-02-2022_11.59_FILE0037_287", 3, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_12-12-2021_12.19_FILE0031_253", 4, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_14-12-2021_12.26_FILE0039_87", 4, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_12-02-2022_11.05_FILE0035_38", 3, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_16-02-2022_11.09_FILE0030_17", 4, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_17-12-2021_13.23_FILE0050_47", 3, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_13-02-2022_16.01_FILE0252_67", 3, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_14-02-2022_13.25_FILE0048_32", 4, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_13-02-2022_16.25_FILE0255_52", 7, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_17-02-2022_11.57_FILE0216_0", 4, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_07-02-2022_11.29_FILE0216_143", 5, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_09-12-2021_07.43_FILE0007_64", 5, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_17-02-2022_08.21_FILE0189_109", 4, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_15-02-2022_13.41_FILE0230_196", 4, categori=class_name, video_path=folder_path)

In [None]:
len(all_paths["escooter_helmet"])

# Escooter_nohelmet

In [None]:
# Select the class handled by the following cells and point at its staged image folder.
class_name = "escooter_nohelmet"
folder_path = os.path.join(temp_folder_path, f"TEMP_{class_name}")
folder_path

In [None]:
get_index("valby_12-02-2022_11.29_FILE0038_10", 3, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_14-12-2021_15.14_FILE0060_37", 3, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_07-02-2022_17.05_FILE0258_256", 4, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_09-12-2021_08.07_FILE0010_796", 6, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_12-12-2021_10.43_FILE0019_96", 3, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_08-12-2021_09.26_FILE0019_0", 4, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_17-12-2021_10.35_FILE0029_23", 2, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_14-12-2021_12.10_FILE0037_52", 4, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_15-12-2021_12.39_FILE0046_52", 2, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_08-12-2021_12.54_FILE0045_226", 4, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_12-12-2021_17.15_FILE0068_83", 7, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_10-12-2021_08.04_FILE0006_115", 4, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_13-02-2022_16.49_FILE0258_39", 3, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_16-02-2022_18.05_FILE0082_8", 3, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_10-02-2022_13.20_FILE0232_547", 14, categori=class_name, video_path=folder_path)

In [None]:
len(all_paths["escooter_nohelmet"])

# Cycle_helmet

In [None]:
# Select the class handled by the following cells and point at its staged image folder.
class_name = "cycle_helmet"
folder_path = os.path.join(temp_folder_path, f"TEMP_{class_name}")
folder_path

In [None]:
get_index("lyngbyvej_04-02-2022_09.08_FILE0018_207", 2, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_11-12-2021_12.34_FILE0024_109", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_07-02-2022_08.25_FILE0193_674", 3, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_07-02-2022_12.57_FILE0227_585", 2, categori=class_name, video_path=folder_path)
get_index("lyngbyvej_07-02-2022_12.57_FILE0227_590", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_08-02-2022_15.03_FILE0060_1138", 2, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_12-12-2021_10.43_FILE0019_100", 1, categori=class_name, video_path=folder_path)
get_index("lyngbyvej_12-12-2021_10.43_FILE0019_103", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_11-12-2021_12.34_FILE0024_546", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_12-02-2022_09.21_FILE0022_18", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_12-02-2022_09.21_FILE0022_82", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_12-02-2022_16.01_FILE0072_16", 1, categori=class_name, video_path=folder_path)
get_index("valby_12-02-2022_16.01_FILE0072_19", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_14-02-2022_15.01_FILE0060_82", 2, categori=class_name, video_path=folder_path)
get_index("valby_14-02-2022_15.01_FILE0060_85", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_14-12-2021_11.38_FILE0033_25", 2, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_16-02-2022_16.13_FILE0068_140", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_17-12-2021_10.27_FILE0028_24", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_18-02-2022_14.49_FILE0058_51", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_07-02-2022_15.37_FILE0247_724", 1, categori=class_name, video_path=folder_path)
get_index("lyngbyvej_07-02-2022_15.37_FILE0247_727", 2, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_10-02-2022_08.40_FILE0197_115", 5, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_14-02-2022_10.05_FILE0023_62", 4, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_14-02-2022_13.25_FILE0048_20", 3, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_14-12-2021_14.02_FILE0051_101", 1, categori=class_name, video_path=folder_path)
get_index("valby_14-12-2021_14.02_FILE0051_103", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_15-02-2022_12.29_FILE0221_76", 1, categori=class_name, video_path=folder_path)
get_index("valby_15-02-2022_12.29_FILE0221_79", 1, categori=class_name, video_path=folder_path)

In [None]:
len(all_paths["cycle_helmet"])

# Cycle_nohelmet

In [None]:
# Select the class handled by the following cells and point at its staged image folder.
class_name = "cycle_nohelmet"
folder_path = os.path.join(temp_folder_path, f"TEMP_{class_name}")
folder_path

In [None]:
get_index("valby_14-12-2021_11.30_FILE0032_0", 1, categori=class_name, video_path=folder_path)
get_index("valby_14-12-2021_11.30_FILE0032_2", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_12-12-2021_14.51_FILE0050_129", 5, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_07-02-2022_14.49_FILE0241_178", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_04-02-2022_13.16_FILE0049_222", 2, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_06-02-2022_08.58_FILE0017_107", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_07-02-2022_09.37_FILE0202_528", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_07-02-2022_14.01_FILE0235_211", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("lyngbyvej_08-02-2022_11.43_FILE0035_73", 1, categori=class_name, video_path=folder_path)
get_index("lyngbyvej_08-02-2022_11.43_FILE0035_75", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_14-12-2021_11.30_FILE0032_31", 1, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_15-02-2022_15.01_FILE0240_3", 1, categori=class_name, video_path=folder_path)

In [None]:

get_index("valby_16-02-2022_15.09_FILE0060_2", 1, categori=class_name, video_path=folder_path)
get_index("valby_16-02-2022_15.09_FILE0060_4", 2, categori=class_name, video_path=folder_path)

In [None]:
get_index("valby_17-12-2021_13.31_FILE0051_20", 3, categori=class_name, video_path=folder_path)

In [None]:
len(all_paths["cycle_nohelmet"])

# Validation split

In [None]:
# Build the validation split: every annotation row whose frame was hand-picked
# in `all_paths`, grouped by category in the order the categories are listed.
selected_rows = []
for name in all_paths:
    frame_names = {path + ".png" for path in all_paths[name]}
    selected_rows.append(df[df["frame_name"].isin(frame_names)])

# `DataFrame.append` no longer exists in pandas >= 2.0; `pd.concat` is the replacement.
df_valid = pd.concat(selected_rows)
# A frame registered under more than one category would otherwise contribute
# its rows several times; the original row index identifies such repeats.
df_valid = df_valid[~df_valid.index.duplicated(keep="first")]

df_valid.to_csv("../dataset/data_final/annotations_valid.csv", index=False)

In [None]:
# Bar plot of the class distribution in the validation split (percentage on the
# y-axis, absolute count written above each bar), saved as an illustration image.
sns.set(rc={'figure.figsize':(20,7)}, style="whitegrid")
with_names = ["cycle_nohelmet", "cycle_helmet", "escooter_nohelmet", "escooter_helmet"]
# value_counts() is sorted by frequency, not by class, so align it explicitly
# with the label order used on the x-axis; absent classes count as 0.
class_counts = df_valid["class_label_combined_name"].value_counts().reindex(with_names, fill_value=0)
counts = class_counts.tolist()
total = sum(counts)
counts_percentage = [round(c/total*100, 2) if total else 0.0 for c in counts]
palette = [sns.color_palette()[i] for i in [1, 0, 3, 2]]
barplot = sns.barplot(x=with_names, y=counts_percentage, palette=palette)
barplot.set_title("VALIDATION SPLIT")

for i, value in enumerate(counts_percentage):
    barplot.text(i, value+0.5, str(counts[i]), horizontalalignment="center")
barplot.set_xlabel("Labels")
barplot.set_ylabel("Count (%)");

display(df_valid["class_label_combined_name"].value_counts())
# A bare expression in the middle of a cell is never shown, so print it.
print("SUM: ", df_valid["class_label_combined_name"].value_counts().sum())
U.input_output.save_plt_plot("../illustration_images/valid_split_dist.png")

# Training split

In [None]:
# The training split is every annotation whose frame is not part of the validation split.
in_validation = df["frame_path"].isin(df_valid["frame_path"])
df_train = df[~in_validation]
df_train.to_csv("../dataset/data_final/annotations_train.csv", index=False)

In [None]:
# Bar plot of the class distribution in the training split (percentage on the
# y-axis, absolute count written above each bar), saved as an illustration image.
# `palette` is expected to exist from the validation-plot cell.
sns.set(rc={'figure.figsize':(20,7)}, style="whitegrid")
# Spell the class order out here instead of reusing `with_names`: the "Plots"
# cell rebinds that name to display labels, which would no longer match the data.
class_order = ["cycle_nohelmet", "cycle_helmet", "escooter_nohelmet", "escooter_helmet"]
# value_counts() is sorted by frequency, so align it with the x-axis label order.
class_counts = df_train["class_label_combined_name"].value_counts().reindex(class_order, fill_value=0)
counts = class_counts.tolist()
total = sum(counts)
counts_percentage = [round(c/total*100, 2) if total else 0.0 for c in counts]
barplot = sns.barplot(x=class_order, y=counts_percentage, palette=palette)
barplot.set_title("TRAINING SPLIT")

for i, value in enumerate(counts_percentage):
    barplot.text(i, value+0.5, str(counts[i]), horizontalalignment="center")
barplot.set_xlabel("Labels")
barplot.set_ylabel("Count (%)");

display(df_train["class_label_combined_name"].value_counts())
# A bare expression in the middle of a cell is never shown, so print it.
print("SUM: ", df_train["class_label_combined_name"].value_counts().sum())
U.input_output.save_plt_plot("../illustration_images/train_split_dist.png")

# Plots

In [None]:
# For each saved split, draw three stacked panels: overall class distribution,
# distribution per weekday, and distribution per hour of day.
# `palette` is expected to exist from the validation-plot cell.
df_valid = pd.read_csv("../dataset/data_final/annotations_valid.csv")
df_train = pd.read_csv("../dataset/data_final/annotations_train.csv")

days = list(calendar.day_name)
sns.set_style("whitegrid")
name_map = {
    'cycle_nohelmet':'Cyclist without helmet', 
    'cycle_helmet':'Cyclist with helmet',
    'escooter_nohelmet': "E-scooter without helmet",
    'escooter_helmet':"Escooter with helmet"
}
with_names = list(name_map.values())

# The loop variable is deliberately not called `df`, so the notebook-wide
# annotation frame is not overwritten by running this cell.
for df_name, split_df in [("Validation", df_valid), ("Train", df_train)]:
    
    # Setup: work on a copy with human-readable class names
    split_df = split_df.copy()
    split_df["class_label_combined_name"] = split_df["class_label_combined_name"].map(lambda x: name_map[x])
    fig, axes = plt.subplots(3, 1, figsize=(20,14))
    plt.subplots_adjust(hspace=0.3)
    
    # Overall
    ax = axes[0]
    # value_counts() is sorted by frequency, so align it with the x-axis label order.
    class_counts = split_df["class_label_combined_name"].value_counts().reindex(with_names, fill_value=0)
    counts = class_counts.tolist()
    total = sum(counts)
    counts_percentage = [round(c/total*100, 2) if total else 0.0 for c in counts]
    barplot = sns.barplot(x=with_names, y=counts_percentage, palette=palette, ax=ax)
    
    for bar_index, value in enumerate(counts_percentage):
        ax.text(bar_index, value+0.5, str(counts[bar_index]), horizontalalignment="center")
    ax.set_title(f"{df_name} Label distribution", loc='left')
    ax.set_xlabel("Labels")
    ax.set_ylabel("Count (%)")
    ax.set_ylim(0,60)
    if df_name == "Validation": ax.set_ylim(0,30)
    
    # Weekdays
    ax = axes[1]
    df_ = split_df.groupby(["week_day", "class_label_combined_name"]).size().reset_index(name="counts")
    df_["week_day"] = df_["week_day"].apply(lambda x: x.capitalize() if isinstance(x, str) else "")
    sns.barplot(data=df_, x="week_day", y="counts", hue="class_label_combined_name", order=days, 
                hue_order=with_names, palette=palette, ax=ax)
    ax.set_title(f"{df_name} Label distribution - Weekdays", loc='left')
    ax.legend(loc='upper right')
    ax.set_xlabel("Week Day")
    ax.set_ylabel("Count")
    ax.set_ylim(0,400)
    if df_name == "Validation": ax.set_ylim(0,27)
    
    # Hourly
    ax = axes[2]
    # groupby drops rows with a missing hour by default, so every key is a number here.
    df_ = split_df.groupby(["date_hour", "class_label_combined_name"]).size().reset_index(name="counts")
    df_["date_hour"] = df_["date_hour"].apply(lambda x: str(int(x)) + ":00")
    ax = sns.barplot(data=df_, x="date_hour", y="counts", hue="class_label_combined_name", 
                     hue_order=with_names, palette=palette, ax=ax)
    ax.set_title(f"{df_name} Label distribution - Hourly", loc='left')
    ax.set_xlabel("Hour")
    ax.set_ylabel("Count")
    ax.legend(loc='upper right')
    ax.set_ylim(0,400)
    if df_name == "Validation": ax.set_ylim(0,27)