# File description
Loops over all the collected videos and extracts information about `location`, `date`, `time` and `names`. This information is used later to enrich the bounding box annotations with additional information.<br>
Extracting the time information was surprisingly difficult, at least on Windows. What ended up working was the tool [exiftool](https://www.exiftool.org/), which can extract a video attribute called `media created` that contains everything related to the time of recording. The important thing about this attribute is that it remains unchanged even after copying the files or performing similar operations.
<br>
<br>
The resulting dataframe looks something like this
<img src="../illustration_images/df_video_info_example.png" width="800" /> 

In [None]:
# pip install git+https://github.com/Jako-K/utils <-- dutils
import dutils as U
U.jupyter_ipython.adjust_screen_width(75)

import matplotlib.pyplot as plts
import seaborn as sns; sns.set_style("whitegrid")
from tqdm.notebook import tqdm
import random
from natsort import natsorted
import pandas as pd
import numpy as np
from glob import glob
import subprocess
import cv2
import re
import sys
import os
import datetime

# Load video paths

In [None]:
# One folder per recording day, gathered location by location (Egmont, Valby, Lyngbyvej)
folder_paths = [
    folder
    for pattern in ("E:/Egmont/*", "E:/Valby/*", "E:/Lyngbyvej/*")
    for folder in glob(pattern)
]
video_paths_old = []

for folder_path in folder_paths:
    assert os.path.exists(folder_path)

    # Absolute paths of everything stored directly inside the folder
    video_paths = [os.path.abspath(path) for path in glob(os.path.join(folder_path, "*"))]

    # Sanity checks: the folders should hold nothing but .mp4 files, and no path may contain a space
    assert all(path.lower().endswith(".mp4") for path in video_paths), "Expected only .mp4 files"
    assert all(" " not in path for path in video_paths), "exiftool cannot handle spaces the way the code is currenly written"

    video_paths_old.extend(video_paths)

# Notebook cell output: total number of videos found
len(video_paths_old)

# Helper function to extract "media created" data

In [None]:
def get_media_created(video_path: str) -> dict:
    """
    Read the "Media Create Date" tag of a video with exiftool and split it into its parts.

    Runs the bundled `./exiftool-12.40/exiftool(-k).exe` on `video_path` and parses the first
    output line that contains "Media Create Date".

    :param video_path: Path to an existing video file.
    :return: dict with the keys
        "path"     -> absolute version of `video_path`
        "day"      -> day of month without leading zero, e.g. "4"
        "month"    -> month without leading zero, e.g. "2"
        "year", "hour", "minut" -> strings exactly as they appear in the exiftool output
        "combined" -> "<day>-<month>-<year>_<hour>.<minut>" built from the unmodified strings
    :raises AssertionError: If not running on Windows, exiftool is missing or `video_path` does not exist.
    :raises ValueError: If exiftool prints nothing, or no parsable "Media Create Date" line is found.
    """
    assert U.system_info.on_windows(), "exiftool is a Windows only build"
    assert os.path.exists("./exiftool-12.40/exiftool(-k).exe"), "Cannot find exiftool"
    assert os.path.exists(video_path), "Received bad video path"

    # NOTE(review): the path is lower-cased before it is handed to exiftool. That is harmless on a
    # case-insensitive file system, but confirm whether it is actually required.
    cmd = f'"./exiftool-12.40/exiftool(-k).exe" "{video_path.lower()}"'

    # `communicate` drains stdout AND stderr and waits for the process to exit, so a full stderr
    # pipe cannot stall exiftool. The context manager closes the pipes, which matters when this
    # function is called once per video for thousands of videos.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) as process:
        stdout, _ = process.communicate()
    lines = [raw_line.decode('UTF-8', 'ignore') for raw_line in stdout.splitlines()]

    if not lines:
        raise ValueError(f"exiftool was unable to read the video file at: {video_path}")

    media_created_line = next((line for line in lines if "Media Create Date" in line), None)
    if media_created_line is None:
        raise ValueError(f'exiftool did not report a "Media Create Date" for the video file at: {video_path}')

    # The line is expected to look like "Media Create Date   : 2021:12:14 10:23:45":
    # everything after the first colon is "<year>:<month>:<day> <hour>:<minute>:<second>"
    try:
        _, _, value = media_created_line.partition(":")
        date_part, time_part = value.split()[:2]
        year, month, day = date_part.split(":")
        hour, minut = time_part.split(":")[:2]
    except ValueError as error:
        raise ValueError(
            f'Unable to parse the "Media Create Date" line {media_created_line!r} for the video file at: {video_path}'
        ) from error
    combined = f"{day}-{month}-{year}_{hour}.{minut}"

    return {
        "path": os.path.abspath(video_path),
        "day": str(int(day)),  # str/int round trip removes leading zeros
        "month": str(int(month)),
        "year": year,
        "hour": hour,
        "minut": minut,
        "combined": combined
    }

# Create a dictionary that maps from old_name to new_name 
NOTE: This takes 30+ minutes

In [None]:
rename_map = {}

# The cameras malfunctioned on "LYNGBYVEJ_08-12-2021" and "LYNGBYVEJ_09-12-2021": the day was
# encoded as the 14th instead of the 8th and the 15th instead of the 9th. For these folders the
# day found in the folder name replaces the day reported by the video itself.
# "LYNGBYVEJ_10-12-2021" is overridden the same way (presumably the same malfunction - confirm).
day_overrides = {
    "LYNGBYVEJ_08-12-2021": "8",
    "LYNGBYVEJ_09-12-2021": "9",
    "LYNGBYVEJ_10-12-2021": "10",
}

for old_video_path in tqdm(video_paths_old):
    # The parent folder is named "<LOCATION>_<DD-MM-YYYY>"
    path_parts = os.path.normpath(old_video_path).split(os.sep)
    assert len(path_parts) >= 2, "Something wrong with the path-split"
    folder_name = path_parts[-2]

    location, date = folder_name.split("_")
    location = location.lower()
    assert location in ["valby", "lyngbyvej", "egmont"], "Something is wrong with the location extracted from the video path"

    # Recording time as stored inside the video file
    media_created = get_media_created(old_video_path)
    recorded_year_month = (media_created["year"], media_created["month"])
    # Only December 2021 and February 2022 are accepted
    assert recorded_year_month in [("2021", "12"), ("2022", "2")], "Received `media_created` outside expected period"

    # Date taken from the folder name, with leading zeros removed e.g. "04" -> 4 -> "4"
    day_path, month_path, year_path = [str(int(part)) for part in date.split("-")]

    # Patch the day for the folders listed in `day_overrides`
    if folder_name in day_overrides:
        corrected_day = day_overrides[folder_name]
        media_created["day"] = corrected_day
        # `combined` starts with a two digit day, so only its first two characters are replaced
        media_created["combined"] = corrected_day.zfill(2) + media_created["combined"][2:]

    # The date stored in the video and the date in the folder name have to be identical
    date_from_path = (day_path, month_path, year_path)
    date_from_media = (media_created["day"], media_created["month"], media_created["year"])
    assert date_from_path == date_from_media, "the date in `media_created` and the date extracted from the video path most agree"

    # Map "folder name + file name" (e.g. "EGMONT_04-02-2022/FILE0009.MP4") to the new video name
    key = os.path.normpath("/".join(old_video_path.split(os.sep)[-2:]))
    new_video_name = "_".join([location, media_created["combined"], os.path.basename(old_video_path)])
    rename_map[key] = new_video_name

# Create dataframe and save it as a csv file

In [None]:
# Prepare data for dataframe: one row per video in `rename_map`
for_pandas = []
week_day_map = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}

for old_file_name, new_file_name in rename_map.items():
    # A new name is "<location>_<DD-MM-YYYY>_<HH.MM>_<original file name>". Only the first three
    # underscores are separators, so an original file name that contains "_" is kept in one piece.
    location, date, time, file_name = new_file_name.split("_", 3)

    # Date
    day, month, year = map(int, date.split("-"))
    hour, minut = map(int, time.split("."))
    week_day_name = week_day_map[datetime.datetime(year, month, day).weekday()].lower()
    date_all = f"{date} {time.replace('.', ':')}"

    # File name without its extension. `splitext` only strips the final extension, so names
    # with additional dots do not break the unpacking.
    file_name = os.path.splitext(file_name)[0]
    # Always store the mapping key with forward slashes, independent of the OS separator
    old_file_name_clean = "/".join(old_file_name.split(os.sep))

    for_pandas.append([location, week_day_name, date_all, day, month, year,
                       hour, minut, file_name, old_file_name_clean, new_file_name])

# Save video data as a csv file
df = pd.DataFrame(
    for_pandas,
    columns=[
        "location", "week_day", "date_all", "date_day", "date_month", "date_year",
        "date_hour", "date_minut", "video_original_file_name", "mapping_key", "video_file_name"
    ]
)
df.to_csv("../video_data/video_info.csv", index=False)

# Change names
NOTE: This function is only here to check that the extracted date+time is correct

In [None]:
def get_new_video_name(df: pd.DataFrame, path: str) -> str:
    """
    Look up the new file name of the video stored at `path`.

    The lookup key is "<parent folder name>/<file name>", which is compared against the
    "mapping_key" column of `df`.

    :param df: Dataframe with the columns "mapping_key" and "video_file_name".
    :param path: Path to an existing .mp4 file.
    :return: The "video_file_name" value of the first row whose "mapping_key" matches.
    :raises AssertionError: If `path` is not an existing file or does not end with .mp4.
    :raises IndexError: If no row of `df` matches the video.
    """
    assert os.path.exists(path) and os.path.isfile(path), "Bad path"
    assert path[-4:].lower() == ".mp4", "Expected .MP4 extension"

    # Normalise first so forward slashes (on Windows) and doubled separators cannot end up in the key
    key = "/".join(os.path.normpath(path).split(os.sep)[-2:])
    matches = df.loc[df["mapping_key"] == key, "video_file_name"]
    if matches.empty:
        raise IndexError(f"No row with mapping_key {key!r} was found for the video at: {path}")
    return matches.iloc[0]
# Spot check: draw one random video and print its path (with "_yolo" swapped for "_initial")
# followed by the new name stored for it in `df`
candidate_paths = glob("C:/Users/JK/Desktop/reduced_yolo/**/*.MP4")
path = random.choice(candidate_paths)
displayed_path = path.replace("\\", "/").replace("_yolo", "_initial")
print(displayed_path)
print(get_new_video_name(df, path))

# Testing

In [None]:
# Folder names on the source drive; folders containing "EGMONT" are left out of the comparison
old = [os.path.basename(p) for p in glob("E:/Egmont/*") + glob("E:/Valby/*") + glob("E:/Lyngbyvej/*")]
old = natsorted([folder_name for folder_name in old if "EGMONT" not in folder_name])

# Folder names in the reduced copy, filtered the same way
new = [os.path.basename(p) for p in glob("C:/Users/JK/Desktop/reduced_initial/**")]
new = natsorted([folder_name for folder_name in new if "EGMONT" not in folder_name])

# Compare the names themselves: two lists of equal length can still contain different folders
if old != new:
    print("There's at least one mismatch between the new and the old video folder")
    for (o, n) in zip(old, new):
        if o != n:
            print("! --> ", o, n)
        else:
            print(o, n)

    # `zip` stops at the shorter list, so folders without a counterpart are reported separately
    paired = min(len(old), len(new))
    for o in old[paired:]:
        print("! --> ", o, None)
    for n in new[paired:]:
        print("! --> ", None, n)