In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import os
import shutil
import cv2

from my_utils import *

Data Preparation

In [2]:
# Each output folder holds one chunk of CHUNK_SIZE consecutive image ids and is
# named "<first_id>_<last_id>" (1_5000, 5001_10000, ...).
CHUNK_SIZE = 5000
starts = [1 + CHUNK_SIZE * i for i in range(10)]


# Fields to be used
kpi_cols = ["image_id", "precision", "recall", "accuracy"]

metadata_cols = [
    "image_id",
    "country_code",
    "weather",
    "road_type",
    "road_condition",
    "time_of_day",
    "num_vehicles",
    "longitude",
    "latitude",
    "solar_angle_elevation",
]

calibration_cols = [
    "image_id",
    "focal_length_x",
    "focal_length_y",
    "principle_point_x",
    "principle_point_y",
    "camera_pose_x",
    "camera_pose_y",
    "camera_pose_yaw",
    "camera_pose_pitch",
    "camera_pose_roll",
    "camera_pose_z",
    "horizontal_fov",
    "vertical_fov",
]

ego_motion_cols = [
    "image_id",
    "ego_pose_x",
    "ego_pose_y",
    "ego_pose_yaw",
    "ego_pose_pitch",
    "ego_pose_roll",
    "speed_var",
    "mean_jerk",
    "max_jerk",
    "st_jerk",
    "mean_angular_acc",
    "max_angular_acc",
    "st_angular_acc",
    "mean_lateral_acc",
    "max_lateral_acc",
    "st_lateral_acc",
]

# Containers for the per-chunk subsets
kpi_subs = []
metadata_subs = []
calibration_subs = []
ego_motion_subs = []
pred_subs = []
gt_subs = []

for start in starts:
    end = start + CHUNK_SIZE - 1
    chunk_dir = f"outputs/{start}_{end}"

    kpi_subs.append(pd.read_csv(f"{chunk_dir}/kpi/kpi.csv", usecols=kpi_cols))
    metadata_subs.append(
        pd.read_csv(f"{chunk_dir}/metadata/metadata.csv", usecols=metadata_cols)
    )
    calibration_subs.append(
        pd.read_csv(f"{chunk_dir}/metadata/calibration.csv", usecols=calibration_cols)
    )
    ego_motion_subs.append(
        pd.read_csv(f"{chunk_dir}/metadata/ego_motion.csv", usecols=ego_motion_cols)
    )
    pred_subs.append(pd.read_csv(f"{chunk_dir}/predictions.csv"))
    gt_subs.append(pd.read_csv(f"{chunk_dir}/ground_truths.csv"))

# ignore_index=True gives every combined frame a unique 0..N-1 index. Without it
# each chunk keeps its own 0..len-1 labels, so the labels repeat once per chunk
# and boolean masks built from one frame cannot be aligned reliably with another.
# NOTE(review): later cells apply masks computed on `kpi` to `metadata`, which
# assumes both files list the images in the same row order - confirm.
kpi = pd.concat(kpi_subs, axis=0, ignore_index=True)
metadata = pd.concat(metadata_subs, axis=0, ignore_index=True)
calibration = pd.concat(calibration_subs, axis=0, ignore_index=True)
ego_motion = pd.concat(ego_motion_subs, axis=0, ignore_index=True)
preds = pd.concat(pred_subs, axis=0, ignore_index=True)
gts = pd.concat(gt_subs, axis=0, ignore_index=True)

# Only ground-truth boxes of class "Car" are kept for the analysis
gts = gts[gts["class"] == "Car"]

### A - Analyse KPI Distribution

In [None]:
def plot_fit_distribution(data, title="Distribution", dist_name=None, bins=50):
    """Plot a density histogram of ``data`` and optionally overlay a fitted PDF.

    Parameters:
        data (array-like): 1-D numeric sample (list, NumPy array or pd.Series).
        title (str): Title of the plot. It has a default so the function can be
            called as ``plot_fit_distribution(data, dist_name="beta")``.
        dist_name (str or None): Name of a distribution in ``scipy.stats``
            (e.g. "beta", "norm"). When given, the distribution is fitted to
            ``data``, its PDF is drawn over the histogram and the fitted
            parameters are printed. When None, only the histogram is drawn.
        bins (int): Number of histogram bins, passed to ``plt.hist``.

    Raises:
        ValueError: If ``dist_name`` is given but ``data`` is empty.
        AttributeError: If ``dist_name`` is not an attribute of ``scipy.stats``.
    """
    values = np.asarray(data)

    # Fail early with a clear message instead of an obscure error from the
    # fitting routine, and before an empty figure is created.
    if dist_name is not None and values.size == 0:
        raise ValueError(f"Cannot fit '{dist_name}': data is empty")

    # Plot histogram and PDF
    plt.figure(figsize=(8, 5))
    plt.hist(values, bins=bins, density=True, alpha=0.5, label="Data Histogram")

    if dist_name is not None:
        dist = getattr(stats, dist_name)  # Get distribution object from scipy.stats
        params = dist.fit(values)  # Fit distribution to data

        # Evaluate the fitted PDF over the observed range of the data
        x = np.linspace(values.min(), values.max(), 1000)
        pdf_fitted = dist.pdf(x, *params)

        plt.plot(x, pdf_fitted, label=f"{dist_name} fit", linewidth=2)

        print(f"Fitted parameters for {dist_name}: {params}")

    plt.title(title)
    plt.xlabel("Value")
    plt.ylabel("Density")
    plt.legend()
    plt.grid(True)
    plt.show()
    

A1 - Recall

In [None]:
# Histogram of the per-image recall over the whole dataset (no distribution fit).
# Variant that leaves out perfect scores: kpi.loc[kpi["recall"] < 1, "recall"]
data = kpi.loc[:, "recall"].to_numpy()

plot_fit_distribution(data, title="Recall Distribution")


In [None]:
# Distribution of num_vehicles over the images whose recall is exactly 0
selector = kpi["recall"].eq(0)
data = metadata.loc[selector, "num_vehicles"]

# Summary statistics
print(f"Num_Vehicle Summary (When Recall=0): \n{data.describe()}\n")

# Recall is reported as zero when an image contains no vehicle at all, so
# count how many of the zero-recall images have num_vehicles == 0.
print(int(data.eq(0).sum()))

In [8]:
# Copy (up to) the first 100 images whose accuracy lies in [0.2, 0.4] into one
# folder so they can be inspected manually.
low_accuracy_dir = "outputs/grouped_images/low_accuracy"
# shutil.copy does not create missing directories, so make sure the target exists
os.makedirs(low_accuracy_dir, exist_ok=True)

selector = kpi["accuracy"].between(0.2, 0.4)  # inclusive on both ends
print(selector.sum())
for img_id in kpi.loc[selector, "image_id"].to_list()[:100]:
    img_folder = f"single_frames_img/{img_id:06d}/camera_front_blur"
    # The first entry returned by os.listdir is used
    # NOTE(review): presumably each folder holds exactly one image - confirm
    img_name = os.listdir(img_folder)[0]

    img_src = os.path.join(img_folder, img_name)
    img_dst = os.path.join(low_accuracy_dir, img_name)

    shutil.copy(img_src, img_dst)
    

1169


In [9]:
# Draw the predicted and ground-truth boxes onto every image of the
# low-accuracy folder; each file is overwritten with its annotated version.
img_folder = "outputs/grouped_images/low_accuracy"
for img_path in os.listdir(img_folder):
    img_file = os.path.join(img_folder, img_path)

    # The part of the file name before the first "_" is parsed as the image id
    img_id = img_path.split("_")[0]
    numeric_id = int(img_id)

    # Columns 1..4 of both tables are handed to visualize_bboxes as the boxes
    selector = preds["image_id"].eq(numeric_id)
    img_pred = preds.loc[selector].iloc[:, 1:5].to_numpy()
    selector = gts["image_id"].eq(numeric_id)
    img_gt = gts.loc[selector].iloc[:, 1:5].to_numpy()

    img_bboxes = visualize_bboxes(img_file, img_pred, img_gt)
    # Channel order is swapped RGB -> BGR before handing the image to OpenCV
    img_bboxes = cv2.cvtColor(img_bboxes, cv2.COLOR_RGB2BGR)
    cv2.imwrite(img_file, img_bboxes)

A2 - Precision

In [None]:
# Histogram of the per-image precision over the whole dataset (no distribution fit).
# Variant that leaves out perfect scores: kpi.loc[kpi["precision"] < 1, "precision"]
# Author's note on the resulting shape: "Normal"
data = kpi.loc[:, "precision"].to_numpy()

plot_fit_distribution(data, title="Precision Distribution")

A3 - Accuracy

In [None]:
# Histogram of the per-image accuracy over the whole dataset (no distribution fit).
# Variant that leaves out perfect scores: kpi.loc[kpi["accuracy"] < 1, "accuracy"]
# Author's note on the resulting shape: "Normal"
data = kpi.loc[:, "accuracy"].to_numpy()

plot_fit_distribution(data, title="Accuracy Distribution")

### B - Analyse Categorical Metadata Distribution

In [None]:
def plot_categorical_distribution(data, title="Categorical Distribution", rotate_xticks=True):
    """
    Draw a bar chart showing how often each category occurs in ``data``.

    Parameters:
        data (list, pd.Series): Categorical observations to count.
        title (str): Text shown above the chart.
        rotate_xticks (bool): If True, x tick labels are rotated by 45 degrees
            and right-aligned so long labels stay readable.
    """
    # Anything that is not already a Series is wrapped in one
    series = data if isinstance(data, pd.Series) else pd.Series(data)

    # One bar per distinct value; value_counts orders them by frequency
    frequencies = series.value_counts()

    _, ax = plt.subplots(figsize=(6, 3))
    sns.barplot(x=frequencies.index, y=frequencies.values, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Category")
    ax.set_ylabel("Count")
    if rotate_xticks:
        plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    ax.grid(axis="y", linestyle="--", alpha=0.7)
    plt.show()

B1 - Country Code

In [None]:
# Number of images per country code
data = metadata.loc[:, "country_code"]
plot_categorical_distribution(data=data, title="Country Code Count")

Note: it makes sense to use only the data from Poland (0) or Germany (1) to remove geographical bias.

B2 - Weather

In [None]:
# Number of images per weather code
data = metadata.loc[:, "weather"]
plot_categorical_distribution(data=data, title="Weather Count")

B2.1 Check images tagged with "rain", "snow", and "fog"

In [None]:
# Weather label -> numeric code used in the "weather" metadata column
target_weathers = {"rain": 6, "snow": 7, "fog": 8}

for weather, code in target_weathers.items():
    # Step 1: Collect the image file of every frame tagged with this weather code
    selector = metadata["weather"].eq(code)
    image_ids = metadata.loc[selector, "image_id"]

    folders = [
        os.path.join("single_frames_img", f"{image_id:06d}", "camera_front_blur")
        for image_id in image_ids
    ]
    # The first directory entry of each frame folder is taken as its image
    image_files = [os.path.join(folder, os.listdir(folder)[0]) for folder in folders]

    # Step 2: Write the paths to one csv per weather type (by_weather_<name>.csv)
    image_files = pd.DataFrame({"file_name": image_files})
    csv_path = os.path.join("outputs", "grouped_images", f"by_weather_{weather}.csv")
    image_files.to_csv(csv_path, index=False)

In [None]:
# Step 3: Navigate the images using the utility function
# Each csv has a single "file_name" column, loaded here as a plain list of paths.
images_rain = pd.read_csv("outputs/grouped_images/by_weather_rain.csv")["file_name"].to_list()
images_snow = pd.read_csv("outputs/grouped_images/by_weather_snow.csv")["file_name"].to_list()
images_fog = pd.read_csv("outputs/grouped_images/by_weather_fog.csv")["file_name"].to_list()

# NOTE(review): 1190 is presumably the position in the list to start from -
# confirm against navigate_images in my_utils
navigate_images(images_rain, 1190)

B2.2 Check images (6 samples) with wiper appearance

In [None]:
# Check some sample images with wiper appearance: draw the predicted and
# ground-truth boxes on each of them and save the result to a separate folder.
images_wiper = pd.read_csv("outputs/grouped_images/by_wiper.csv")
images_wiper = images_wiper["file_name"].to_list()

# The second path component of each file name is parsed as the image id
image_ids = [int(fn.split("/")[1]) for fn in images_wiper]

# cv2.imwrite does not create missing directories; it only returns False,
# so the output folder has to exist before the loop starts.
wiper_dir = "outputs/grouped_images/by_wiper"
os.makedirs(wiper_dir, exist_ok=True)

for image_path, image_id in zip(images_wiper, image_ids):
    # Columns 1..4 of both tables are handed to visualize_bboxes as the boxes
    gt_bboxes = gts[gts["image_id"] == image_id].iloc[:, 1:5].to_numpy()
    pred_bboxes = preds[preds["image_id"] == image_id].iloc[:, 1:5].to_numpy()

    img_bboxes = visualize_bboxes(image_path, pred_bboxes, gt_bboxes)
    # Channel order is swapped RGB -> BGR before handing the image to OpenCV
    img_bboxes = cv2.cvtColor(img_bboxes, cv2.COLOR_RGB2BGR)

    out_path = os.path.join(wiper_dir, os.path.basename(image_path))
    # Report failed writes instead of dropping them silently
    if not cv2.imwrite(out_path, img_bboxes):
        print(f"Warning: could not write {out_path}")

In [None]:
# Browse the annotated wiper images written to the by_wiper folder
folder = "outputs/grouped_images/by_wiper"
images_wiper_bboxes = []
for fn in os.listdir(folder):
    images_wiper_bboxes.append(os.path.join(folder, fn))

navigate_images(images_wiper_bboxes)

B2.3 Images with weird color

In [None]:
# Load the ids of the images flagged as having weird colors
weird_color_csv = "outputs/grouped_images/by_weird_color.csv"
wanted_ids = set(pd.read_csv(weird_color_csv)["image_id"].to_list())

# Look the flagged ids up in the list of rain images. Ids and paths are
# collected together so that row i of both columns always describes the same
# image, regardless of the order in which the ids were listed in the csv.
rain_paths = pd.read_csv("outputs/grouped_images/by_weather_rain.csv")["file_name"].to_list()

image_ids = []
image_paths = []
for fp in rain_paths:
    # The second path component of each file name is parsed as the image id
    image_id = int(fp.split("/")[1])
    if image_id in wanted_ids:
        image_ids.append(image_id)
        image_paths.append(fp)

# Stop before the csv is overwritten if some flagged ids have no rain image;
# otherwise those ids would silently disappear from the file.
missing_ids = wanted_ids.difference(image_ids)
if missing_ids:
    raise ValueError(
        f"{len(missing_ids)} image ids from {weird_color_csv} were not found in "
        f"by_weather_rain.csv: {sorted(missing_ids)}"
    )

# The second-to-last "_"-separated token of the path is stored as the collector
image_collectors = [fp.split("_")[-2] for fp in image_paths]

new_data = {
    "image_id": image_ids,
    "image_path": image_paths,
    "collector": image_collectors
}

new_data = pd.DataFrame(new_data)
# index=False keeps the file free of an unnamed index column, matching the
# by_weather_*.csv files written by this notebook
new_data.to_csv(weird_color_csv, index=False)

# Save original images to a folder (created first: shutil.copy does not
# create missing directories)
weird_color_dir = "outputs/grouped_images/by_weird_color"
os.makedirs(weird_color_dir, exist_ok=True)
for fp in image_paths:
    img_name = os.path.split(fp)[1]
    img_dst = os.path.join(weird_color_dir, img_name)

    shutil.copy(fp, img_dst)

B3 - Road Type

In [None]:
# Number of images per road type code
data = metadata.loc[:, "road_type"]
plot_categorical_distribution(data=data, title="Road Type Count")

Note: makes sense to use road type = city (0) only

B4 - Road Condition

In [None]:
# Number of images per road condition code
data = metadata["road_condition"]
plot_categorical_distribution(data, title="Road Condition Count")

B5 -  Time of Day

In [None]:
# Number of images per time-of-day code
data = metadata.loc[:, "time_of_day"]
plot_categorical_distribution(data=data, title="Time of Day Count")

### C - Apply Filtering on KPI & Distribution by Categories

In [None]:
# Base filter shared by the cells below: country_code 0 (Poland) and
# road_type 0 (city), following the notes in section B.
selector = metadata["country_code"].eq(0) & metadata["road_type"].eq(0)

C1.1 - Fit Filtered Recall

In [None]:
# Recall of the filtered images, restricted to values strictly inside (0.05, 0.95)
# NOTE(review): the extremes are presumably cut so the mass at 0 and 1 does not
# dominate the beta fit - confirm
recall_in_range = (kpi["recall"] > 0.05) & (kpi["recall"] < 0.95)
data = kpi.loc[selector & recall_in_range, "recall"]

# A title is passed explicitly: plot_fit_distribution takes it as its second argument
plot_fit_distribution(data, "Filtered Recall Distribution", dist_name="beta")

C1.2 - Fit Filtered Precision

In [None]:
# Precision of the filtered images, restricted to values strictly inside (0.05, 0.95)
# NOTE(review): the extremes are presumably cut so the mass at 0 and 1 does not
# dominate the beta fit - confirm
precision_in_range = (kpi["precision"] > 0.05) & (kpi["precision"] < 0.95)
data = kpi.loc[selector & precision_in_range, "precision"]

# A title is passed explicitly: plot_fit_distribution takes it as its second argument
plot_fit_distribution(data, "Filtered Precision Distribution", dist_name="beta")

C1.3 - Fit Filtered Accuracy

In [None]:
# Accuracy of all filtered images (no range restriction is applied here)
data = kpi.loc[selector, "accuracy"]

# A title is passed explicitly: plot_fit_distribution takes it as its second argument
plot_fit_distribution(data, "Filtered Accuracy Distribution", dist_name="beta")

C2.1 - Recall Depending on Weather

In [None]:
# Recall of the filtered images for a single weather code, restricted to values
# strictly inside (0.05, 0.95). Change the code to inspect another weather type.
weather_selector = metadata["weather"] == 4
recall_in_range = (kpi["recall"] > 0.05) & (kpi["recall"] < 0.95)
data = kpi.loc[selector & weather_selector & recall_in_range, "recall"]

# A title is passed explicitly: plot_fit_distribution takes it as its second argument
plot_fit_distribution(data, "Filtered Recall Distribution (weather code 4)", dist_name="beta")

C2.2 - Precision Depending on Weather

In [None]:
# Precision of the filtered images for a single weather code, restricted to
# values strictly inside (0.05, 0.95). Change the code to inspect another weather type.
weather_selector = metadata["weather"] == 1
precision_in_range = (kpi["precision"] > 0.05) & (kpi["precision"] < 0.95)
data = kpi.loc[selector & weather_selector & precision_in_range, "precision"]

# A title is passed explicitly: plot_fit_distribution takes it as its second argument
plot_fit_distribution(data, "Filtered Precision Distribution (weather code 1)", dist_name="uniform")

C2.3 - Accuracy Depending on Weather

In [None]:
# Accuracy of the filtered images for a single weather code
# (7 is the code mapped to "snow" in section B2.1).
weather_selector = metadata["weather"] == 7
data = kpi.loc[selector & weather_selector, "accuracy"]

# A title is passed explicitly: plot_fit_distribution takes it as its second argument
plot_fit_distribution(data, "Filtered Accuracy Distribution (weather code 7)", dist_name="beta")