# EDA on BDD100k

In [None]:
import os
import random

import cv2
from PIL import Image, ImageDraw, ImageFont
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Raise pandas' display limits so wide/long frames print without truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 10000),
):
    pd.set_option(option_name, option_value)

## Helper Functions

In [None]:
# Outline colour for each object category when its boxes are drawn.
# Categories missing from this mapping fall back to "white" in
# draw_bounding_boxes.
color_map = {
    "pedestrian": "blue",
    "rider": "green",
    "car": "red",
    "truck": "sienna",
    "bus": "cyan",
    "train": "pink",
    "motorcycle": "yellow",
    "bicycle": "orange",
    "traffic light": "lime",
    "traffic sign": "magenta"
}
def draw_bounding_boxes(image_path, labels):
    """Open an image and outline every labelled 2D box on it.

    Args:
        image_path: Path of the image file to open.
        labels: Iterable of label dicts, each with a 'category' string and a
            'box2d' dict holding 'x1', 'y1', 'x2', 'y2'. ``None`` is treated
            as "no labels"; labels without a 'box2d' entry are skipped.

    Returns:
        The PIL image with one rectangle per box, coloured through
        ``color_map`` (white for categories missing from it).
    """
    with Image.open(image_path) as img:
        # NOTE(review): the image is returned after the file is closed; this
        # relies on ImageDraw.Draw having loaded the pixel data — confirm
        # against the installed Pillow version.
        draw = ImageDraw.Draw(img)
        for label in labels or ():
            box = label.get('box2d')
            if not box:
                continue
            color = color_map.get(label.get('category'), "white")
            # Order the corners: recent Pillow releases reject rectangles
            # whose second corner precedes the first.
            left, right = sorted((box['x1'], box['x2']))
            top, bottom = sorted((box['y1'], box['y2']))
            draw.rectangle([left, top, right, bottom], outline=color, width=2)
        return img

def plot_image(image_name, labels):
    """Show a training image with its labelled boxes drawn on top.

    The file is looked up under ``images_folder_train``.
    """
    annotated = draw_bounding_boxes(
        os.path.join(images_folder_train, image_name),
        labels,
    )
    plt.figure(figsize=(15, 12))
    plt.imshow(annotated)
    plt.axis('off')
    plt.title(f"Image: {image_name}")
    plt.show()

In [None]:
def vis_image(image_name):
    """Display a training image, without annotations, using matplotlib.

    Prints a message and returns early if OpenCV cannot read the file.
    """
    bgr_pixels = cv2.imread(os.path.join(images_folder_train, image_name))
    if bgr_pixels is None:
        print(f"Failed to load image: {image_name}")
        return

    # OpenCV decodes to BGR channel order; matplotlib expects RGB.
    rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)

    plt.imshow(rgb_pixels)
    plt.title("Image name: " + image_name)
    plt.axis('off')
    plt.show()

In [None]:
def get_random_images(data, n=5):
    """Return up to ``n`` distinct entries drawn at random from ``data``.

    Args:
        data: Sequence to sample from.
        n: Number of entries wanted. When ``data`` holds fewer than ``n``
            entries, all of them are returned (in random order) instead of
            raising.

    Returns:
        A new list of sampled entries.
    """
    return random.sample(data, min(n, len(data)))

## Paths

### Validation

In [None]:
# Validation split: image directory and detection-label JSON file.
# Both paths are relative to the notebook's working directory.
images_folder_val = "100k_images_val/val"
json_labels_val = "bdd100k_det_20_labels_trainval/det_20/det_val.json"

### Train

In [None]:
# Training split: image directory (read by plot_image / vis_image) and
# detection-label JSON file. Both paths are relative to the working directory.
images_folder_train = "100k_images_train/bdd100k/images/100k/train"
json_labels_train = "bdd100k_det_20_labels_trainval/det_20/det_train.json"

## Load Data

In [None]:
# Pin the encoding so parsing does not depend on the platform's default.
with open(json_labels_val, 'r', encoding='utf-8') as file:
    data_val = json.load(file)

In [None]:
# Pin the encoding so parsing does not depend on the platform's default.
with open(json_labels_train, 'r', encoding='utf-8') as file:
    data_train = json.load(file)

## Data Visualisation

In [None]:
# Names of the training entries that have no 'labels' key at all.
no_label_images_list = [
    entry['name'] for entry in data_train if 'labels' not in entry
]
len(no_label_images_list)

In [None]:
# Visualise images in jupyter notebook
# Render each unlabeled training image inline in the notebook.
for unlabeled_name in no_label_images_list:
    vis_image(unlabeled_name)

### Explore Labeled Image Data

#### No labels

#### Labels with errors

In [None]:
# Training image file names whose annotations are plotted below as examples of
# label errors — presumably picked by manual inspection; TODO confirm.
labels_w_errors = ["fddc9505-6dca33bd.jpg", "01632f31-ab7e17e3.jpg",  "69c3f055-23f9a8f4.jpg"]

In [None]:
for label_name in labels_w_errors:
    # label_name is an image file name; look up its annotation entry.
    matching_entry = next(
        (item for item in data_train if item['name'] == label_name),
        None,
    )
    if matching_entry is None:
        print(f"No annotation entry found for image: {label_name}")
        continue
    # Entries without a 'labels' key are drawn without boxes.
    labels_for_image = matching_entry.get('labels', [])
    plot_image(label_name, labels_for_image)

#### Random labeled images

In [None]:
# Sample ten training entries and list their file names for reference.
random_images = get_random_images(data=data_train, n=10)
random_images_names = [sampled_entry["name"] for sampled_entry in random_images]
random_images_names

In [None]:
for sample in random_images:
    image_name = sample['name']
    # Some training entries have no 'labels' key; draw those without boxes
    # instead of failing on the lookup.
    labels = sample.get('labels', [])
    plot_image(image_name, labels)

## Exploratory Data Analysis

### Image-level

In [None]:
# One row per image: its scene attributes plus a label count per category.
flattened_data = []
for item in data_train:
    # Tally how many labels of each category this image carries.
    category_counts = {}
    for label in item.get('labels', []):
        category = label['category']
        category_counts[category] = category_counts.get(category, 0) + 1

    attributes = item['attributes']
    flattened_data.append({
        'image_name': item['name'],
        'weather': attributes.get('weather', np.nan),
        'timeofday': attributes.get('timeofday', np.nan),
        'scene': attributes.get('scene', np.nan),
        # Category counts become extra columns; images lacking a category
        # end up with NaN in that column once the frame is built.
        **category_counts,
    })

image_df = pd.DataFrame(flattened_data)

In [None]:
# Display the per-image table.
image_df

In [None]:
# Column dtypes and non-null counts of the per-image table.
image_df.info()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
image_df.describe(include='all')

In [None]:
# Number of images per 'weather' value.
image_df['weather'].value_counts()

In [None]:
# Number of images per 'timeofday' value.
image_df['timeofday'].value_counts()

In [None]:
# Number of images per 'scene' value.
image_df['scene'].value_counts()

### Label-level

In [None]:
def flatten_data(data):
    """Flatten image annotation entries into one row per labelled object.

    Args:
        data: Iterable of image entries. Each needs a "name"; the
            "attributes" and "labels" keys, and each label's "attributes"
            and "box2d" dicts, are optional.

    Returns:
        A list of dicts, one per label, with the keys image_name, weather,
        timeofday, scene, label_id, occluded, truncated, traffic_light_color,
        category, x1, y1, x2, y2. Anything missing from the input is NaN.
        Entries without labels contribute no rows.
    """
    flattened_data = []

    for entry in data:
        # A missing or null attribute dict is read as empty so every lookup
        # below falls through to its NaN default.
        image_attributes = entry.get("attributes") or {}
        image_fields = {
            "image_name": entry["name"],
            "weather": image_attributes.get("weather", np.nan),
            "timeofday": image_attributes.get("timeofday", np.nan),
            "scene": image_attributes.get("scene", np.nan),
        }

        for label in entry.get("labels") or ():
            label_attributes = label.get("attributes") or {}
            box = label.get("box2d") or {}
            flattened_data.append({
                **image_fields,
                "label_id": label.get("id", np.nan),
                "occluded": label_attributes.get("occluded", np.nan),
                "truncated": label_attributes.get("truncated", np.nan),
                "traffic_light_color": label_attributes.get("trafficLightColor", np.nan),
                "category": label.get("category", np.nan),
                "x1": box.get("x1", np.nan),
                "y1": box.get("y1", np.nan),
                "x2": box.get("x2", np.nan),
                "y2": box.get("y2", np.nan),
            })

    return flattened_data

# Rebinds flattened_data (previously the per-image rows) to the per-label rows
# of the training split, then builds the label-level frame from them.
flattened_data = flatten_data(data_train)
initial_df = pd.DataFrame(flattened_data)

In [None]:
# Display the per-label table.
initial_df

##### 2D boxes

In [None]:
# Keep the image context, category and box corners; drop per-label attributes
# and renumber the rows from zero.
box2d_df = (
    initial_df
    .drop(columns=["traffic_light_color", "truncated", "occluded"])
    .reset_index(drop=True)
)

In [None]:
# Display the box-geometry table.
box2d_df

In [None]:
# Category names collected as a set (so unordered).
# NOTE(review): presumably the full category list of the detection labels —
# confirm against box2d_df['category'].unique(). The functions below use their
# own local `categories` variables and do not read this set.
categories = {'bicycle',
       'bus',
       'car',
       'motorcycle',
       'other person',
       'other vehicle',
       'pedestrian',
       'rider',
       'traffic light',
       'traffic sign',
       'trailer',
       'train',
       'truck'
    }

In [None]:
def plot_category_frequencies(df):
    """Plot each category's share of all labels as a bar chart.

    Bar heights are proportions of the total label count; the raw count is
    written above each bar.

    Args:
        df: DataFrame with a 'category' column, one row per label.
    """
    category_counts = df['category'].value_counts()
    normalized_counts = category_counts / category_counts.sum()

    plt.figure(figsize=(10, 4))
    ax = normalized_counts.plot(kind='bar', color='skyblue')
    plt.xlabel('Category')
    plt.ylabel('Proportion')
    plt.title('Normalized Frequency of Categories')

    # Pair counts and proportions by position. Both Series are indexed by
    # category name, so an integer key would be treated as a label lookup.
    for position, (count, proportion) in enumerate(zip(category_counts, normalized_counts)):
        ax.text(position, proportion + 0.01, str(count), color='black', ha='center')

    plt.xticks(rotation=45)
    # Keep the 0.6 ceiling as a minimum, but grow it when the largest
    # category would otherwise be clipped together with its annotation.
    plt.ylim(0, max(0.6, normalized_counts.max() + 0.05))
    plt.show()

In [None]:
# Share of labels per category across the training split.
plot_category_frequencies(box2d_df)

##### Heatmaps

###### for overall bounding box locations

In [None]:
# Generate heatmaps for bounding box locations
def create_heatmaps(df, img_size=(1024, 1024), category=None):
    """Count, for every pixel, how many bounding boxes cover it.

    Args:
        df: DataFrame with 'category', 'x1', 'y1', 'x2', 'y2' columns.
        img_size: Shape of the heatmap as (rows, columns), i.e. the canvas
            (height, width). Boxes are clipped to this canvas.
        category: When given, only boxes of this category are counted.

    Returns:
        A float array of shape ``img_size`` whose cell [y, x] is the number
        of selected boxes covering that pixel. Rows with a missing
        coordinate are ignored.
    """
    heatmap = np.zeros(img_size)
    height, width = heatmap.shape
    selected_df = df[df['category'] == category] if category else df

    # Truncate corners to whole pixels, as int() would.
    corners = selected_df[['x1', 'y1', 'x2', 'y2']].dropna().to_numpy().astype(int)
    # Clip to the canvas: a negative corner used as a slice index would wrap
    # around to the opposite edge and drop or misplace the box.
    x_bounds = np.clip(corners[:, [0, 2]], 0, width)
    y_bounds = np.clip(corners[:, [1, 3]], 0, height)

    for (x1, x2), (y1, y2) in zip(x_bounds, y_bounds):
        heatmap[y1:y2, x1:x2] += 1
    return heatmap

# Plot heatmap
def plot_heatmap(heatmap, title):
    """Render a 2D array as a viridis heatmap under the given title."""
    plt.figure(figsize=(10, 8))
    axes = sns.heatmap(heatmap, cmap='viridis')
    axes.set_title(title)
    plt.show()

In [None]:
# BDD100K frames are 1280x720. Size the canvas as (rows, columns) so boxes
# right of x=1024 are not cut off by the 1024x1024 default.
overall_heatmap = create_heatmaps(box2d_df, img_size=(720, 1280))
plot_heatmap(overall_heatmap, 'Overall Bounding Box Heatmap')

In [None]:
category = "bus"
# BDD100K frames are 1280x720. Size the canvas as (rows, columns) so boxes
# right of x=1024 are not cut off by the 1024x1024 default.
category_heatmap = create_heatmaps(box2d_df, img_size=(720, 1280), category=category)
plot_heatmap(category_heatmap, f'{category} Heatmap')

In [None]:
category = "traffic light"
# BDD100K frames are 1280x720. Size the canvas as (rows, columns) so boxes
# right of x=1024 are not cut off by the 1024x1024 default.
category_heatmap = create_heatmaps(box2d_df, img_size=(720, 1280), category=category)
plot_heatmap(category_heatmap, f'{category} Heatmap')

In [None]:
category = "pedestrian"
# BDD100K frames are 1280x720. Size the canvas as (rows, columns) so boxes
# right of x=1024 are not cut off by the 1024x1024 default.
category_heatmap = create_heatmaps(box2d_df, img_size=(720, 1280), category=category)
plot_heatmap(category_heatmap, f'{category} Heatmap')

In [None]:
category = "car"
# BDD100K frames are 1280x720. Size the canvas as (rows, columns) so boxes
# right of x=1024 are not cut off by the 1024x1024 default.
category_heatmap = create_heatmaps(box2d_df, img_size=(720, 1280), category=category)
plot_heatmap(category_heatmap, f'{category} Heatmap')

##### Object co-occurrence

In [None]:
def image_to_categories(df):
    """Map each image name to the set of categories labelled in it.

    Reads the 'image_name' and 'category' columns of ``df``.
    """
    categories_by_image = {}
    for image_name, category in zip(df['image_name'], df['category']):
        categories_by_image.setdefault(image_name, set()).add(category)
    return categories_by_image

# Generate co-occurrence matrix
def generate_co_occurrence_matrix(category_dict):
    """Count in how many images each pair of categories appears together.

    Args:
        category_dict: Mapping of image name to the set of categories
            present in that image.

    Returns:
        A symmetric integer DataFrame indexed and labelled by the sorted
        category names. Cell [a, b] is the number of images containing both
        a and b; the diagonal stays 0. An empty mapping gives an empty frame.
    """
    # set().union(...) also accepts zero sets, unlike set.union(*...).
    all_categories = sorted(set().union(*category_dict.values()))
    co_occurrence = pd.DataFrame(0, index=all_categories, columns=all_categories)

    for image_categories in category_dict.values():
        # The nested loop visits every ordered pair once, so a single
        # increment per visit already fills both mirrored cells.
        for cat1 in image_categories:
            for cat2 in image_categories:
                if cat1 != cat2:
                    co_occurrence.at[cat1, cat2] += 1

    return co_occurrence

# Plot the co-occurrence matrix
def plot_co_occurrence_matrix(co_occurrence):
    """Draw the co-occurrence counts as an annotated viridis heatmap."""
    plt.figure(figsize=(12, 10))
    axes = sns.heatmap(co_occurrence, annot=True, fmt="d", cmap='viridis')
    axes.set_title('Object Co-occurrence Matrix')
    axes.set_xlabel('Category')
    axes.set_ylabel('Category')
    plt.show()

In [None]:
# Categories present per image -> pairwise co-occurrence counts -> heatmap.
category_dict = image_to_categories(box2d_df)
co_occurrence_matrix = generate_co_occurrence_matrix(category_dict)
plot_co_occurrence_matrix(co_occurrence_matrix)

#####  Size and Aspect Ratio Analysis

In [None]:
# Function to calculate width, height, area, and aspect ratio of bounding boxes
def calculate_metrics(df):
    """Add box width, height, area and aspect ratio columns to ``df``.

    The columns are written onto the frame passed in, which is also the
    return value. Aspect ratio is width divided by height.
    """
    box_width = df['x2'] - df['x1']
    box_height = df['y2'] - df['y1']

    df['width'] = box_width
    df['height'] = box_height
    df['area'] = box_width * box_height
    df['aspect_ratio'] = box_width / box_height
    return df

# Function to plot area and aspect ratio distributions for each category
def plot_distributions_by_category(df):
    """Show an area histogram and an aspect-ratio histogram per category.

    Expects the 'category', 'area' and 'aspect_ratio' columns on ``df``.
    """
    def _show_histogram(values, color, title, xlabel):
        # One 100-bin histogram with a KDE overlay, shown as its own figure.
        plt.figure(figsize=(12, 6))
        sns.histplot(values, bins=100, kde=True, color=color)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel('Frequency')
        plt.grid(True)
        plt.show()

    for cat in df['category'].unique():
        subset = df[df['category'] == cat]
        _show_histogram(subset['area'], 'blue', f'Area Distribution for {cat}', 'Area')
        _show_histogram(
            subset['aspect_ratio'],
            'green',
            f'Aspect Ratio Distribution for {cat}',
            'Aspect Ratio (Width/Height)',
        )

In [None]:
# calculate_metrics writes its columns onto box2d_df and returns that same
# frame, so size_ration_df is another name for box2d_df, not a copy.
size_ration_df = calculate_metrics(box2d_df)

In [None]:
# Two figures (area, aspect ratio) for every category in the frame.
plot_distributions_by_category(size_ration_df)

##### Common sizes

In [None]:
from collections import Counter
# Expects a DataFrame with 'category', 'x1', 'y1', 'x2' and 'y2' columns (e.g. box2d_df)
def analyze_sizes(df):
    """Print per-category mean and most common bounding-box sizes.

    Adds 'width', 'height' and 'area' columns to ``df`` in place, then prints
    one row per category with the mean area, width and height and the most
    frequent (width, height) pair. Rows with a missing category are left out
    of the summary.

    Args:
        df: DataFrame with 'category', 'x1', 'y1', 'x2', 'y2' columns.
    """
    df['width'] = df['x2'] - df['x1']
    df['height'] = df['y2'] - df['y1']
    df['area'] = df['width'] * df['height']

    by_category = df.groupby('category')
    category_analysis = by_category.agg(
        mean_area=pd.NamedAgg(column='area', aggfunc='mean'),
        mean_width=pd.NamedAgg(column='width', aggfunc='mean'),
        mean_height=pd.NamedAgg(column='height', aggfunc='mean')
    ).reset_index()

    # Walk the same groups used for the means, so a category that groupby
    # drops (NaN) can never produce an empty Counter, and each category is
    # filtered only once.
    most_common_sizes = {
        category: Counter(zip(group['width'], group['height'])).most_common(1)[0][0]
        for category, group in by_category
    }

    category_analysis['most_common_width'] = category_analysis['category'].map(lambda x: most_common_sizes[x][0])
    category_analysis['most_common_height'] = category_analysis['category'].map(lambda x: most_common_sizes[x][1])

    print(category_analysis)

In [None]:
# Prints the per-category size summary; also (re)writes the width, height and
# area columns on box2d_df.
analyze_sizes(box2d_df)

In [None]:
import pandas as pd
import numpy as np
from collections import Counter

def calculate_box_metrics(df):
    """ Add 'width', 'height' and 'area' columns to df (modified in place) and return it """
    box_width = df['x2'] - df['x1']
    box_height = df['y2'] - df['y1']

    df['width'] = box_width
    df['height'] = box_height
    df['area'] = box_width * box_height
    return df

def get_category_analysis(df):
    """ Calculate mean and most common box dimensions for each category.

    Args:
        df: DataFrame with 'category', 'width', 'height' and 'area' columns.

    Returns:
        A DataFrame with one row per category and the columns category,
        mean_area, mean_width, mean_height, most_common_width and
        most_common_height. Rows with a missing category are left out.
    """
    by_category = df.groupby('category')
    category_analysis = by_category.agg(
        mean_area=pd.NamedAgg(column='area', aggfunc='mean'),
        mean_width=pd.NamedAgg(column='width', aggfunc='mean'),
        mean_height=pd.NamedAgg(column='height', aggfunc='mean')
    ).reset_index()

    # Walk the same groups used for the means, so a category that groupby
    # drops (NaN) can never produce an empty Counter, and each category is
    # filtered only once.
    most_common_sizes = {
        category: Counter(zip(group['width'], group['height'])).most_common(1)[0][0]
        for category, group in by_category
    }

    category_analysis['most_common_width'] = category_analysis['category'].map(lambda x: most_common_sizes[x][0])
    category_analysis['most_common_height'] = category_analysis['category'].map(lambda x: most_common_sizes[x][1])
    return category_analysis


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_mean_area_by_category(category_analysis):
    """ Bar chart of mean box area per category, largest mean first """
    plt.figure(figsize=(10, 4))
    descending_categories = category_analysis.sort_values('mean_area', ascending=False)['category']
    sns.barplot(
        data=category_analysis,
        x='category',
        y='mean_area',
        order=descending_categories,
    )
    plt.title('Mean Area of Bounding Boxes by Category')
    plt.xlabel('Category')
    plt.ylabel('Mean Area')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.show()

def plot_distribution_of_common_sizes(df, category_analysis, top_n=1):
    """ Overlay area histograms for each category's top N most common box sizes.

    Reads the 'category', 'width', 'height' and 'area' columns of df.
    The category_analysis argument is accepted but not read here.
    """
    plt.figure(figsize=(12, 8))
    for category in df['category'].unique():
        category_df = df[df['category'] == category]
        size_counts = Counter(zip(category_df['width'], category_df['height']))

        for (common_width, common_height), _occurrences in size_counts.most_common(top_n):
            has_common_size = (
                (category_df['width'] == common_width)
                & (category_df['height'] == common_height)
            )
            common_area_data = category_df.loc[has_common_size, 'area']
            # Nothing to draw when no row matched this size.
            if common_area_data.empty:
                continue
            sns.histplot(
                common_area_data,
                kde=False,
                label=f'{category}: w={common_width}, h={common_height}',
                bins=30,
                stat="density",
                element="step",
                common_norm=False,
            )

    plt.title(f'Distribution of Box Areas for Top {top_n} Most Common Sizes by Category')
    plt.xlabel('Area')
    plt.ylabel('Density')
    plt.legend(title='Category and Size')
    plt.grid(True)
    plt.show()


In [None]:
# calculate_box_metrics writes onto box2d_df and returns that same frame, so
# sizes_df is another name for box2d_df, not a copy.
sizes_df = calculate_box_metrics(box2d_df)
# Per-category means and most common (width, height).
category_analysis = get_category_analysis(sizes_df)

In [None]:
# Mean box area per category, largest first.
plot_mean_area_by_category(category_analysis)

In [None]:
# Call the function under the name it is defined with; the previous
# `plot_distribution_of_most_common_sizes` does not exist.
plot_distribution_of_common_sizes(sizes_df, category_analysis)