In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Root of the dataset tree, given relative to the notebook's working directory.
data_path = os.path.join("..", "data")

# Each dataset lives in its own sub-directory under the data root.
skimmia_path, visem_path = (
    os.path.join(data_path, dataset_dir) for dataset_dir in ("skimmia", "visem")
)

In [None]:
# Image directories for the train / valid / test split of each dataset.
skimmia_images_train_dir, skimmia_images_valid_dir, skimmia_images_test_dir = (
    os.path.join(skimmia_path, split_subdir)
    for split_subdir in ("images/train", "images/valid", "images/test")
)
visem_images_train_dir, visem_images_valid_dir, visem_images_test_dir = (
    os.path.join(visem_path, split_subdir)
    for split_subdir in ("images/train", "images/valid", "images/test")
)

# Report the number of directory entries per split.
# Note: os.listdir counts every entry in the directory, not only image files.
for label, images_dir in (
    ("Skimmia images - train:", skimmia_images_train_dir),
    ("Skimmia images - valid:", skimmia_images_valid_dir),
    ("Skimmia images - test:", skimmia_images_test_dir),
    ("Visem images - train:", visem_images_train_dir),
    ("Visem images - valid:", visem_images_valid_dir),
    ("Visem images - test:", visem_images_test_dir),
):
    print(label, len(os.listdir(images_dir)))

In [None]:
# Locations of the per-split annotation CSV files.
(
    skimmia_annotations_path_train,
    skimmia_annotations_path_valid,
    skimmia_annotations_path_test,
) = (
    os.path.join(skimmia_path, csv_relpath)
    for csv_relpath in (
        "annotations/annotations_train.csv",
        "annotations/annotations_valid.csv",
        "annotations/annotations_test.csv",
    )
)
(
    visem_annotations_path_train,
    visem_annotations_path_valid,
    visem_annotations_path_test,
) = (
    os.path.join(visem_path, csv_relpath)
    for csv_relpath in (
        "annotations/annotations_train.csv",
        "annotations/annotations_valid.csv",
        "annotations/annotations_test.csv",
    )
)

# Load every split into its own DataFrame (Skimmia first, then Visem).
skimmia_df_train, skimmia_df_valid, skimmia_df_test = map(
    pd.read_csv,
    (
        skimmia_annotations_path_train,
        skimmia_annotations_path_valid,
        skimmia_annotations_path_test,
    ),
)
visem_df_train, visem_df_valid, visem_df_test = map(
    pd.read_csv,
    (
        visem_annotations_path_train,
        visem_annotations_path_valid,
        visem_annotations_path_test,
    ),
)

# Print the (rows, columns) shape of each loaded frame.
for label, annotations_df in (
    ("Skimmia annotations train shape:", skimmia_df_train),
    ("Skimmia annotations valid shape:", skimmia_df_valid),
    ("Skimmia annotations test shape:", skimmia_df_test),
    ("Visem annotations train shape:", visem_df_train),
    ("Visem annotations valid shape:", visem_df_valid),
    ("Visem annotations test shape:", visem_df_test),
):
    print(label, annotations_df.shape)

In [None]:
# Preview the first five rows of the Skimmia training annotations.
skimmia_df_train.head(n=5)

In [None]:
# Total number of annotation rows per dataset, summed over train / valid / test.
print(
    "Skimmia total samples:",
    sum(len(split_df) for split_df in (skimmia_df_train, skimmia_df_valid, skimmia_df_test)),
)
print(
    "Visem total samples:",
    sum(len(split_df) for split_df in (visem_df_train, visem_df_valid, visem_df_test)),
)

In [None]:
# Per-class min / mean / max of bounding-box width and height,
# shown separately for the train, valid and test splits (values truncated to int).
for split_df in (skimmia_df_train, skimmia_df_valid, skimmia_df_test):
    bbox_stats = split_df.groupby('class_id')[['width', 'height']].agg(['min', 'mean', 'max'])
    display(bbox_stats.astype(int))

In [None]:
# Visem class ids:
#   0 - sperm
#   1 - cluster
#   2 - small or pinhead
#
# Per-class min / mean / max of bounding-box width and height,
# shown separately for the train, valid and test splits (values truncated to int).
for split_df in (visem_df_train, visem_df_valid, visem_df_test):
    bbox_stats = split_df.groupby('class_id')[['width', 'height']].agg(['min', 'mean', 'max'])
    display(bbox_stats.astype(int))

In [None]:
# Stack the three splits of each dataset into a single frame with a fresh 0..n-1 index.
skimmia_df = pd.concat(
    [
        skimmia_df_train,
        skimmia_df_valid,
        skimmia_df_test,
    ],
    axis=0,
    ignore_index=True,
)
visem_df = pd.concat(
    [
        visem_df_train,
        visem_df_valid,
        visem_df_test,
    ],
    axis=0,
    ignore_index=True,
)

In [None]:
# Directory (relative to the notebook) where generated figures are written.
plots_dir_path = os.path.join(os.pardir, "plots")
# Echo the path as the cell output.
plots_dir_path

In [None]:
# Side-by-side histograms of bounding-box width (left) and height (right)
# over all Skimmia splits combined.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

skimmia_df.hist(column='width', bins=20, rwidth=0.9, color='skyblue', edgecolor='black', grid=False, ax=ax1)
skimmia_df.hist(column='height', bins=20, rwidth=0.9, color='skyblue', edgecolor='black', grid=False, ax=ax2)

# Panel titles (Polish: "Width distribution" / "Height distribution").
ax1.set_title('Rozkład szerokości', fontsize=10)
ax2.set_title('Rozkład wysokości', fontsize=10)

fig.suptitle('Skimmia', fontsize=12)

# savefig does not create missing directories, so make sure the target exists;
# otherwise a fresh checkout without the plots folder fails with FileNotFoundError.
os.makedirs(plots_dir_path, exist_ok=True)
path = os.path.join(plots_dir_path, 'skimmia_bboxs_distribution.png')
fig.savefig(path)

plt.show()

In [None]:
# Human-readable names of the Visem class ids.
visem_labels = {
    0: "sperm",
    1: "cluster",
    2: "small or pinhead",
}

# savefig does not create missing directories, so make sure the target exists
# once, before any figure is written.
os.makedirs(plots_dir_path, exist_ok=True)

# One figure per class: bounding-box width (left) and height (right) histograms.
for class_id, label in visem_labels.items():
    subset = visem_df[visem_df['class_id'] == class_id]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    subset.hist(column='width', bins=15, rwidth=0.9, color='skyblue', edgecolor='black', grid=False, ax=ax1)
    subset.hist(column='height', bins=15, rwidth=0.9, color='skyblue', edgecolor='black', grid=False, ax=ax2)

    # Panel titles (Polish: "Width distribution" / "Height distribution").
    ax1.set_title('Rozkład szerokości', fontsize=10)
    ax2.set_title('Rozkład wysokości', fontsize=10)

    fig.suptitle(f'Visem ({label})', fontsize=12)

    # The file name embeds the class label verbatim (including spaces).
    path = os.path.join(plots_dir_path, f'visem_bboxs_distribution_{label}.png')
    fig.savefig(path)

    plt.show()
    # Release the figure so figures do not accumulate across loop iterations.
    plt.close(fig)