In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os

# Read dataset

In [None]:
# Load the segmentation table and store EncodedPixels with pandas' nullable
# 'string' dtype, so blank cells are missing values rather than text.
segmentations = (
    pd.read_csv(os.path.join('data', 'train_ship_segmentations_v2.csv'))
    .astype({'EncodedPixels': 'string'})
)

# Data Description

EncodedPixels - a list of pixels for ship segmentation in a compressed format (in run-length encoding format).
EncodedPixels consists of pairs of values that contain a start position and a run length. E.g. '1 3' implies starting at pixel 1 and running a total of 3 pixels (1,2,3).
The pixels are one-indexed and numbered from top to bottom, then left to right: 1 is pixel (1,1), 2 is pixel (2,1), etc.
A prediction of "no ship in image" has a blank value in the EncodedPixels column.
Object segments do not overlap.

# Feature Extraction

Let's add new features:

1. ImageHeight - image height
2. ImageWidth - image width
3. ShipAreaPercentage - percentage of the ship segmentation area in the image. Measured as a percentage, from 0 to 100%

In [None]:
def get_train_image(name: str):
    """Read one training image from the ``train_v2`` directory.

    Args:
        name: File name of the image inside ``train_v2``.

    Returns:
        The value returned by ``cv2.imread`` for ``train_v2/<name>``.
        Note that ``cv2.imread`` reports an unreadable or missing file by
        returning ``None`` instead of raising, so callers should check the
        result before using it.
    """
    # The f-prefix matters: without it the literal text '{name}' is used as
    # the file name and every call resolves to the same non-existent path.
    path = f'train_v2/{name}'
    return cv2.imread(path)

# Fixed (height, width, channels) used for every row in place of the shape of
# the real image; the image file itself is not opened.
_PLACEHOLDER_IMAGE_SHAPE = (768, 768, 3)


def extract_features_from_image(row: pd.Series) -> pd.Series:
    """Add ``ImageHeight`` and ``ImageWidth`` entries to a table row.

    The dimensions come from a fixed 768x768 placeholder shape rather than
    from the image on disk, so every row receives the same values.
    NOTE(review): to use real dimensions, load the image with
    ``get_train_image(row['ImageId'])`` and unpack its ``.shape`` instead.

    Args:
        row: One row of the segmentation table. It is modified in place.

    Returns:
        The same ``row`` object with the two new entries set.
    """
    # Unpack the constant directly: building a dummy 768x768x3 array for each
    # row just to read its shape would allocate ~13.5 MiB per call.
    row['ImageHeight'], row['ImageWidth'], _ = _PLACEHOLDER_IMAGE_SHAPE
    return row

# Run the extractor over the table row by row (axis=1 passes each row as a Series).
segmentations = segmentations.apply(extract_features_from_image, axis=1)

def pixels_number(encoded_pixels: str) -> int:
    """Count the pixels covered by a run-length-encoded mask.

    The string is split on whitespace and every second token, starting with
    the second one (the run lengths), is summed. A missing value gives 0.
    """
    if not pd.isna(encoded_pixels):
        tokens = encoded_pixels.split()
        run_lengths = tokens[1::2]  # tokens alternate: start, length, start, length, ...
        return np.array(run_lengths, dtype=int).sum()
    return 0

def _ship_area_percentage(row):
    """Return the mask's pixel count as a percentage of the row's image area."""
    image_area = row['ImageHeight'] * row['ImageWidth']
    return pixels_number(row['EncodedPixels']) / image_area * 100


segmentations['ShipAreaPercentage'] = segmentations.apply(_ship_area_percentage, axis=1)

# Exploring the data

In [None]:
# Print the number of rows, then let the cell display the first ten of them.
print(f'There are {segmentations.shape[0]} rows.')
segmentations.iloc[:10]

In [None]:
# Number of distinct ImageId values in the table.
image_ids = segmentations['ImageId']
train_images_number = image_ids.nunique()
print(f'There are {train_images_number} train images.')

# Image resolution

In [None]:
# Show how often each height and each width occurs, one column after the other.
for dimension_column in ('ImageHeight', 'ImageWidth'):
    print(segmentations[dimension_column].value_counts())

# Distribution of the number of ships in images

In [None]:
# Rows whose EncodedPixels value is missing carry no ship mask.
encoded_pixels = segmentations['EncodedPixels']
images_without_ships = encoded_pixels.isna().sum()
print(f'There are {images_without_ships} images without ships.')

In [None]:
# Flag each row: 1 when it carries an encoded ship mask, 0 when EncodedPixels
# is missing. Vectorised instead of a row-by-row apply.
segmentations['ShipCount'] = segmentations['EncodedPixels'].notna().astype('int64')
# Summing the flags per image gives the number of ships in that image.
ships_numbers = segmentations[['ImageId', 'ShipCount']].groupby(['ImageId']).sum()

# How many images have each ship count; computed once and reused below.
ship_count_frequencies = ships_numbers.value_counts()
print(ship_count_frequencies)

f, ax = plt.subplots(1, 2, figsize=(20, 10))

# Ship counts are integers, so use unit-wide bins centred on each value.
# A fixed bins=15 over the range 0..15 would merge the two largest counts
# into a single bar.
max_ship_count = int(ships_numbers['ShipCount'].max())
integer_bins = np.arange(max_ship_count + 2) - 0.5
ships_numbers.hist(bins=integer_bins, ax=ax[0])
ax[0].set_xlabel('Ships per image')
ax[0].set_ylabel('Number of images')

# Legend entries of the form "<ship count> - <share of images> %".
y = ship_count_frequencies.values
percent = 100.*y/y.sum()
labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(ship_count_frequencies.index.get_level_values(0), percent)]
# Wedge labels are suppressed; the legend carries them instead.
ship_count_frequencies.plot.pie(labels=None, ax=ax[1])
ax[1].legend(labels, bbox_to_anchor=(1., 1.), fontsize=14)
ax[1].yaxis.set_visible(False)
ax[1].set_title('Distribution number of ships')

plt.show()

We can see that almost 78% of images don't have any ships. Only 14% have one ship. The remaining 8% of the images contain from 2 to 15 ships.

# Ship segmentation area distribution

In [None]:
# Keep only rows that actually describe a ship. Rows with a missing
# EncodedPixels value have an area of 0 and would otherwise dominate a
# distribution that is meant to describe ship sizes.
ship_area_percentage = segmentations.loc[
    segmentations['EncodedPixels'].notna(), 'ShipAreaPercentage'
]

f, ax = plt.subplots(1, 2, figsize=(30, 10))

# Left panel: histogram of ship areas.
ship_area_percentage.hist(bins=20, ax=ax[0])
ax[0].set_xlabel('Ship Area Percentage')
ax[0].set_ylabel('Number of ships')
ax[0].set_title('Ship area distribution')

# Right panel: box plot of the same values.
ship_area_percentage.plot.box(ax=ax[1])
ax[1].set_ylabel('Ship Area Percentage')
ax[1].set_xlabel('')

plt.show()

All ships are very small in the images. Ships occupy no more than 4% of the image. Most of the ships take up less than 0.1% of the image.

# Analysis result
First, we can notice that all the ship segment labels in the training dataset are bounding boxes (rectangles). Moreover, the ships in the images have a slightly different shape from a rectangular one.
It was found that most of the images (~78%) do not contain ships. Most of the ships take up a very small part of the image (less than 0.5%), so the pictures contain a lot of background. Also, target segmentation for labeled pictures consists of bounding boxes (rectangles).
