# Severstal Steel Defect EDA

In [None]:
# Directory the training images are read from; image ids are joined onto it.
TRAIN_IMAGE_DIR = './severstal-steel-defect-detection/train_images/'
# CSV file with the run-length-encoded defect annotations for the training set.
TRAIN_ANNOTATIONS_FILE = './severstal-steel-defect-detection/train.csv'

In [None]:
from collections import defaultdict

import numpy as np

def rle_to_dense(rle, img_height, img_width):
    """Decode a run-length-encoded mask into a dense 2-D array.

    The encoding is a whitespace-separated sequence of ``start length``
    pairs. ``start`` is a 1-based index into the flattened image, and the
    flattening is column-major (the row index changes fastest).

    Args:
        rle: Run-length string such as ``'1 3 10 2'``. ``None``, an empty
            string, or a whitespace-only string all mean "no mask".
        img_height: Number of rows of the output array.
        img_width: Number of columns of the output array.

    Returns:
        A float array of shape ``(img_height, img_width)`` that is 1 inside
        the encoded runs and 0 everywhere else.

    Raises:
        ValueError: If the string holds an odd number of tokens or a token
            that is not an integer.
    """
    # str.split() with no argument tolerates repeated/leading/trailing
    # whitespace and yields [] for a blank string.
    tokens = rle.split() if rle is not None else []
    if not tokens:
        return np.zeros((img_height, img_width))
    if len(tokens) % 2 != 0:
        raise ValueError(
            f'RLE string must contain start/length pairs, got {len(tokens)} tokens'
        )

    values = [int(token) for token in tokens]

    dense_1d_array = np.zeros(img_height * img_width)
    for rle_start, rle_run in zip(values[::2], values[1::2]):
        # Subtract 1 from indices because pixel indices start at 1 rather than 0
        dense_1d_array[rle_start - 1:rle_start + rle_run - 1] = 1

    # Use Fortran ordering, meaning that the first index changes fastest,
    # to undo the column-major flattening used by the encoding.
    return np.reshape(dense_1d_array, (img_height, img_width), order='F')


def load_annotations(ann_file_name):
    """Read the annotation CSV into a nested mapping.

    Every data row is expected to have the form
    ``<image id>_<class id>,<run-length string>``. The header row (first
    field ``ImageId_ClassId``) and blank lines are skipped.

    Args:
        ann_file_name: Path of the CSV file to read.

    Returns:
        A ``defaultdict(dict)`` mapping image id -> {class id (as a string)
        -> run-length string}. The run-length string is ``''`` when the row
        has nothing after the comma.

    Raises:
        ValueError: If the first field of a data row contains no underscore.
    """
    anns = defaultdict(dict)

    with open(ann_file_name) as f:
        for line in f:
            line = line.strip()
            if not line:  # Tolerate blank lines, e.g. a trailing one at EOF
                continue

            # Split on the first comma only so the unpacking can never fail.
            file_name, _, rle_ann = line.partition(',')
            if file_name == 'ImageId_ClassId':  # Skip header
                continue

            # The class id is whatever follows the LAST underscore, so image
            # names that themselves contain underscores are still handled.
            img_id, sep, cls_id = file_name.rpartition('_')
            if not sep:
                raise ValueError(
                    f'Expected "<image id>_<class id>", got {file_name!r}'
                )
            anns[img_id][cls_id] = rle_ann.strip()
    return anns


In [None]:
# Parse the training CSV into an {image id: {class id: run-length string}} mapping.
anns = load_annotations(TRAIN_ANNOTATIONS_FILE)

In [None]:
# Peek at the per-class annotations of the second image in the mapping.
list(anns.values())[1]

## Visualize Examples

In [None]:
import os

from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline

def visualize_img_with_masks(img_path, ann_dict):
    """Plot an image, then one copy per defect class with its mask overlaid.

    The figure is a single column: the first row shows the raw image and
    each following row shows the image with the mask of one class (1-4)
    drawn semi-transparently in that class's colour map.

    Args:
        img_path: Path of the image file to open.
        ann_dict: Mapping of class id (``'1'``..``'4'``) to run-length
            string. A class that is absent from the mapping is drawn with an
            empty mask instead of raising ``KeyError``.
    """
    # One colour map per class; the number of classes follows from this list.
    cmaps = ['Reds', 'Blues', 'Greens', 'Purples']
    num_classes = len(cmaps)

    # Use the image as a context manager so the file handle is released as
    # soon as the pixel data has been copied into the array.
    with Image.open(img_path) as pil_img:
        img = np.array(pil_img)

    _, axs = plt.subplots(num_classes + 1, 1, figsize=(15, 18))
    axs[0].imshow(img)

    for cls_id, cmap in enumerate(cmaps, start=1):
        mask = rle_to_dense(
            ann_dict.get(str(cls_id), ''),
            img_height=img.shape[0],
            img_width=img.shape[1],
        )
        axs[cls_id].imshow(img)
        axs[cls_id].imshow(mask, alpha=0.4, cmap=cmap)
    plt.show()

def visualize_imgs(anns, count=5):
    """Display the first ``count`` images that have at least one defect mask.

    Images are visited in the iteration order of ``anns``; images whose four
    class entries are all empty (or missing) are skipped.

    Args:
        anns: Mapping of image id -> {class id -> run-length string}.
        count: Maximum number of images to display. Defaults to 5.
    """
    class_ids = ('1', '2', '3', '4')
    for img_id, ann in anns.items():
        if count <= 0:
            return
        # .get() keeps images with a missing class entry from raising
        # KeyError; an absent or empty entry both mean "no defect".
        if any(ann.get(cls_id) for cls_id in class_ids):
            print(f'**********{img_id}**********')
            visualize_img_with_masks(os.path.join(TRAIN_IMAGE_DIR, img_id), ann)
            count -= 1


# Show up to five training images that carry at least one defect mask.
visualize_imgs(anns)


## Training Set Class Breakdown

In [None]:
# Tally how many training images fall into each labelling category:
# no defect at all, exactly one class, or several classes at once.
counts = dict.fromkeys(
    [
        'no_labels',
        '1_only',
        '2_only',
        '3_only',
        '4_only',
        'multiple_labels_2',
        'multiple_labels_3',
        'multiple_labels_4',
    ],
    0,
)

for ann in anns.values():
    # Class ids whose run-length string is non-empty, i.e. defects present.
    labels = [cls_id for cls_id, rle in ann.items() if rle != '']

    if not labels:
        bucket = 'no_labels'
    elif len(labels) == 1:
        bucket = labels[0] + '_only'
    else:
        bucket = 'multiple_labels_' + str(len(labels))
    counts[bucket] += 1

for k, v in counts.items():
    print(f'{k}: {v}')