# Preparation

### set directory path

In [None]:
import os

In [None]:
# Input data directories, relative to the notebook's working directory.
# Keep the trailing slash: file paths are built from these by plain string
# concatenation (directory + image id + '.png').
train_img_dir = '../input/train/images/'
train_mask_dir = '../input/train/masks/'
test_img_dir = '../input/test/images/'

### get train_image name list

In [None]:
# Image ids are the PNG file names without their extension.
# - only '.png' entries are kept, so stray files in the directory cannot break
#   the later imread(<dir> + id + '.png') calls;
# - os.path.splitext strips just the extension, so ids containing dots survive;
# - sorting makes the id <-> index mapping reproducible, because os.listdir
#   returns entries in arbitrary order.
train_img_names = sorted(
    os.path.splitext(fname)[0]
    for fname in os.listdir(train_img_dir)
    if fname.endswith('.png')
)

In [None]:
train_img_names[:5],len(train_img_names)

### create dicts mapping filename to index and index to filename from the train_image name list

In [None]:
# Two lookup tables over the training ids:
#   position in train_img_names -> id, and id -> position.
train_img_dict_i_to_names = dict(enumerate(train_img_names))
train_img_dict_names_to_i = {
    img_name: position for position, img_name in enumerate(train_img_names)
}

### load train images and masks

In [None]:
# skimage.io.imread is the supported image reader; the skimage.data.imread
# alias was deprecated and is no longer available in current scikit-image.
from skimage.io import imread

In [None]:
train_img_shape = imread(train_img_dir + train_img_names[0]+'.png').shape
train_mask_shape = imread(train_mask_dir + train_img_names[0]+'.png').shape

In [None]:
import numpy as np

In [None]:
# Pre-allocate zero-filled float buffers: one 3-D slot per training image and
# one 2-D slot per mask, sized from the first image/mask read above.
n_train = len(train_img_names)
img_dim0, img_dim1, img_dim2 = train_img_shape[0], train_img_shape[1], train_img_shape[2]
mask_dim0, mask_dim1 = train_mask_shape[0], train_mask_shape[1]
train_img = np.zeros((n_train, img_dim0, img_dim1, img_dim2))
train_mask = np.zeros((n_train, mask_dim0, mask_dim1))

In [None]:
# Read every training image and its mask into the pre-allocated arrays.
# Each slot is written exactly once, directly with the decoded pixel data.
for i, img_name in enumerate(train_img_names):
    train_img[i, :, :, :] = imread(train_img_dir + img_name + '.png')
    train_mask[i, :, :] = imread(train_mask_dir + img_name + '.png')

In [None]:
train_img.shape,train_mask.shape

In [None]:
train_img[50,:,:,0],train_mask[50,:,:]

### change color image to monochrome image

In [None]:
train_img_mono = np.zeros((len(train_img_names), train_img_shape[0], train_img_shape[1]))

In [None]:
train_img_mono = train_img[:,:,:,0]

In [None]:
train_img_mono.shape

### change 16bit to 8bit for train_mask

In [None]:
# Allocate with the full shape of train_mask so that every axis (including the
# last one) matches, which also holds for non-square masks.
train_mask_8bit = np.zeros(train_mask.shape)

In [None]:
# Rescale 16-bit mask values (0..65535) to the 8-bit range (0..255).
# 65535 / 257 == 255 exactly, so dividing by 257 is a linear mapping of the
# whole range: 0 stays 0, 65535 becomes 255, and no clamping is needed.
# The division is applied to all masks at once instead of image by image.
train_mask_8bit = train_mask / 257.0

In [None]:
train_mask_8bit[50,:,:]

### load train.csv

In [None]:
import pandas as pd

In [None]:
train_dir = '../input/'

In [None]:
train = pd.read_csv(train_dir + 'train.csv')

In [None]:
train.head(3)

In [None]:
train.shape

### load depths.csv

In [None]:
depths = pd.read_csv(train_dir + 'depths.csv')

In [None]:
depths.head(3)

In [None]:
depths.shape

### merge

In [None]:
train = pd.merge(train, depths, on='id',how='left')

In [None]:
train.head(3)

In [None]:
train.shape

### calculate area

create rle_decode (rle to mask) function
- input: rle_list  [start, length, start, length, ...]
- input: SHAPE (101,101)

In [None]:
def rle_to_mask(rle_list, SHAPE):
    """Decode a run-length encoding into a 2-D mask.

    Parameters
    ----------
    rle_list : sequence
        Flat sequence ``[start, length, start, length, ...]``; the items may
        be strings or numbers. Starts are 1-based positions in the
        column-major flattening of the mask. A sequence with fewer than two
        items (for example ``['nan']`` produced from a missing value) encodes
        an empty mask.
    SHAPE : tuple
        ``(rows, columns)`` of the mask to build.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``SHAPE`` holding 255 inside the runs and 0
        everywhere else.
    """
    flat = np.zeros(SHAPE[0] * SHAPE[1])
    if len(rle_list) >= 2:
        for start, length in zip(rle_list[::2], rle_list[1::2]):
            begin = int(start) - 1  # RLE start positions are 1-based
            flat[begin:begin + int(length)] = 255
    # Column-major (Fortran-order) reshape: equivalent to reshape(SHAPE).T for
    # square masks, and yields exactly SHAPE for non-square ones as well.
    return flat.reshape(SHAPE, order='F')

create function of area calculation per image
- input: rle_str    'start length start length ...'
- use rle_decode (rle to mask) function

In [None]:
def calc_area_for_rle(rle_str, shape=(101, 101)):
    """Return the mask area (number of mask pixels) encoded by an RLE string.

    Parameters
    ----------
    rle_str : str or any
        Space-separated ``'start length start length ...'`` string. The value
        is passed through ``str()`` first, so a missing value (NaN) becomes
        the single token ``'nan'``, which ``rle_to_mask`` decodes as an empty
        mask.
    shape : tuple, optional
        ``(rows, columns)`` of the decoded mask; defaults to ``(101, 101)``.

    Returns
    -------
    float
        Count of mask pixels.
    """
    mask = rle_to_mask(str(rle_str).split(), shape)
    # rle_to_mask marks mask pixels with 255, so sum / 255 is the pixel count.
    return mask.sum() / 255.0

calculate area

In [None]:
train['area'] = train['rle_mask'].apply(calc_area_for_rle)

In [None]:
train.head(3)

### calculate color brightness mean and std

create function of brightness mean per image
- input:  file name of image

In [None]:
def calc_mean_img(name):
    """Return the mean pixel value of the single-channel training image ``name``.

    ``name`` is an image id; it is translated to an array position through
    ``train_img_dict_names_to_i`` and looked up in ``train_img_mono``.
    """
    position = train_img_dict_names_to_i[name]
    return train_img_mono[position].mean()

calculate brightness mean

In [None]:
train['mean'] = train['id'].apply(calc_mean_img)

In [None]:
train.head(3)

create function of brightness std per image
- input:  file name of image

In [None]:
def calc_std_img(name):
    """Return the standard deviation of the pixel values of training image ``name``.

    ``name`` is an image id; it is translated to an array position through
    ``train_img_dict_names_to_i`` and looked up in ``train_img_mono``.
    """
    position = train_img_dict_names_to_i[name]
    return train_img_mono[position].std()

In [None]:
train['std'] = train['id'].apply(calc_std_img)

In [None]:
train.head(3)

### split train to is-salt or no-salt

In [None]:
train_issalt = train[train['rle_mask'].notnull()]

In [None]:
train_nosalt = train[train['rle_mask'].isnull()]

In [None]:
train.shape,train_issalt.shape,train_nosalt.shape

# EDA! EDA!

### calculate no-salt ratio

In [None]:
train_nosalt.shape[0]/train.shape[0]

comment
- no-salt ratio is large

### visualize area histogram

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Mask-area histograms, one panel per subset (all / with salt / without salt).
fig, axes = plt.subplots(1, 3, figsize=(7*3,5))
panels = [('train', train), ('train_issalt', train_issalt), ('train_nosalt', train_nosalt)]
for ax, (panel_title, frame) in zip(axes, panels):
    ax.hist(frame['area'], bins=20)
    ax.set_title(panel_title)
    ax.set_xlabel('area')
    ax.set_ylabel('frequency')

comment
- Can you predict a small area well?
- How do you predict images that are almost entirely salt?

### check small area images

get image list of small(<1%) area

In [None]:
small_area_image_list = train_issalt[train_issalt['area'] < 101*101*0.01]['id'].tolist()

In [None]:
small_area_image_list[:5],len(small_area_image_list)

display image and mask ( 5 examples of 171 images)

In [None]:
# Show each selected image (left column) next to its mask (right column).
image_list = small_area_image_list[:5]
# squeeze=False keeps `axes` two-dimensional even when only one image is
# listed, so every row can be unpacked into an (image, mask) pair of axes.
fig, axes = plt.subplots(len(image_list), 2, figsize=(5,5*len(image_list)), squeeze=False)
fig.subplots_adjust(left=0.075,right=0.95,bottom=0.05,top=0.52,wspace=0.2,hspace=0.10)
for (img_ax, mask_ax), image_id in zip(axes, image_list):
    img_ax.imshow(imread(train_img_dir + image_id +'.png'))
    mask_ax.imshow(imread(train_mask_dir + image_id +'.png'))

comment
- 🤔

### check images that are almost entirely salt

get image list of large(>99%) area

In [None]:
large_area_image_list = train_issalt[train_issalt['area'] > 101*101*0.99]['id'].tolist()

In [None]:
large_area_image_list[:5],len(large_area_image_list)

display image and mask ( 5 examples of 56 images)

In [None]:
# Show each selected image (left column) next to its mask (right column).
image_list = large_area_image_list[:5]
# squeeze=False keeps `axes` two-dimensional even when only one image is
# listed, so every row can be unpacked into an (image, mask) pair of axes.
fig, axes = plt.subplots(len(image_list), 2, figsize=(5,5*len(image_list)), squeeze=False)
fig.subplots_adjust(left=0.075,right=0.95,bottom=0.05,top=0.52,wspace=0.2,hspace=0.10)
for (img_ax, mask_ax), image_id in zip(axes, image_list):
    img_ax.imshow(imread(train_img_dir + image_id +'.png'))
    mask_ax.imshow(imread(train_mask_dir + image_id +'.png'))

comment
- 🤔🤔

### visualize depth histogram

In [None]:
# Depth histograms, one panel per subset, on a shared y-range for comparison.
fig, axes = plt.subplots(1, 3, figsize=(7*3,5))
panels = [('train', train), ('train_issalt', train_issalt), ('train_nosalt', train_nosalt)]
for ax, (panel_title, frame) in zip(axes, panels):
    ax.hist(frame['z'], bins=20)
    ax.set_title(panel_title)
    ax.set_xlabel('depth')
    ax.set_ylabel('frequency')
    ax.set_ylim(0,350)

comment
- the depth distribution of is-salt is normal.
- the depth distribution of no-salt is uniform.

### visualize color brightness mean histogram

In [None]:
# Brightness-mean histograms, one panel per subset, on a shared y-range.
fig, axes = plt.subplots(1, 3, figsize=(7*3,5))
panels = [('train', train), ('train_issalt', train_issalt), ('train_nosalt', train_nosalt)]
for ax, (panel_title, frame) in zip(axes, panels):
    ax.hist(frame['mean'], bins=20)
    ax.set_title(panel_title)
    ax.set_xlabel('brightness mean')
    ax.set_ylabel('frequency')
    ax.set_ylim(0,1000)

comment
- the brightness mean distribution of no-salt is very skewed.

### visualize color brightness std histogram

In [None]:
# Brightness-std histograms, one panel per subset, on a shared y-range.
fig, axes = plt.subplots(1, 3, figsize=(7*3,5))
panels = [('train', train), ('train_issalt', train_issalt), ('train_nosalt', train_nosalt)]
for ax, (panel_title, frame) in zip(axes, panels):
    ax.hist(frame['std'], bins=20)
    ax.set_title(panel_title)
    ax.set_xlabel('brightness std')
    ax.set_ylabel('frequency')
    ax.set_ylim(0,500)

comment
- no-salt image is more complicated than is-salt image.

### visualize depth vs area

In [None]:
# Scatter of depth against mask area, one panel per subset.
xlabel = 'depth'
ylabel = 'area'
x = 'z'
y = 'area'
fig, axes = plt.subplots(1, 3, figsize=(7*3,5))
panels = [('train', train), ('train_issalt', train_issalt), ('train_nosalt', train_nosalt)]
for ax, (panel_title, frame) in zip(axes, panels):
    ax.scatter(frame[x], frame[y])
    ax.set_title(panel_title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

### visualize depth vs brightness mean

In [None]:
# Scatter of depth against brightness mean, one panel per subset, shared y-range.
xlabel = 'depth'
ylabel = 'brightness mean'
x = 'z'
y = 'mean'
fig, axes = plt.subplots(1, 3, figsize=(7*3,5))
panels = [('train', train), ('train_issalt', train_issalt), ('train_nosalt', train_nosalt)]
for ax, (panel_title, frame) in zip(axes, panels):
    ax.scatter(frame[x], frame[y])
    ax.set_title(panel_title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_ylim(0,260)

### visualize depth vs brightness std

In [None]:
# Scatter of depth against brightness std, one panel per subset, shared y-range.
xlabel = 'depth'
ylabel = 'brightness std'
x = 'z'
y = 'std'
fig, axes = plt.subplots(1, 3, figsize=(7*3,5))
panels = [('train', train), ('train_issalt', train_issalt), ('train_nosalt', train_nosalt)]
for ax, (panel_title, frame) in zip(axes, panels):
    ax.scatter(frame[x], frame[y])
    ax.set_title(panel_title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_ylim(0,80)

### visualize area vs brightness mean

In [None]:
# Scatter of mask area against brightness mean, one panel per subset, shared y-range.
xlabel = 'area'
ylabel = 'brightness mean'
x = 'area'
y = 'mean'
fig, axes = plt.subplots(1, 3, figsize=(7*3,5))
panels = [('train', train), ('train_issalt', train_issalt), ('train_nosalt', train_nosalt)]
for ax, (panel_title, frame) in zip(axes, panels):
    ax.scatter(frame[x], frame[y])
    ax.set_title(panel_title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_ylim(0,300)

### visualize area vs brightness std

In [None]:
# Scatter of mask area against brightness std, one panel per subset, shared y-range.
xlabel = 'area'
ylabel = 'brightness std'
x = 'area'
y = 'std'
fig, axes = plt.subplots(1, 3, figsize=(7*3,5))
panels = [('train', train), ('train_issalt', train_issalt), ('train_nosalt', train_nosalt)]
for ax, (panel_title, frame) in zip(axes, panels):
    ax.scatter(frame[x], frame[y])
    ax.set_title(panel_title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_ylim(0,80)

### visualize brightness mean vs brightness std

In [None]:
# Scatter of brightness mean against brightness std, one panel per subset, shared y-range.
xlabel = 'brightness mean'
ylabel = 'brightness std'
x = 'mean'
y = 'std'
fig, axes = plt.subplots(1, 3, figsize=(7*3,5))
panels = [('train', train), ('train_issalt', train_issalt), ('train_nosalt', train_nosalt)]
for ax, (panel_title, frame) in zip(axes, panels):
    ax.scatter(frame[x], frame[y])
    ax.set_title(panel_title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_ylim(0,80)

### find suspicious images

create function

In [None]:
def suspicious_img_c(ids):
    """Flag a mask whose sums along axis 1 are all equal although it is neither empty nor full.

    Parameters
    ----------
    ids : str
        Image id; the mask is read from ``train_mask_dir + ids + '.png'``.

    Returns
    -------
    int
        1 when every ``mask.sum(axis=1)`` entry has the same value and the
        mask is neither all zeros nor 65535 in every pixel; 0 otherwise.
    """
    mask = imread(train_mask_dir + ids +'.png')
    if len(np.unique(mask.sum(axis=1))) != 1:
        return 0
    total = mask.sum()
    # Completely empty and completely filled masks trivially have equal sums,
    # so they are not reported. mask.size replaces a hard-coded pixel count.
    if total == 0 or total == mask.size * 65535:
        return 0
    return 1

In [None]:
def suspicious_img_r(ids):
    """Flag a mask whose sums along axis 0 are all equal although it is neither empty nor full.

    Parameters
    ----------
    ids : str
        Image id; the mask is read from ``train_mask_dir + ids + '.png'``.

    Returns
    -------
    int
        1 when every ``mask.sum(axis=0)`` entry has the same value and the
        mask is neither all zeros nor 65535 in every pixel; 0 otherwise.
    """
    mask = imread(train_mask_dir + ids +'.png')
    if len(np.unique(mask.sum(axis=0))) != 1:
        return 0
    total = mask.sum()
    # Completely empty and completely filled masks trivially have equal sums,
    # so they are not reported. mask.size replaces a hard-coded pixel count.
    if total == 0 or total == mask.size * 65535:
        return 0
    return 1

In [None]:
train['suspicious_c'] = train['id'].map(suspicious_img_c)
train['suspicious_r'] = train['id'].map(suspicious_img_r)

In [None]:
train_suspicious_c = train[train['suspicious_c']==1]
train_suspicious_r = train[train['suspicious_r']==1]

In [None]:
train_suspicious_c.shape[0], train_suspicious_r.shape[0]

In [None]:
train_suspicious_list = train_suspicious_c['id'].tolist()

In [None]:
# Show each suspicious image (left column) next to its mask (right column).
image_list = train_suspicious_list[:30]
# squeeze=False keeps `axes` two-dimensional even when only one image is
# listed, so every row can be unpacked into an (image, mask) pair of axes.
fig, axes = plt.subplots(len(image_list), 2, figsize=(5,5*len(image_list)), squeeze=False)
fig.subplots_adjust(left=0.075,right=0.95,bottom=0.05,top=0.52,wspace=0.2,hspace=0.10)
for (img_ax, mask_ax), image_id in zip(axes, image_list):
    img_ax.imshow(imread(train_img_dir + image_id +'.png'))
    mask_ax.imshow(imread(train_mask_dir + image_id +'.png'))

comment
- 🤔🤔🤔🤔🤔🤔