In [None]:
# https://www.kaggle.com/pednoi/training-mask-r-cnn-to-be-a-fashionista-lb-0-07
# https://www.kaggle.com/go1dfish/updated4-29-fgvc6-simple-eda
import numpy as np # linear algebra
import pandas as pd
pd.set_option("display.max_rows", 101)
import os
print(os.listdir("../input/imaterialist-fashion-2019-FGVC6"))
import cv2
import json
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["font.size"] = 15
import seaborn as sns
from collections import Counter
from PIL import Image
import math
import seaborn as sns
# ../input/imaterialist-fashion-2019-FGVC6
# label_descriptions.json  sample_submission.csv	test  train  train.csv

In [None]:
# Root directory of the competition data. The trailing slash lets file and
# folder names be appended with plain string concatenation.
input_dir = "../input/imaterialist-fashion-2019-FGVC6/"
def classid2label(class_id):
    """Split an underscore-separated class id into category and attributes.

    The first token is the category id; every remaining token is an
    attribute id.

    Args:
        class_id: string such as "6_28_77" or "6".

    Returns:
        Tuple ``(category, attributes)`` where ``category`` is a string and
        ``attributes`` is a (possibly empty) list of strings.
    """
    tokens = class_id.split("_")
    return tokens[0], tokens[1:]

def print_dict(dictionary, name_dict):
    """Print a ranked frequency table for a mapping of id -> count.

    Rows are sorted by descending count. Each row shows the rank, the id, the
    name looked up in ``name_dict``, the count and the count's share of the
    total.

    Args:
        dictionary: mapping of id -> numeric count.
        name_dict: mapping of id -> display name; must contain every id.
    """
    header_cells = (
        "rank".ljust(5),
        "id".center(8),
        "name".center(40),
        "amount".rjust(10),
        "ratio(%)".rjust(10),
    )
    print("{}{}{}{}{}".format(*header_cells))

    total = sum(dictionary.values())
    # Negated count as the sort key gives descending order and keeps ties in insertion order.
    ranked = sorted(dictionary.items(), key=lambda item: -item[1])
    for rank, (label_id, amount) in enumerate(ranked, start=1):
        row = "{:<5}{:^8}{:^40}{:>10}{:>10.3%}".format(
            rank, label_id, name_dict[label_id], amount, amount/total)
        print(row)

In [None]:
def print_img_with_labels(img_name, labels, category_name_dict, attribute_name_dict, ax):
    """Draw a training image on ``ax`` and list its label names to the right.

    ``labels[0]`` is looked up in ``category_name_dict`` and printed under a
    "category" heading; every following entry is looked up in
    ``attribute_name_dict`` and printed under an "attribute" heading.

    Args:
        img_name: file name of the image inside ``input_dir + "train/"``.
        labels: sequence of label ids (keys of the two name dicts).
        category_name_dict: mapping of category id -> name.
        attribute_name_dict: mapping of attribute id -> name.
        ax: matplotlib Axes to draw on.

    Raises:
        KeyError: if a label id is missing from the dict it is looked up in.
    """
    # The context manager releases the file handle once the pixels are copied.
    with Image.open(input_dir + "train/" + img_name) as image_file:
        img = np.asarray(image_file)
    ax.imshow(img)

    if len(labels) == 0:
        # Nothing to annotate; also avoids dividing by zero below.
        return

    img_height, img_width = img.shape[0], img.shape[1]
    # Vertical distance between two consecutive text lines.
    label_interval = (img_height * 0.9) / len(labels)
    # Text is placed just outside the right edge of the image.
    x_pos = img_width * 1.1
    for num, attribute_id in enumerate(labels):
        y_pos = label_interval * (num + 2) + (img_height * 0.1)
        if num == 0:
            # The first entry gets the "category" heading two lines above
            # and its name one line above the current slot.
            ax.text(x_pos, y_pos-label_interval*2, "category", fontsize=12)
            ax.text(x_pos, y_pos-label_interval, category_name_dict[attribute_id], fontsize=12)
            if len(labels) > 1:
                ax.text(x_pos, y_pos, "attribute", fontsize=12)
        else:
            ax.text(x_pos, y_pos, attribute_name_dict[attribute_id], fontsize=12)

In [None]:
def print_img(img_name, ax):
    """Show one training image on ``ax`` together with its distinct labels.

    Reads the module-level ``train_df``, ``category_name_dict`` and
    ``attribute_name_dict``, so those must exist before this is called.

    Args:
        img_name: value of the ``ImageId`` column identifying the image.
        ax: matplotlib Axes to draw on.
    """
    img_df = train_df[train_df.ImageId == img_name]
    # unique() keeps first-appearance order. A set of strings is ordered
    # differently from one kernel session to the next, which made the entry
    # shown as "category" change between runs.
    labels = list(img_df["ClassId"].unique())
    # NOTE(review): the entries are raw ClassId values, while
    # print_img_with_labels treats the first as a category id and the rest as
    # attribute ids -- confirm this is the intended display.
    print_img_with_labels(img_name, labels, category_name_dict, attribute_name_dict, ax)
    
def json2df(data):
    """Convert a sequence of flat JSON objects (dicts) into a DataFrame.

    Each dict becomes one row and each key one column, in order of first
    appearance. Keys missing from a dict are left as NaN in that row.

    Building the frame in one constructor call avoids growing it cell by cell
    (which reallocates on every new row or column) and keeps the natural
    dtype of each column, e.g. integers stay integers.

    Args:
        data: iterable of dicts.

    Returns:
        DataFrame with a 0..n-1 index, one row per input dict.
    """
    return pd.DataFrame(list(data))

In [None]:
# Load the training annotations. Later cells read the ImageId, ClassId,
# EncodedPixels, Height and Width columns from this frame.
train_df = pd.read_csv(input_dir + "train.csv")
# Preview the first rows.
train_df.head()

In [None]:
# Load the label vocabulary. Later cells read its "categories" and
# "attributes" lists, whose entries carry at least "id", "name" and "level".
with open(input_dir + "label_descriptions.json") as f:
    label_description = json.load(f)

In [None]:
# Tabulate the category and attribute vocabularies, with the "id" and "level"
# columns cast to plain integers.
category_df = json2df(label_description["categories"]).astype({"id": int, "level": int})
attribute_df = json2df(label_description["attributes"]).astype({"id": int, "level": int})

In [None]:
# Show the full category table (display.max_rows was raised in the first cell).
print("Category Labels")
category_df

In [None]:
# Show the full attribute table.
print("Attribute Labels")
attribute_df

In [None]:
# Summarise the size of the label vocabulary.
print("We have {} categories, and {} attributes.".format(len(label_description['categories']), len(label_description['attributes'])))
# The original sentence contained a full-width space (U+3000) and "have".
print("Each label has an ID, name, supercategory, and level.")

In [None]:
# Look at a few more annotation rows than the default preview shows.
train_df.head(10)

In [None]:
# Number of annotation rows (labels) attached to each image.
image_label_num_df = train_df.groupby("ImageId")["ClassId"].count()

# Histogram over label counts: entry k is the number of images with exactly
# k labels. Reindexing over 1..max inserts explicit zeros for counts that
# never occur, so the bars form a gap-free axis.
label_count_hist = image_label_num_df.value_counts().reindex(
    range(1, int(image_label_num_df.max()) + 1), fill_value=0)
x_list = label_count_hist.index.tolist()
y_list = label_count_hist.tolist()

fig, ax = plt.subplots(figsize=(25, 7))
# Bars sit at categorical positions 0..n-1, so the bar for k labels is at
# x = k - 1; write each bar's height just above it.
for i, j in zip(x_list, y_list):
    ax.text(i-1, j, j, ha="center", va="bottom", fontsize=13)
sns.barplot(x=x_list, y=y_list, ax=ax)
# Tick every fifth bar. Deriving the tick labels from the tick positions
# keeps both lists the same length; two independent ranges differ in length
# whenever the number of bars is 5k + 1.
tick_positions = list(range(0, len(x_list), 5))
ax.set_xticks(tick_positions)
ax.set_xticklabels([x_list[pos] for pos in tick_positions])
ax.set_title("the number of labels per image")
ax.set_xlabel("the number of labels")
ax.set_ylabel("amount");

In [None]:
# Tally how often each category id and each attribute id occurs across all
# annotation rows. Keys are the string tokens produced by classid2label.
counter_category = Counter()
counter_attribute = Counter()
for category, attribute in map(classid2label, train_df["ClassId"]):
    counter_category[category] += 1
    counter_attribute.update(attribute)
# Number of distinct categories that occur in the training data.
len(counter_category)

In [None]:
# Number of distinct attributes that occur in the training data.
len(counter_attribute)

In [None]:
# Map label ids to readable names. Ids are stored as strings so they match
# the tokens obtained by splitting ClassId.
category_name_dict = {
    str(entry["id"]): entry["name"] for entry in label_description["categories"]
}
attribute_name_dict = {
    str(entry["id"]): entry["name"] for entry in label_description["attributes"]
}

print("Category label frequency")
print_dict(counter_category, category_name_dict)

In [None]:
# Same ranked frequency table as for categories, now for attribute ids.
print("Attribute label frequency")
print_dict(counter_attribute, attribute_name_dict)

In [None]:
# Largest ClassId value. ClassId is handled as a string elsewhere in this
# notebook (it is split on "_"), so this is presumably a lexicographic
# maximum rather than a numeric one -- verify before relying on it.
train_df.ClassId.max()

In [None]:
# For every category, count how often each attribute occurs together with it:
#   attribute_num_dict[category_id][attribute_id] -> count   (all keys are strings)
# One extra key, `none_key`, counts the annotations that carry no attribute.
#
# `none_key` is one past the largest attribute id in the label vocabulary.
# Deriving it from the vocabulary rather than from the attributes seen in
# train_df keeps it from colliding with a real attribute id, and keeps the
# counters complete, when some attribute never occurs in the training data.
attribute_ids = [int(entry["id"]) for entry in label_description["attributes"]]
none_key = str(max(attribute_ids, default=-1) + 1)
# Template with a zero counter for every attribute id plus the "no attribute" slot.
init_dict = dict.fromkeys(map(str, range(int(none_key) + 1)), 0)

attribute_num_dict = {}
for class_id in train_df["ClassId"].values:
    category, attributes = classid2label(class_id)
    if category not in attribute_num_dict:
        attribute_num_dict[category] = init_dict.copy()
    per_category = attribute_num_dict[category]
    if not attributes:
        per_category[none_key] += 1
        continue
    for attribute in attributes:
        per_category[attribute] += 1

In [None]:
# One bar chart per category: the share of each attribute (plus the
# "no attribute" slot at x = none_key) among that category's annotations.
n_rows = math.ceil(len(counter_category)/2)
# squeeze=False keeps `ax` two-dimensional even when there is a single row.
fig, ax = plt.subplots(n_rows, 2, figsize=(8*2, 6*n_rows), sharey=True, squeeze=False)
tick_positions = list(range(0, int(none_key), 5))
for index, key in enumerate(sorted(map(int, attribute_num_dict.keys()))):
    counts = attribute_num_dict[str(key)]
    x = list(map(int, counts.keys()))
    total = sum(counts.values())
    y = [count / total for count in counts.values()]
    panel = ax[index//2, index%2]
    # Data must be passed by keyword; current seaborn rejects positional x/y.
    sns.barplot(x=x, y=y, ax=panel)
    panel.set_title("category:{}({})".format(key, category_name_dict[str(key)]))
    panel.set_xticks(tick_positions)
    panel.set_xticklabels(tick_positions)
# With an odd number of categories the last panel stays empty; hide it.
for unused_panel in ax.flat[len(attribute_num_dict):]:
    unused_panel.set_visible(False)
print("the ratio of attribute per category(x={} means no attribute)".format(none_key))

In [None]:
# One (Height, Width) row per image. Taking the first annotation row of each
# image assumes the size is repeated identically on all of its rows.
# Several columns must be selected with a list; the tuple form
# `["Height", "Width"]` is no longer accepted by current pandas.
image_shape_df = train_df.groupby("ImageId")[["Height", "Width"]].first()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
ax1.hist(image_shape_df.Height, bins=100)
ax1.set_title("Height distribution")
ax2.hist(image_shape_df.Width, bins=100)
ax2.set_title("Width distribution");

In [None]:
# Image with the smallest height.
img_name = image_shape_df["Height"].idxmin()
# Select the columns by name so the unpacking does not depend on column order.
height, width = image_shape_df.loc[img_name, ["Height", "Width"]]
print("Minimum height image is {},\n(H, W) = ({}, {})".format(img_name, height, width))
fig, ax = plt.subplots()
print_img(img_name, ax)

In [None]:
# Image with the largest height.
img_name = image_shape_df["Height"].idxmax()
height, width = image_shape_df.loc[img_name, ["Height", "Width"]]
print("Maximum height image is {},\n(H, W) = ({}, {})".format(img_name, height, width))
fig, ax = plt.subplots()
print_img(img_name, ax)

In [None]:
# Image with the smallest width.
img_name = image_shape_df["Width"].idxmin()
# Select the columns by name so the unpacking does not depend on column order.
height, width = image_shape_df.loc[img_name, ["Height", "Width"]]
print("Minimum width image is {},\n(H, W) = ({}, {})".format(img_name, height, width))
fig, ax = plt.subplots()
print_img(img_name, ax)

In [None]:
# Image with the largest width.
img_name = image_shape_df["Width"].idxmax()
height, width = image_shape_df.loc[img_name, ["Height", "Width"]]
print("Maximum width image is {},\n(H, W) = ({}, {})".format(img_name, height, width))
fig, ax = plt.subplots()
print_img(img_name, ax)

In [None]:
# Names of qualitative matplotlib colormaps.
# NOTE(review): this list is not referenced anywhere in the visible part of
# the notebook (the overlay cell hard-codes 'tab20_r') -- confirm it is still needed.
pallete =  [
    'Pastel1', 'Pastel2', 'Paired', 'Accent', 'Dark2',
    'Set1', 'Set2', 'Set3', 'tab10', 'tab20', 'tab20b', 'tab20c']


def make_mask_img(segment_df, category_num=None):
    """Rasterise the run-length-encoded segments of one image into a class mask.

    Args:
        segment_df: annotation rows of a single image with the columns
            "EncodedPixels", "ClassId", "Height" and "Width". The image size
            is taken from the first row.
        category_num: number of categories used for scaling. Defaults to
            ``len(counter_category)`` (module-level counter).

    Returns:
        uint8 array of shape (Height, Width). Pixels not covered by any
        segment hold ``category_num - 1``; pixels of a segment hold
        ``int(category_id / (category_num - 1) * 255)``, where category_id is
        the part of ClassId before the first "_". Later rows overwrite
        earlier ones where segments overlap.

    Raises:
        ValueError: if an EncodedPixels string has an odd number of values.

    NOTE(review): a category whose scaled value equals ``category_num - 1``
    cannot be told apart from the background (with 46 categories this is
    category 8: int(8 / 45 * 255) == 45).
    """
    if category_num is None:
        category_num = len(counter_category)
    # Positional access, so the frame does not need a freshly reset index.
    seg_width = int(segment_df["Width"].iloc[0])
    seg_height = int(segment_df["Height"].iloc[0])
    background_value = category_num - 1
    seg_img = np.full(seg_width*seg_height, background_value, dtype=np.uint8)

    for encoded_pixels, class_id in zip(segment_df["EncodedPixels"].values, segment_df["ClassId"].values):
        category_id = int(str(class_id).split("_")[0])
        pixel_value = int(category_id / background_value * 255)
        runs = list(map(int, encoded_pixels.split()))
        if len(runs) % 2 != 0:
            raise ValueError("EncodedPixels must contain start/length pairs")
        for start, length in zip(runs[0::2], runs[1::2]):
            # Starts are 1-based, and a run covers exactly `length` pixels.
            # The previous code used `length - 1` and dropped the last pixel.
            begin = start - 1
            seg_img[begin:begin+length] = pixel_value

    # The flat pixel index runs down the columns first, hence Fortran order.
    return seg_img.reshape((seg_height, seg_width), order='F')


def train_generator(df, batch_size):
    """Load the first ``batch_size`` images of ``df`` with their class masks.

    Despite its name this is a plain function: it returns one batch and does
    not yield. Images are visited in sorted ``ImageId`` order.

    Args:
        df: annotation frame with the columns "ImageId", "ClassId",
            "EncodedPixels", "Height" and "Width".
        batch_size: maximum number of images to load; must be at least 1.

    Returns:
        Tuple ``(trn_images, seg_images)`` of two equally long lists:
        channel-first image arrays as read by ``cv2.imread``, and the masks
        produced by ``make_mask_img``. Both are shorter than ``batch_size``
        when ``df`` holds fewer images.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
        FileNotFoundError: if an image cannot be read from disk.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    trn_images = []
    seg_images = []
    # groupby hands over each image's rows directly, so the frame needs no
    # particular row order or contiguous integer index.
    for img_name, segment_df in df.groupby("ImageId"):
        # Same image folder as the rest of the notebook uses.
        img_path = input_dir + "train/" + img_name
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals a missing or unreadable file with None.
            raise FileNotFoundError("Could not read image: {}".format(img_path))
        seg_img = make_mask_img(segment_df.reset_index(drop=True))

        # HWC -> CHW
        trn_images.append(img.transpose((2, 0, 1)))
        seg_images.append(seg_img)
        if len(trn_images) == batch_size:
            break
    # Also reached when fewer than batch_size images exist, so callers
    # always receive a pair of lists rather than None.
    return trn_images, seg_images

In [None]:
def cv2plt(img, isColor=True):
    """Turn a channel-first BGR image into a channel-last RGB image.

    Args:
        img: array with the channel axis first; axes are reordered with
            ``transpose(1, 2, 0)`` before the colour conversion.
        isColor: accepted for compatibility; not used by the function.

    Returns:
        The result of ``cv2.cvtColor(..., cv2.COLOR_BGR2RGB)`` on the
        channel-last view of ``img``.
    """
    channels_last = img.transpose(1, 2, 0)
    return cv2.cvtColor(channels_last, cv2.COLOR_BGR2RGB)

In [None]:
# Show six sample images with their segmentation masks overlaid.
original, segmented = train_generator(train_df, 6)
# make_mask_img fills uncovered pixels with (number of categories - 1).
# Derive the value instead of hard-coding 45 so it follows the data.
background_value = len(counter_category) - 1
fig, ax = plt.subplots(3, 2, figsize=(16, 18))
for i, (img, seg) in enumerate(zip(original, segmented)):
    panel = ax[i//2, i%2]
    panel.imshow(cv2plt(img))
    # Move the background to the top of the value range before colour-mapping.
    # The mask is modified in place, as before.
    seg[seg == background_value] = 255
    panel.imshow(seg, cmap='tab20_r', alpha=0.6)
    panel.set_title("Sample {}".format(i))

In [None]:
# Load the sample submission file for inspection.
sample_df = pd.read_csv(input_dir + "sample_submission.csv")

In [None]:
# Preview the first 20 rows of the sample submission.
sample_df.head(20)