# Разработка системы распознавания предметов интерьера в потоковом видео

## Часть 2: Выявление частоты классов в датасете

In [39]:
%matplotlib inline

#System
import os
import shutil
import random

#Base
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2

# Seed the stdlib and NumPy random generators so that runs are reproducible.
# TensorFlow is not seeded here: this notebook never imports it, and calling
# `tf.set_random_seed` without the import raises NameError on a fresh kernel.
seed = 99
np.random.seed(seed)
random.seed(seed)

### Ранее написанные функции

 Загрузим ранее написанные функции для удобной работы с именами файлов и преобразования файлов с описанием в pandas.DataFrame

In [40]:
# Root directory of the image tree. Keep the trailing slash: relative file
# paths are appended to this value by plain string concatenation.
img_dir = 'data/ADE20K_filtred/images/'

In [41]:
def get_format(file_name):
    """Split a dataset file name into its descriptive components.

    The name is tokenised on "_" and the last token additionally on ".",
    e.g. ``ADE_train_00001217_atr.txt`` ->
    ``['ADE', 'train', '00001217', 'atr', 'txt']``.

    Returns a list ``[name, description, train_or_val, parts_num]`` where
    ``description`` is ``'img'`` for ``.jpg`` files, ``'text'`` for ``.txt``
    files and the fourth token for any other extension, and ``parts_num`` is
    the fifth token as an int when it is numeric (otherwise 0).

    Raises ValueError when the name has fewer than three tokens and
    IndexError when a non-jpg/txt name has fewer than five tokens.
    """
    *leading, last = file_name.split("_")
    tokens = leading + last.split(".")
    _prefix, train_or_val, name = tokens[:3]
    extension = tokens[-1]

    parts_num = 0
    if extension == 'jpg':
        description = 'img'
    elif extension == 'txt':
        description = 'text'
    else:
        # Any other extension (png masks in practice): the kind of mask is the
        # fourth token, optionally followed by a numeric level.
        description = tokens[3]
        level = tokens[4]
        if level.isdigit():
            parts_num = int(level)
    return [name, description, train_or_val, parts_num]

Но преобразуем их с учётом того, что мы будем работать только с верхними уровнями объектов

In [42]:
def get_seg_description(path):
    """Read an annotation text file and return its top-level objects.

    Every non-empty line is expected to hold six fields separated by " # ":
    instance number, part level, occlusion flag, class name, original name
    and a (possibly quoted) attribute list. Only rows whose part level is
    '0' are kept.

    Returns a DataFrame with the columns ``class_name``, ``original_name``
    and ``attributes``. The columns are present even when no row qualifies,
    so results can always be concatenated safely.

    Raises ValueError if a non-empty line does not contain exactly six fields.
    """
    columns = ["class_name", "original_name", "attributes"]
    rows = []
    with open(path) as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            (instance_n, part_level, occluded,
             class_name, original_name, attributes_list) = line.split(" # ")

            if part_level != '0':  # keep top-level objects only
                continue

            rows.append([class_name, original_name, attributes_list.strip("\"")])
    return pd.DataFrame(rows, columns=columns)

In [63]:
get_seg_description(img_dir + 'train/a/alcove/ADE_train_00001217_atr.txt').head()

Unnamed: 0,class_name,original_name,attributes
0,wall,wall,
1,wall,wall,
2,wall,wall,
3,wall,wall,
4,wall,wall,


### Получение полного списка классов

Для определения того, какие классы нам нужно детектировать, выделим все классы из описаний фотографий.

In [44]:
def get_full_df_description(dir_path):
    """Collect the object descriptions of every text file under ``dir_path``.

    Walks the directory tree, classifies each file with ``get_format`` and,
    for files described as 'text', parses them with ``get_seg_description``.
    Progress is printed after every 100 text files.

    The per-file frames are gathered in a list and concatenated once at the
    end: growing a DataFrame inside the loop copies all previous rows on each
    step, and ``DataFrame.append`` no longer exists in pandas 2.x.

    Returns a DataFrame with the columns ``class_name``, ``original_name``
    and ``attributes`` (empty when no text file is found).
    """
    columns = ["class_name", "original_name", "attributes"]
    frames = []
    total_rows = 0  # rows gathered so far, reported in the progress message
    s = 0           # number of text files seen
    for path, dirs, files in os.walk(dir_path):
        for file in files:
            name, description, train_or_val, parts_num = get_format(file)
            if description != 'text':
                continue
            s += 1
            if s % 100 == 0:
                print(str(s) + " done, size: " + str((total_rows, len(columns))))

            part = get_seg_description(os.path.join(path, file))
            if not part.empty:
                frames.append(part)
                total_rows += len(part)

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

In [45]:
%%time
full_df = get_full_df_description(img_dir)

100 done, size: (1538, 3)
200 done, size: (4064, 3)
300 done, size: (7190, 3)
400 done, size: (10455, 3)
500 done, size: (13909, 3)
600 done, size: (17057, 3)
700 done, size: (20342, 3)
800 done, size: (23606, 3)
900 done, size: (26670, 3)
1000 done, size: (28989, 3)
1100 done, size: (31311, 3)
1200 done, size: (33564, 3)
1300 done, size: (36022, 3)
1400 done, size: (38986, 3)
1500 done, size: (41238, 3)
1600 done, size: (44139, 3)
1700 done, size: (47081, 3)
1800 done, size: (50156, 3)
1900 done, size: (53103, 3)
2000 done, size: (55474, 3)
2100 done, size: (57733, 3)
2200 done, size: (59438, 3)
2300 done, size: (61357, 3)
2400 done, size: (64021, 3)
2500 done, size: (65825, 3)
2600 done, size: (67819, 3)
2700 done, size: (69667, 3)
2800 done, size: (71569, 3)
2900 done, size: (74665, 3)
3000 done, size: (77696, 3)
3100 done, size: (81417, 3)
3200 done, size: (83983, 3)
3300 done, size: (86552, 3)
3400 done, size: (88910, 3)
3500 done, size: (91387, 3)
3600 done, size: (93755, 3)
3700

Посмотрим на получившиеся значения

In [38]:
full_df

Unnamed: 0,class_name,original_name,attributes
0,sky,sky,
1,wall,wall,
2,wall,wall,
3,wall,wall,
4,wall,wall,
...,...,...,...
174895,cushion,cushion,
174896,lamp,table lamp,"direct,general,off,volumetric"
174897,lamp,table lamp,off
174898,lamp,table lamp,"direct,general,off,volumetric"


### Посмотрим на самые частые

Выявление самых частых классов

In [27]:
full_df.class_name.value_counts()[:15]

wall                                     20374
chair                                    10156
floor, flooring                           7137
painting, picture                         6955
cabinet                                   6838
windowpane, window                        6689
table                                     6221
light, light source                       5830
ceiling                                   5318
lamp                                      5099
cushion                                   4756
curtain, drape, drapery, mantle, pall     3999
book                                      3820
door                                      2703
plant, flora, plant life                  2616
Name: class_name, dtype: int64

Посмотрим количество уникальных значений

In [25]:
full_df.class_name.unique().shape

(1263,)

Зададим минимальное количество объектов: оставим только те классы, у которых в датасете больше `min_obj` объектов верхнего уровня

In [60]:
# A class is kept for recognition only if it occurs MORE than min_obj times.
min_obj = 25
# Count the occurrences once and reuse the result for the mask and the selection.
class_counts = full_df.class_name.value_counts()
indexes = list(class_counts > min_obj)  # boolean mask aligned with class_counts
class_list = list(class_counts[indexes].index)
len(class_list)

261

In [62]:
class_list

['wall',
 'chair',
 'floor, flooring',
 'painting, picture',
 'cabinet',
 'windowpane, window',
 'table',
 'light, light source',
 'ceiling',
 'lamp',
 'cushion',
 'curtain, drape, drapery, mantle, pall',
 'book',
 'door',
 'plant, flora, plant life',
 'bed',
 'pillow',
 'armchair',
 'bottle',
 'person, individual, someone, somebody, mortal, soul',
 'vase',
 'box',
 'sofa, couch, lounge',
 'wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle',
 'shelf',
 'flower',
 'plate',
 'glass, drinking glass',
 'rug, carpet, carpeting',
 'sink',
 'sconce',
 'mirror',
 'pot, flowerpot',
 'work surface',
 'towel',
 'desk',
 'swivel chair',
 'coffee table, cocktail table',
 'bowl',
 'plaything, toy',
 'basket, handbasket',
 'pot',
 'candlestick, candle holder',
 'stool',
 'switch, electric switch, electrical switch',
 'chest of drawers, chest, bureau, dresser',
 'television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box',

Удалим те, которые нам точно не нужны

In [23]:
# Class names to drop from class_list. remove_exclusion() compares each entry
# with the individual ', '-separated synonyms of a class, so 'floor' also
# removes 'floor, flooring' while 'wall' leaves 'wall socket, wall plug, ...'.
exclusion_list = ['wall', 'floor', 'ceiling']

In [24]:
def remove_exclusion(class_list, exclusion_list):
    """Remove, in place, every class that has an excluded name among its synonyms.

    Each entry of ``class_list`` is a string of ', '-separated synonyms
    (e.g. 'floor, flooring'). An entry is dropped when any one synonym is
    exactly equal to an item of ``exclusion_list``; partial matches such as
    'wall' inside 'wall socket' do not count.

    The list is rebuilt and assigned back through a slice, so the caller's
    list object is updated. Removing items while iterating over the same
    list would skip the element that follows each removal.

    Returns None.
    """
    excluded = set(exclusion_list)
    class_list[:] = [
        class_ for class_ in class_list
        if excluded.isdisjoint(class_.split(', '))
    ]

In [25]:
remove_exclusion(class_list, exclusion_list)

И добавим "пустой" класс

In [26]:
# Prepend the placeholder class "-" so that it becomes the first entry of
# class_list. NOTE: re-running this cell prepends another "-" each time.
class_list = ["-"] + class_list
class_list

['-',
 'chair',
 'painting, picture',
 'cabinet',
 'windowpane, window',
 'table',
 'light, light source',
 'lamp',
 'cushion',
 'curtain, drape, drapery, mantle, pall',
 'book',
 'door',
 'plant, flora, plant life',
 'bed',
 'pillow',
 'armchair',
 'bottle',
 'person, individual, someone, somebody, mortal, soul',
 'vase',
 'box',
 'sofa, couch, lounge',
 'wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle',
 'shelf',
 'flower',
 'plate',
 'glass, drinking glass',
 'rug, carpet, carpeting',
 'sink',
 'sconce',
 'mirror',
 'pot, flowerpot',
 'work surface',
 'towel',
 'desk',
 'swivel chair',
 'coffee table, cocktail table',
 'bowl',
 'plaything, toy',
 'basket, handbasket',
 'pot',
 'candlestick, candle holder',
 'stool',
 'switch, electric switch, electrical switch',
 'chest of drawers, chest, bureau, dresser',
 'television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box',
 'blind, screen',
 'clock',
 'chan

Мы получили список допустимых для нас классов

### Совмещение с индексами классов

In [27]:
# Load the object names from the first line of objectnames.txt: the names are
# tab-separated and wrapped in single quotes.
with open('data/ADE20K_filtred/objectnames.txt', 'r') as f:
    # Drop the line terminator before splitting; otherwise the last name keeps
    # its closing quote, because the newline hides it from strip("'").
    raw_names = f.readline().rstrip("\r\n").split("\t")
obj_names = [name.strip("\'") for name in raw_names]

In [28]:
obj_names.index('wall')

2977

In [29]:
# Build two lookups in a single pass over class_list:
#   class_index_dict: class name -> position of that name in obj_names ("old" index)
#   index_old_to_new: "old" index -> position of the class in class_list ("new" index)
class_index_dict = {}
index_old_to_new = {}
for new_index, class_name in enumerate(class_list):
    old_index = obj_names.index(class_name)
    class_index_dict[class_name] = old_index
    index_old_to_new[old_index] = new_index

In [30]:
class_index_dict.values()

dict_values([0, 470, 1734, 349, 3054, 2683, 1450, 1394, 688, 686, 235, 773, 1909, 164, 1868, 56, 248, 1830, 2931, 265, 2472, 2981, 2328, 977, 1918, 1097, 2177, 2387, 2242, 1563, 1980, 3086, 2820, 723, 2678, 570, 258, 1929, 145, 1973, 377, 2585, 2675, 490, 2732, 211, 529, 479, 2597, 580, 917, 2508, 1755, 942, 2849, 2052, 590, 2729, 2792, 154, 1701, 893, 2095, 1484, 1348, 2984, 1643, 1550, 1032, 628, 2271, 238, 782, 1618, 1943, 777, 2340, 1582, 1259, 1968, 746, 205, 375, 1707, 1473, 1947, 94, 1891, 1895, 676, 2045, 981, 568])

Мы получили отображение имени класса в его номер

In [31]:
index_old_to_new.items()

dict_items([(0, 0), (470, 1), (1734, 2), (349, 3), (3054, 4), (2683, 5), (1450, 6), (1394, 7), (688, 8), (686, 9), (235, 10), (773, 11), (1909, 12), (164, 13), (1868, 14), (56, 15), (248, 16), (1830, 17), (2931, 18), (265, 19), (2472, 20), (2981, 21), (2328, 22), (977, 23), (1918, 24), (1097, 25), (2177, 26), (2387, 27), (2242, 28), (1563, 29), (1980, 30), (3086, 31), (2820, 32), (723, 33), (2678, 34), (570, 35), (258, 36), (1929, 37), (145, 38), (1973, 39), (377, 40), (2585, 41), (2675, 42), (490, 43), (2732, 44), (211, 45), (529, 46), (479, 47), (2597, 48), (580, 49), (917, 50), (2508, 51), (1755, 52), (942, 53), (2849, 54), (2052, 55), (590, 56), (2729, 57), (2792, 58), (154, 59), (1701, 60), (893, 61), (2095, 62), (1484, 63), (1348, 64), (2984, 65), (1643, 66), (1550, 67), (1032, 68), (628, 69), (2271, 70), (238, 71), (782, 72), (1618, 73), (1943, 74), (777, 75), (2340, 76), (1582, 77), (1259, 78), (1968, 79), (746, 80), (205, 81), (375, 82), (1707, 83), (1473, 84), (1947, 85), (94

И отображение из старой кодировки в новую

In [32]:
import csv

# Dump the old-index -> new-index mapping, one "old,new" row per class.
# newline='' leaves line endings to the csv writer; without it every row is
# followed by an empty line on Windows.
with open('data/ADE20K_filtred/index_old_to_new.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(index_old_to_new.items())

In [33]:
# Dump the class-name -> old-index mapping, one "name,index" row per class.
# newline='' leaves line endings to the csv writer; without it every row is
# followed by an empty line on Windows.
with open('data/ADE20K_filtred/class_index_dict.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(class_index_dict.items())

In [34]:
len(index_old_to_new)

93

In [56]:
# Materialise both mappings as lists of (key, value) pairs so that their
# entries can be addressed by position.
list_old_to_new = list(index_old_to_new.items())
list_class_to_old = list(class_index_dict.items())

In [63]:
# Write one "class;old_index;new_index" line per class. The two lists are
# paired by position, so they are expected to be in the same order.
with open('data/ADE20K_filtred/class_old_new.csv', 'w') as f:
    for position, (old, new) in enumerate(list_old_to_new):
        class_ = list_class_to_old[position][0]
        f.write(";".join([class_, str(old), str(new)]) + "\n")

In [None]:
class ClassDescription():
    def __init__(self, path):
        