# Разработка системы распознавания предметов интерьера в потоковом видео

## Часть 2: Выявление частоты классов в датасете

In [2]:
%matplotlib inline

#System
import os
import shutil
import random

#Base
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import seaborn as sns
from skimage.io import imread

# Seed NumPy's global RNG and the stdlib `random` module with the same fixed value.
seed = 99
np.random.seed(seed)
random.seed(seed)

### 1. Ранее написанные функции

Загрузим ранее написанные функции для удобной работы с именами файлов и для преобразования файлов с описанием в `pandas.DataFrame`.

In [5]:
# Root directory of the filtered ADE20K images.
main_dir = 'data/ADE20K_filtred/images/'
# Later cells pass `img_dir` to the directory walkers, but no visible cell
# assigns it; bind it here so a clean top-to-bottom run does not hit NameError.
# NOTE(review): assumes `img_dir` was meant to be this same directory - confirm.
img_dir = main_dir

In [4]:
def get_format(file_name):
    """Split an ADE20K-style file name into its components.

    The name is tokenised on ``_`` and the last token is split again on
    ``.``, so ``ADE_train_00000001_parts_2.png`` becomes
    ``['ADE', 'train', '00000001', 'parts', '2', 'png']``.

    Args:
        file_name: Bare file name (no directory part).

    Returns:
        A list ``[name, description, train_or_val, parts_num]``:

        * ``name`` - the third token of the file name;
        * ``description`` - ``'img'`` for ``.jpg``, ``'text'`` for ``.txt``,
          the token following ``name`` for ``.png`` (e.g. ``'seg'`` or
          ``'parts'``), and ``0`` when nothing could be determined;
        * ``train_or_val`` - the second token of the file name;
        * ``parts_num`` - the integer following the ``.png`` description
          token when that token is numeric, otherwise ``0``.

    Raises:
        ValueError: If the name yields fewer than three tokens.
    """
    underscore_parts = file_name.split("_")
    tokens = underscore_parts[:-1] + underscore_parts[-1].split(".")
    if len(tokens) < 3:
        raise ValueError(
            "unexpected file name %r: expected at least "
            "'<prefix>_<train|val>_<name>'" % (file_name,)
        )

    _prefix, train_or_val, name = tokens[:3]
    extension = tokens[-1]

    # 0 is the "unknown" marker that callers compare against string labels.
    description = 0
    parts_num = 0
    if extension == 'jpg':
        description = 'img'
    elif extension == 'txt':
        description = 'text'
    elif extension == 'png':
        # Tokens between the image name and the extension. The slice is empty
        # for a bare "<prefix>_<split>_<name>.png", which previously raised
        # IndexError; such a file now keeps the "unknown" defaults.
        extra = tokens[3:-1]
        if extra:
            description = extra[0]
            if len(extra) > 1 and extra[1].isdigit():
                parts_num = int(extra[1])

    return [name, description, train_or_val, parts_num]

Однако изменим их с учётом того, что работать мы будем только с объектами верхнего уровня (`part_level = 0`), без их составных частей.

In [10]:
def get_seg_description(description_path):
    """Read one description file and return its top-level objects.

    Every non-empty line is expected to hold six fields separated by
    ``" # "``; in order they are unpacked as instance number, part level,
    occlusion flag, class name, original name and attribute list. Only lines
    whose part level is ``'0'`` are kept.

    Args:
        description_path: Path of the text file to parse.

    Returns:
        A ``pandas.DataFrame`` with the columns ``class_name``,
        ``original_name`` and ``attributes`` (surrounding double quotes
        stripped from the attributes). The columns are present even when no
        line qualifies.

    Raises:
        ValueError: If a non-empty line has fewer than six fields.
    """
    columns = ["class_name", "original_name", "attributes"]
    rows = []
    with open(description_path) as f:
        for raw_line in f:
            line = raw_line.rstrip()
            if not line:
                # Blank (e.g. trailing) lines carry no record.
                continue

            # maxsplit=5 keeps any " # " inside the attribute list intact
            # instead of breaking the six-way unpack.
            (_instance_n, part_level, _occluded,
             class_name, original_name, attributes_list) = line.split(" # ", 5)

            if part_level != '0':  # keep top-level objects only
                continue

            rows.append([class_name, original_name, attributes_list.strip("\"")])

    # Passing `columns` explicitly keeps the schema stable for empty results.
    return pd.DataFrame(rows, columns=columns)

### 2. Получение полного списка классов

Чтобы определить, какие классы нам нужно детектировать, выделим все классы из описаний фотографий.

In [13]:
def get_full_df_description(dir_path, progress_bar=False):
    """Collect the top-level object descriptions of every text file under a directory.

    Walks ``dir_path`` recursively, classifies each file with ``get_format``
    and parses the ones labelled ``'text'`` with ``get_seg_description``.

    Args:
        dir_path: Root directory to walk.
        progress_bar: When true, print ``"Done: N"`` after every 500 parsed
            description files.

    Returns:
        A single ``pandas.DataFrame`` with the columns ``class_name``,
        ``original_name`` and ``attributes`` and a fresh 0..n-1 index.
    """
    columns = ["class_name", "original_name", "attributes"]
    frames = []
    progress_counter = 0

    for path, _dirs, files in os.walk(dir_path):
        for file in files:
            _name, description, _train_or_val, _parts_num = get_format(file)
            if description != 'text':
                continue

            frame = get_seg_description(os.path.join(path, file))
            # Empty frames add no rows; skipping them also keeps frames
            # without named columns out of the concatenation.
            if not frame.empty:
                frames.append(frame)

            if progress_bar:
                progress_counter += 1
                if progress_counter % 500 == 0:
                    print("Done: " + str(progress_counter))

    if not frames:
        return pd.DataFrame(columns=columns)
    # One concat at the end replaces the per-file DataFrame.append, which was
    # removed in pandas 2.0 and copied the whole frame on every call.
    return pd.concat(frames, ignore_index=True)

In [16]:
%%time
full_obj_class_list = get_full_df_description(img_dir, progress_bar=True)

Done: 500
Done: 1000
Done: 1500
Done: 2000
Done: 2500
Done: 3000
Done: 3500
Done: 4000
Done: 4500
Done: 5000
Done: 5500
Done: 6000
Done: 6500
Done: 7000
CPU times: user 58.3 s, sys: 247 ms, total: 58.6 s
Wall time: 59 s


Посмотрим на получившиеся значения

In [17]:
full_obj_class_list

Unnamed: 0,class_name,original_name,attributes
0,sky,sky,
1,wall,wall,
2,wall,wall,
3,wall,wall,
4,wall,wall,
...,...,...,...
174895,cushion,cushion,
174896,lamp,table lamp,"direct,general,off,volumetric"
174897,lamp,table lamp,off
174898,lamp,table lamp,"direct,general,off,volumetric"


### 3. Выделение самых частых значений

Выявление самых частых классов

In [19]:
full_obj_class_list.class_name.value_counts()[:15]

wall                                     20374
chair                                    10156
floor, flooring                           7137
painting, picture                         6955
cabinet                                   6838
windowpane, window                        6689
table                                     6221
light, light source                       5830
ceiling                                   5318
lamp                                      5099
cushion                                   4756
curtain, drape, drapery, mantle, pall     3999
book                                      3820
door                                      2703
plant, flora, plant life                  2616
Name: class_name, dtype: int64

Посмотрим количество уникальных значений

In [20]:
full_obj_class_list.class_name.unique().shape

(1263,)

Зададим минимальное количество объектов класса в датасете, при превышении которого класс попадает в список для распознавания

In [22]:
# Keep only the classes that occur strictly more than `min_obj` times.
min_obj = 25  # occurrence threshold for a class to be kept for recognition
class_counts = full_obj_class_list.class_name.value_counts()
indexes = (class_counts > min_obj).tolist()
class_list = class_counts[indexes].index.tolist()
len(class_list)

261

In [23]:
class_list

['wall',
 'chair',
 'floor, flooring',
 'painting, picture',
 'cabinet',
 'windowpane, window',
 'table',
 'light, light source',
 'ceiling',
 'lamp',
 'cushion',
 'curtain, drape, drapery, mantle, pall',
 'book',
 'door',
 'plant, flora, plant life',
 'bed',
 'pillow',
 'armchair',
 'bottle',
 'person, individual, someone, somebody, mortal, soul',
 'vase',
 'box',
 'sofa, couch, lounge',
 'wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle',
 'shelf',
 'flower',
 'plate',
 'glass, drinking glass',
 'rug, carpet, carpeting',
 'sink',
 'sconce',
 'mirror',
 'pot, flowerpot',
 'work surface',
 'towel',
 'desk',
 'swivel chair',
 'coffee table, cocktail table',
 'bowl',
 'plaything, toy',
 'basket, handbasket',
 'pot',
 'candlestick, candle holder',
 'stool',
 'switch, electric switch, electrical switch',
 'chest of drawers, chest, bureau, dresser',
 'television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box',

### 4. Класс для работы с данными

In [426]:
class ClassList:
    """Ordered collection of class-name groups built from description files.

    ``class_list`` is a list of lists of class names. Each inner list is one
    group, and a group's position in the outer list is what ``find_i``
    returns. The first group is always the placeholder ``['-']``; the rest
    start out as one single-name group per sufficiently frequent class, in
    descending order of frequency.
    """

    def __init__(self, dir_path, min_obj=25, progress_bar=False):
        """Scan ``dir_path`` and keep classes seen more than ``min_obj`` times.

        Args:
            dir_path: Root directory holding the description text files.
            min_obj: A class is kept only if it occurs strictly more than
                this many times.
            progress_bar: Forwarded to ``get_full_df``.
        """
        df = self.get_full_df(dir_path, progress_bar=progress_bar)

        counts = df.class_name.value_counts()
        frequent_names = counts[counts > min_obj].index
        self.class_list = [['-']] + [[name] for name in frequent_names]

    def get_full_df(self, dir_path, progress_bar):
        """Gather the top-level object descriptions of every text file under ``dir_path``.

        Args:
            dir_path: Root directory to walk recursively.
            progress_bar: When true, print ``"Done: N"`` after every 500
                parsed files and ``"Done!"`` at the end.

        Returns:
            A ``pandas.DataFrame`` with the columns ``class_name``,
            ``original_name`` and ``attributes``.
        """
        columns = ["class_name", "original_name", "attributes"]
        frames = []
        progress_counter = 0

        for path, _dirs, files in os.walk(dir_path):
            for file in files:
                _name, description, _train_or_val, _parts_num = get_format(file)
                if description != 'text':
                    continue

                frame = get_seg_description(os.path.join(path, file))
                if not frame.empty:
                    frames.append(frame)

                if progress_bar:
                    progress_counter += 1
                    if progress_counter % 500 == 0:
                        print("Done: " + str(progress_counter))
        if progress_bar:
            print("Done!")

        if not frames:
            return pd.DataFrame(columns=columns)
        # Single concat instead of DataFrame.append per file (removed in
        # pandas 2.0 and quadratic when used in a loop).
        return pd.concat(frames, ignore_index=True)

    def remove_class(self, class_name):
        """Remove ``class_name`` from its group, dropping the group if it becomes empty.

        Returns:
            ``True`` if the name was found and removed, ``False`` otherwise.
        """
        i = self.find_i(class_name)
        if i == -1:
            return False

        group = self.class_list[i]
        group.remove(class_name)
        if not group:
            del self.class_list[i]
        return True

    def find_i(self, class_name):
        """Return the index of the first group containing ``class_name``, or ``-1``."""
        for i, group in enumerate(self.class_list):
            if class_name in group:
                return i
        return -1

    def join(self, class_name_from, class_name_to):
        """Merge the group of ``class_name_from`` into the group of ``class_name_to``.

        The names of the source group are appended to the target group and
        the source group is removed, so every later group shifts down by one.

        Returns:
            ``True`` on success; ``False`` if either name is unknown or both
            names already belong to the same group.
        """
        i_from = self.find_i(class_name_from)
        i_to = self.find_i(class_name_to)

        if i_from == -1 or i_to == -1 or i_from == i_to:
            return False

        self.class_list[i_to].extend(self.class_list[i_from])
        # Delete by position: the intent is "this group", not "the first
        # group that compares equal to it".
        del self.class_list[i_from]
        return True

    def size(self):
        """Return the number of groups (including the ``'-'`` placeholder if present)."""
        return len(self.class_list)

    def save_in_file(self, path):
        """Write every group to ``path``, one ``str(list)`` representation per line."""
        with open(path, 'w') as f:
            f.writelines(str(group) + '\n' for group in self.class_list)
        
cl = ClassList(img_dir, min_obj=50, progress_bar=True)                

Done: 500
Done: 1000
Done: 1500
Done: 2000
Done: 2500
Done: 3000
Done: 3500
Done: 4000
Done: 4500
Done: 5000
Done: 5500
Done: 6000
Done: 6500
Done: 7000
Done


### 5. Удаление лишних классов

In [427]:
# Classes dropped from the recognition list, in the order they are removed.
classes_to_remove = (
    'curtain, drape, drapery, mantle, pall',
    'plant, flora, plant life',
    'bottle',
    'vase',
    'box',
    'wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle',
    'flower',
    'plate',
    'glass, drinking glass',
    'mirror',
    'pot, flowerpot',
    'towel',
    'bowl',
    'plaything, toy',
    'basket, handbasket',
    'pot',
    'switch, electric switch, electrical switch',
    'column, pillar',
    'figurine, statuette',
    'tray',
    'railing, rail',
    'jar',
    'napkin, table napkin, serviette',
    'fruit',
    'mug',
    'pool ball',
    'shoe',
    'hood, exhaust hood',
    'pool table, billiard table, snooker table',
    'bag',
    'pitcher, ewer',
    'place mat',
    'cup',
    'bucket, pail',
    'knife',
    'toilet tissue, toilet paper, bathroom tissue',
    'ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin',
    'can, tin, tin can',
    'tree',
    'sculpture',
    'signboard, sign',
    'soap dispenser',
    'pen',
    'cue, cue stick, pool cue, pool stick',
    'rod',
    'teacup',
    'food, solid food',
    'pottery, clayware',
    'arcade machine',
    'dishrag, dishcloth',
    'ball',
    'towel rack, towel horse',
    'soap',
    'canister, cannister, tin',
    'bag, handbag, pocketbook, purse',
    'paper towel',
    'sky',
    'fork',
    'pipe, pipage, piping',
    'saucepan',
    'soap dish',
    'ashtray',
    'bag, traveling bag, travelling bag, grip, suitcase',
    'building, edifice',
    'spoon',
    'partition, divider',
    'coffee cup',
    'shutter',
    'backpack, back pack, knapsack, packsack, rucksack, haversack',
    'poster, posting, placard, notice, bill, card',
    'bouquet, corsage, posy, nosegay',
    'microphone, mike',
    'mousepad, mouse mat',
    'machine',
    'remote control, remote',
    'vent, venthole, vent-hole, blowhole',
    'spectacles, specs, eyeglasses, glasses',
    'hanger',
    'countertop',
    'beam',
    'heater, warmer',
    'kettle, boiler',
    'bannister, banister, balustrade, balusters, handrail',
    'toaster',
    'washer, automatic washer, washing machine',
    'piano, pianoforte, forte-piano',
    'printer',
    'dishwasher, dish washer, dishwashing machine',
    'loudspeaker, speaker, speaker unit, loudspeaker system, speaker system',
)

# remove_class() returns False when the name is not in the list. Collect those
# names instead of discarding the result, so a misspelled or absent class is
# reported rather than silently left in place.
not_removed = []
for class_to_remove in classes_to_remove:
    if not cl.remove_class(class_to_remove):
        not_removed.append(class_to_remove)

if not_removed:
    print("Not found in the class list (nothing removed): " + "; ".join(not_removed))

True

In [428]:
cl.size()

104

### 6. Объединение классов

In [429]:
cl.class_list

[['-'],
 ['wall'],
 ['chair'],
 ['floor, flooring'],
 ['painting, picture'],
 ['cabinet'],
 ['windowpane, window'],
 ['table'],
 ['light, light source'],
 ['ceiling'],
 ['lamp'],
 ['cushion'],
 ['book'],
 ['door'],
 ['bed'],
 ['pillow'],
 ['armchair'],
 ['person, individual, someone, somebody, mortal, soul'],
 ['sofa, couch, lounge'],
 ['shelf'],
 ['rug, carpet, carpeting'],
 ['sink'],
 ['sconce'],
 ['work surface'],
 ['desk'],
 ['swivel chair'],
 ['coffee table, cocktail table'],
 ['candlestick, candle holder'],
 ['stool'],
 ['chest of drawers, chest, bureau, dresser'],
 ['television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box'],
 ['blind, screen'],
 ['clock'],
 ['stove, kitchen stove, range, kitchen range, cooking stove'],
 ['chandelier, pendant, pendent'],
 ['spotlight, spot'],
 ['paper'],
 ['fireplace, hearth, open fireplace'],
 ['computer, computing machine, computing device, data processor, electronic computer, information processing 

In [430]:
# Merge plan: each entry is (target class, [classes merged into it]).
# Entries are applied top to bottom, and the sources of one entry left to
# right; this order fixes the order of names inside every merged group.
computer_class = ('computer, computing machine, computing device, data processor, '
                  'electronic computer, information processing system')
merge_plan = [
    ('wall', ['floor, flooring', 'ceiling']),
    ('bed', ['cushion', 'pillow', 'eiderdown, duvet, continental quilt',
             'blanket, cover', 'cradle']),
    ('chair', ['swivel chair', 'seat', 'armchair', 'stool', 'bench']),
    ('paper', ['book', 'magazine', 'notebook',
               'document, written document, papers',
               'booklet, brochure, folder, leaflet, pamphlet']),
    # Fix: the ottoman/pouf class used to be merged into 'paper' (it sat at
    # the end of the 'paper' group of calls); it is seating furniture, so it
    # is merged into 'chair' instead.
    ('chair', ['ottoman, pouf, pouffe, puff, hassock']),
    ('door', ['double door', 'doorframe, doorcase', 'screen door, screen']),
    ('light, light source', [
        'lamp',
        'sconce',
        'candlestick, candle holder',
        'spotlight, spot',
        'candle, taper, wax light',
        'fluorescent, fluorescent fixture',
        'light bulb, lightbulb, bulb, incandescent lamp, electric light, electric-light bulb',
        'candelabrum, candelabra',
        'chandelier, pendant, pendent',
    ]),
    ('table', ['work surface', 'desk', 'coffee table, cocktail table',
               'kitchen island', 'counter', 'console table, console']),
    ('windowpane, window', ['blind, screen']),
    ('monitor, monitoring device', [
        'television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box',
        'screen, crt screen',
        'screen, silver screen, projection screen',
    ]),
    ('sink', ['faucet, spigot']),
    ('bed', ['sofa, couch, lounge']),
    ('blackboard, chalkboard', ['bulletin board, notice board', 'board, plank']),
    ('windowpane, window', ['skylight, fanlight', 'pane, pane of glass, window glass']),
    (computer_class, ['laptop, laptop computer', 'keyboard', 'system']),
    ('shirt', ['jacket', 'sweater, jumper', 'trouser, pant', 'hat, chapeau, lid']),
    # The whole (already merged) 'shirt' group then goes into 'apparel'.
    ('apparel, wearing apparel, dress, clothes', ['shirt']),
    ('stove, kitchen stove, range, kitchen range, cooking stove', ['grill, grille, grillwork']),
    ('oven', ['microwave, microwave oven']),
    ('cabinet', ['shelf', 'bookcase', 'wardrobe, closet, press', 'rack']),
    ('stairs, steps', ['stairway, staircase', 'step, stair']),
    ('rug, carpet, carpeting', ['tapestry, tapis']),
    ('painting, picture', ['drawing']),
    ('chest of drawers, chest, bureau, dresser', ['chest', 'drawer']),
    ('bar', ['buffet, counter, sideboard', 'booth, cubicle, stall, kiosk']),
    ('shower', ['shower stall, shower bath']),
    ('coffee maker', ['teapot']),
    ('air conditioner, air conditioning', ['radiator', 'fan']),
]

# join() returns False when either class is missing or both are already in
# the same group; report those pairs instead of ignoring the result.
failed_joins = []
for join_target, join_sources in merge_plan:
    for join_source in join_sources:
        if not cl.join(join_source, join_target):
            failed_joins.append((join_source, join_target))

for join_source, join_target in failed_joins:
    print("Join skipped: '" + join_source + "' -> '" + join_target + "'")

True

In [431]:
cl.size()

32

In [432]:
cl.save_in_file('data/ADE20K_filtred/class_list.txt')

### 7. Совмещение с индексами классов

In [433]:
# Load the object names: the first line of objectnames.txt is read as a
# tab-separated list, and single quotes around each name are stripped.
with open('data/ADE20K_filtred/objectnames.txt', 'r') as f:
    # Drop the line terminator before splitting; otherwise the last name keeps
    # a trailing "'\n" and can never match a class name.
    raw_obj_names = f.readline().rstrip("\r\n").split("\t")
obj_names = [name.strip("\'") for name in raw_obj_names]

Запишем правила перекодирования классов в файл

In [453]:
# Write one line per class group in the form
#   <group index>;<obj_names index>|<obj_names index>...;<name>|<name>...
# where each obj_names index is the position of the name in `obj_names`.
with open("data/ADE20K_filtred/class_encode.txt", 'w') as f:
    for group_index, group_names in enumerate(cl.class_list):
        source_indexes = "|".join(str(obj_names.index(member)) for member in group_names)
        f.write(str(group_index) + ';' + source_indexes + ';' + "|".join(group_names) + '\n')