## Checking dataset consistency

In [None]:
#!g1.1 #noqa
import pickle
import random
from typing import List, Set, Tuple

import matplotlib.pyplot as plt
import pandas as pd

from processingDataSet import get_not_RGB_pic,  ImageNetDataset, PreprocessingData

In [None]:
#!g1.1  #noqa
# Single seed shared by `random.seed` and `PreprocessingData.get_data` further down.
random_seed = 10
# Directory passed to `PreprocessingData.get_data` and to `ImageNetDataset` as the data location.
data_path = '/home/jupyter/mnt/datasets/ImageNet/ILSVRC/Data/CLS-LOC/train'

In [None]:
#!g1.1  #noqa
# Seed Python's built-in RNG so anything relying on the `random` module is repeatable.
random.seed(random_seed)

In [None]:
#!g1.1  #noqa
# NOTE(review): `get_data` presumably splits the images under `data_path` into
# train and validation sample lists using the seed — confirm in processingDataSet.
prData = PreprocessingData()
train_data, val_data = prData.get_data(data_path, random_seed)  # all dataset

In [None]:
#!g1.1  #noqa
# Wrap each sample list in an `ImageNetDataset`; these objects are what
# `get_not_RGB_pic` and `data_counter` are given in the cells below.
train_set = ImageNetDataset(data_path, train_data)
val_set = ImageNetDataset(data_path, val_data)

In [None]:
#!g1.1  #noqa
# Collect the non-RGB images of the training set. The result is used below as a
# collection of positions into `train_data` / `train_set`.
# NOTE(review): presumably this opens every image and may be slow — confirm.
gray_train = get_not_RGB_pic(train_set)

In [None]:
#!g1.1  #noqa
# Number of non-RGB training images found (shown as the cell output).
len(gray_train)

In [None]:
#!g1.1  #noqa
# Collect the non-RGB images of the validation set. The result is used below as
# a collection of positions into `val_data` / `val_set`.
gray_val = get_not_RGB_pic(val_set)

In [None]:
#!g1.1  #noqa
# Number of non-RGB validation images found (shown as the cell output).
len(gray_val)

In [None]:
#!g1.1  #noqa
def data_filter(data: List[Tuple[str, int]], excluded: Set) -> List[Tuple[str, int]]:
    """Return the items of ``data`` whose position is not listed in ``excluded``.

    Args:
        data: Samples to filter; each item is kept or dropped as a whole.
        excluded: 0-based positions in ``data`` to drop (in this notebook, the
            positions of the gray pictures). Positions outside ``data`` are
            ignored. Any iterable of hashable positions is accepted.

    Returns:
        A new list with the remaining items in their original order; ``data``
        itself is not modified.
    """
    # Membership is tested once per item, so guarantee an O(1) lookup even if
    # the caller passes a list or another non-set collection.
    if not isinstance(excluded, (set, frozenset)):
        excluded = set(excluded)
    return [item for position, item in enumerate(data) if position not in excluded]

In [None]:
#!g1.1  #noqa
# Drop the samples at the non-RGB positions found above, keeping only the rest.
colored_train_data = data_filter(train_data, gray_train)
colored_val_data = data_filter(val_data, gray_val)

In [None]:
#!g1.1  #noqa
# Both lists are written to the same file as two consecutive pickles (training
# first, then validation); a reader has to call `pickle.load` twice, in that order.
with open('filtered_data.pkl', 'wb') as file:
    for subset in (colored_train_data, colored_val_data):
        pickle.dump(subset, file)

In [None]:
#!g1.1  #noqa
def data_counter(indexes: Set, data: "ImageNetDataset", num_classes: int = 1000) -> List[int]:
    """Count, per class, how many of the samples at ``indexes`` belong to it.

    Args:
        indexes: Positions of the samples to count (in this notebook, the
            positions of the gray photos).
        data: Indexable collection; ``data[i][1]`` is read as the class index
            of sample ``i``.
        num_classes: Length of the returned list. Defaults to 1000.

    Returns:
        A list of length ``num_classes`` where element ``c`` is the number of
        selected samples whose class index is ``c``.

    Raises:
        IndexError: If a class index is ``>= num_classes``. Negative class
            indexes are not rejected; they count from the end of the list.
    """
    counts = [0] * num_classes
    for index in indexes:
        label = data[index][1]
        counts[label] += 1
    return counts

In [None]:
#!g1.1  #noqa
# Per-class number of non-RGB images in each split (list position = class index).
amount_train_gray = data_counter(gray_train, train_set)
amount_val_gray = data_counter(gray_val, val_set)

In [None]:
#!g1.1  #noqa
# Per-class tallies of the samples that survived the gray filter; the second
# field of every sample is used as the class index.
amount_train_c, amount_val_c = [0] * 1000, [0] * 1000
for tally, samples in ((amount_train_c, colored_train_data),
                       (amount_val_c, colored_val_data)):
    for _, label in samples:
        tally[label] += 1

In [None]:
#!g1.1  #noqa
# One row per class index, one column per (colored/gray, train/val) count list;
# the table is also saved to disk as CSV.
frame = pd.DataFrame(dict(
    Colored_train=amount_train_c,
    Colored_val=amount_val_c,
    Gray_train=amount_train_gray,
    Gray_val=amount_val_gray,
))
frame.to_csv('dataset_statistics.csv')

## Let's look at the charts to estimate the number of gray photos

In [None]:
#!g1.1  #noqa
fig, axis = plt.subplots(5, 2, figsize=(30, 30))

# Ten panels of 100 classes each. Panels are filled row by row (left to right,
# top to bottom) so the class ranges appear in ascending reading order.
start = 0
for i, end in enumerate(range(100, 1001, 100)):
    ax = axis[i // 2][i % 2]
    classes = range(start, end)
    # `.iloc` makes it explicit that the slice is positional.
    colored = frame.Colored_train.iloc[start:end]
    gray = frame.Gray_train.iloc[start:end]

    # NOTE(review): fixed y-range, presumably to zoom in on the differences
    # between classes; anything outside 600-1300 is clipped — confirm.
    ax.set_ylim([600, 1300])
    ax.bar(classes, colored, color=(66/255, 170/255, 255/255))
    # Gray counts are stacked on top of the colored ones.
    ax.bar(classes, gray, bottom=colored, color=(106/255, 90/255, 205/255))
    ax.legend(['Amount of colored photo', 'Amount of gray photo'])
    ax.set_xlabel('Class index')
    ax.set_ylabel('Number of photo')
    start = end
fig.suptitle('Training data volume')

plt.show()

In [None]:
#!g1.1  #noqa
fig, axis = plt.subplots(5, 2, figsize=(30, 30))

# Ten panels of 100 classes each. Panels are filled row by row (left to right,
# top to bottom) so the class ranges appear in ascending reading order.
start = 0
for i, end in enumerate(range(100, 1001, 100)):
    ax = axis[i // 2][i % 2]
    classes = range(start, end)
    # `.iloc` makes it explicit that the slice is positional.
    colored = frame.Colored_val.iloc[start:end]
    gray = frame.Gray_val.iloc[start:end]

    ax.bar(classes, colored, color=(66/255, 170/255, 255/255))
    # Gray counts are stacked on top of the colored ones.
    ax.bar(classes, gray, bottom=colored, color=(106/255, 90/255, 205/255))
    ax.legend(['Amount of colored photo', 'Amount of gray photo'])
    ax.set_xlabel('Class index')
    ax.set_ylabel('Number of photo')
    start = end
fig.suptitle('Validation data volume')

plt.show()