In [6]:
import os
from pathlib import Path
import numpy as np

In [7]:
data_folder_path = Path('/home/pshulzhenko/prim2024_depth_compression/proj/data/')

Original train folder was delabelled (folder structure deleted) and renamed to unlabelled folder

In [18]:
unlabelled_files = os.listdir(data_folder_path / 'unlabelled')
print(f'Number of unlabelled images: {len(unlabelled_files)}')

original_valid_fire_files = os.listdir(data_folder_path / 'orig_valid/wildfire/')
print(f'Number of original valid fire images: {len(original_valid_fire_files)}')
original_valid_no_fire_files = os.listdir(data_folder_path / 'orig_valid/nowildfire/')
print(f'Number of original valid no-fire images: {len(original_valid_no_fire_files)}')

test_fire_files = os.listdir(data_folder_path / 'test/wildfire/')
print(f'Number of original test fire images: {len(test_fire_files)}')
test_no_fire_files = os.listdir(data_folder_path / 'test/nowildfire/')
print(f'Number of original test no-fire images: {len(test_no_fire_files)}')

Number of unlabelled images: 30250
Number of original valid fire images: 3480
Number of original valid no-fire images: 2820
Number of original test fire images: 3480
Number of original test no-fire images: 2820


In [None]:
# DELETE corrupted data/test/nowildfire/-113.997195,51.155464.jpg
#  data/test/nowildfire/-73.589232,45.461726.jpg
# data/test/nowildfire/-122.760985,49.132847.jpg
# data/test/nowildfire/-122.754494,49.228093.jpg
# data/test/nowildfire/-79.560356,43.643023.jpg

### Let's split original valid set into train/val. 

Val will have 15% of original_valid images. Keep proportion of fire/no-fire unchanged

In [5]:
rng = np.random.default_rng(seed=42)

In [6]:
val_fire_indices = rng.choice(
    list(range(len(original_valid_fire_files))), 
    size=int(0.15 * len(original_valid_fire_files)), 
    replace=False
)
val_no_fire_indices = rng.choice(
    list(range(len(original_valid_no_fire_files))), 
    size=int(0.15 * len(original_valid_no_fire_files)), 
    replace=False
)
train_fire_indices = [i for i in range(len(original_valid_fire_files)) if i not in val_fire_indices]
train_no_fire_indices = [i for i in range(len(original_valid_no_fire_files)) if i not in val_no_fire_indices]

len(train_fire_indices), len(train_no_fire_indices), len(val_fire_indices), len(val_no_fire_indices)

(2958, 2397, 522, 423)

In [7]:
import shutil

destination_folder = data_folder_path / 'val/wildfire'
os.makedirs(destination_folder, exist_ok=True)
for i in val_fire_indices:
    file_name = original_valid_fire_files[i]
    source_path = data_folder_path / 'orig_valid/wildfire' / file_name
    destination_path = destination_folder / file_name
    
    if os.path.exists(source_path):
        shutil.copy2(source_path, destination_path)
        print(f'Copied: {file_name}')
    else:
        print(f'File not found: {file_name}')

destination_folder = data_folder_path / 'val/nowildfire'
os.makedirs(destination_folder, exist_ok=True)
for i in val_no_fire_indices:
    file_name = original_valid_no_fire_files[i]
    source_path = data_folder_path / 'orig_valid/nowildfire' / file_name
    destination_path = destination_folder / file_name
    
    if os.path.exists(source_path):
        shutil.copy2(source_path, destination_path)
        print(f'Copied: {file_name}')
    else:
        print(f'File not found: {file_name}')

# TRAIN


destination_folder = data_folder_path / 'train/wildfire'
os.makedirs(destination_folder, exist_ok=True)
for i in train_fire_indices:
    file_name = original_valid_fire_files[i]
    source_path = data_folder_path / 'orig_valid/wildfire' / file_name
    destination_path = destination_folder / file_name
    
    if os.path.exists(source_path):
        shutil.copy2(source_path, destination_path)
        print(f'Copied: {file_name}')
    else:
        print(f'File not found: {file_name}')

destination_folder = data_folder_path / 'train/nowildfire'
os.makedirs(destination_folder, exist_ok=True)
for i in train_no_fire_indices:
    file_name = original_valid_no_fire_files[i]
    source_path = data_folder_path / 'orig_valid/nowildfire' / file_name
    destination_path = destination_folder / file_name
    
    if os.path.exists(source_path):
        shutil.copy2(source_path, destination_path)
        print(f'Copied: {file_name}')
    else:
        print(f'File not found: {file_name}')

Copied: -75.7893,47.6247.jpg
Copied: -70.04671,48.47153.jpg
Copied: -70.73924,48.02068.jpg
Copied: -70.0404,49.6715.jpg
Copied: -67.2507,47.9425.jpg
Copied: -74.60877,45.65258.jpg
Copied: -72.65985,48.69365.jpg
Copied: -69.276,48.79227.jpg
Copied: -76.87399,45.93825.jpg
Copied: -60.65964,50.24311.jpg
Copied: -69.78687,47.35042.jpg
Copied: -76.17378,46.81835.jpg
Copied: -75.872,45.6595.jpg
Copied: -66.55373,48.05666.jpg
Copied: -77.53457,53.83423.jpg
Copied: -78.9817,46.7906.jpg
Copied: -66.09926,49.17206.jpg
Copied: -74.4115,45.84143.jpg
Copied: -72.38135,50.45983.jpg
Copied: -77.96159,49.04047.jpg
Copied: -79.0523,48.19724.jpg
Copied: -76.05216,46.18465.jpg
Copied: -74.0956,45.2581.jpg
Copied: -74.19649,47.7545.jpg
Copied: -69.8788,48.158.jpg
Copied: -78.77205,53.01079.jpg
Copied: -78.44086,52.53867.jpg
Copied: -79.02927,48.29227.jpg
Copied: -74.80601,46.26709.jpg
Copied: -78.13948,48.58783.jpg
Copied: -73.7228,46.0513.jpg
Copied: -74.9469,48.6608.jpg
Copied: -64.60509,48.90888.jpg
Co

In [8]:
val_fire_files = os.listdir(data_folder_path / 'val/wildfire/')
print(f'Number of val fire images: {len(val_fire_files)}')
val_no_fire_files = os.listdir(data_folder_path / 'val/nowildfire/')
print(f'Number of val no-fire images: {len(val_no_fire_files)}')

train_fire_files = os.listdir(data_folder_path / 'train/wildfire/')
print(f'Number of train fire images: {len(train_fire_files)}')
train_no_fire_files = os.listdir(data_folder_path / 'train/nowildfire/')
print(f'Number of train no-fire images: {len(train_no_fire_files)}')

Number of val fire images: 522
Number of val no-fire images: 423
Number of train fire images: 2958
Number of train no-fire images: 2397


### Image lvl stats

In [6]:
from PIL import Image
import numpy as np

image_path = data_folder_path / 'unlabelled' / unlabelled_files[0]
image = Image.open(image_path)
image_array = np.array(image)
print(image_array.shape)

NameError: name 'unlabelled_files' is not defined

Calculate means on train set:

In [None]:
from tqdm import tqdm 
from PIL import Image
import numpy as np

means_r, means_g, means_b = [], [], []
count_r, count_g, count_b = [], [], []

for split_path in ['train/wildfire', 'train/nowildfire']:
    if split_path == 'train/wildfire':
        file_names = train_fire_files
    else:
        file_names = train_no_fire_files

    for file_name in tqdm(file_names):
        image_path = data_folder_path / split_path / file_name
        image = Image.open(image_path)
        image_array = np.array(image)
        means_r.append(image_array[:, :, 0].mean())
        means_g.append(image_array[:, :, 1].mean())
        means_b.append(image_array[:, :, 2].mean())
        count_r.append(image_array[:, :, 0].size)
        count_g.append(image_array[:, :, 1].size)
        count_b.append(image_array[:, :, 2].size)

for i, (ms, cs) in enumerate([(means_r, count_r), (means_g, count_g), (means_b, count_b)]):
    global_count = np.sum(cs)
    global_mean = 0
    for m, c in zip(ms, cs):
        global_mean += m * (c / global_count)
    
    print(f'Channel {i} mean = {global_mean}')

100%|██████████| 2958/2958 [00:05<00:00, 497.48it/s]
100%|██████████| 2397/2397 [00:04<00:00, 505.64it/s]

Channel 0 mean = 75.46934495703044
Channel 1 mean = 88.53264077440481
Channel 2 mean = 64.13317987614084





In [9]:
from tqdm import tqdm 
from PIL import Image
import numpy as np

mean_r, mean_g, mean_b = 75.46934495703044, 88.53264077440481, 64.13317987614084

vars_r, vars_g, vars_b = [], [], []
count_r, count_g, count_b = [], [], []

for split_path in ['train/wildfire', 'train/nowildfire']:
    if split_path == 'train/wildfire':
        file_names = train_fire_files
    else:
        file_names = train_no_fire_files

    for file_name in tqdm(file_names):
        image_path = data_folder_path / split_path / file_name
        try:
            image = Image.open(image_path)
            image_array = np.array(image)
        except OSError as e:
            print(f"Error opening image: {file_name}")
            continue

        vars_r.append(np.mean(abs(image_array[:, :, 0] - mean_r)**2))
        vars_g.append(np.mean(abs(image_array[:, :, 1] - mean_g)**2))
        vars_b.append(np.mean(abs(image_array[:, :, 2] - mean_b)**2))
        count_r.append(image_array[:, :, 0].size)
        count_g.append(image_array[:, :, 1].size)
        count_b.append(image_array[:, :, 2].size)

for i, (vs, cs) in enumerate([(vars_r, count_r), (vars_g, count_g), (vars_b, count_b)]):
    global_count = np.sum(cs)
    global_var = 0
    for v, c in zip(vs, cs):
        global_var += v * (c / global_count)

    global_std = np.sqrt(global_var)
    
    print(f'Channel {i} std = {global_std}')

100%|██████████| 2958/2958 [00:13<00:00, 211.75it/s]
100%|██████████| 2397/2397 [00:11<00:00, 217.29it/s]

Channel 0 std = 49.21239737079363
Channel 1 std = 41.20182281347267
Channel 2 std = 42.39381611518958





In [10]:
# Train dataset stats:
means = np.array([75.46934495703044, 88.53264077440481, 64.13317987614084])
stds = np.array([49.21239737079363, 41.20182281347267, 42.39381611518958])

print('Means: ', np.round(means / 255., 3))
print('Stds: ', np.round(stds / 255., 3))

Means:  [0.296 0.347 0.252]
Stds:  [0.193 0.162 0.166]


Calculate means on unlabelled set:

In [20]:
from tqdm import tqdm 
from PIL import Image
import numpy as np

means_r, means_g, means_b = [], [], []
count_r, count_g, count_b = [], [], []

for file_name in tqdm(unlabelled_files):
    image_path = data_folder_path / 'unlabelled' / file_name
    try:
        image = Image.open(image_path)
        image_array = np.array(image)
    except OSError as e:
        print(f"Error opening image: {file_name}")
        continue
    means_r.append(image_array[:, :, 0].mean())
    means_g.append(image_array[:, :, 1].mean())
    means_b.append(image_array[:, :, 2].mean())
    count_r.append(image_array[:, :, 0].size)
    count_g.append(image_array[:, :, 1].size)
    count_b.append(image_array[:, :, 2].size)

for i, (ms, cs) in enumerate([(means_r, count_r), (means_g, count_g), (means_b, count_b)]):
    global_count = np.sum(cs)
    global_mean = 0
    for m, c in zip(ms, cs):
        global_mean += m * (c / global_count)
    
    print(f'Channel {i} mean = {global_mean}')

 78%|███████▊  | 23627/30250 [00:46<00:14, 471.51it/s]

Error opening image: -114.152378,51.027198.jpg


100%|██████████| 30250/30250 [01:11<00:00, 422.38it/s]


Channel 0 mean = 75.69275777900538
Channel 1 mean = 88.83299182634428
Channel 2 mean = 64.5060034581001


In [None]:
# Accurate enough
c / global_count

np.float64(3.305894409732553e-05)

In [23]:
#  Remove corrupted image

os.remove(data_folder_path / 'unlabelled' / '-114.152378,51.027198.jpg')

Calculate stds:

In [24]:
from tqdm import tqdm 
from PIL import Image
import numpy as np

mean_r, mean_g, mean_b = 75.69275777900538, 88.83299182634428, 64.5060034581001

vars_r, vars_g, vars_b = [], [], []
count_r, count_g, count_b = [], [], []

for file_name in tqdm(unlabelled_files):
    image_path = data_folder_path / 'unlabelled' / file_name
    try:
        image = Image.open(image_path)
        image_array = np.array(image)
    except OSError as e:
        print(f"Error opening image: {file_name}")
        continue

    vars_r.append(np.mean(abs(image_array[:, :, 0] - mean_r)**2))
    vars_g.append(np.mean(abs(image_array[:, :, 1] - mean_g)**2))
    vars_b.append(np.mean(abs(image_array[:, :, 2] - mean_b)**2))
    count_r.append(image_array[:, :, 0].size)
    count_g.append(image_array[:, :, 1].size)
    count_b.append(image_array[:, :, 2].size)

for i, (vs, cs) in enumerate([(vars_r, count_r), (vars_g, count_g), (vars_b, count_b)]):
    global_count = np.sum(cs)
    global_var = 0
    for v, c in zip(vs, cs):
        global_var += v * (c / global_count)

    global_std = np.sqrt(global_var)
    
    print(f'Channel {i} std = {global_std}')

 78%|███████▊  | 23623/30250 [01:53<00:30, 220.49it/s]

Error opening image: -114.152378,51.027198.jpg


100%|██████████| 30250/30250 [02:24<00:00, 208.68it/s]


Channel 0 std = 49.69764568077921
Channel 1 std = 41.67910325403557
Channel 2 std = 42.814960561762504


In [26]:
# Dataset stats:
means = np.array([75.69275777900538, 88.83299182634428, 64.5060034581001])
stds = np.array([49.69764568077921, 41.67910325403557, 42.814960561762504])

print('Means: ', np.round(means / 255., 3))
print('Stds: ', np.round(stds / 255., 3))

Means:  [0.297 0.348 0.253]
Stds:  [0.195 0.163 0.168]


Rather import difference compared to Imagenet: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]