# **Clean train**<br/>
**Master's Degree in Data Science (A.Y. 2023/2024)**<br/>
**University of Milano - Bicocca**<br/>

Vittorio Haardt, Luca Porcelli

In [None]:
# Mount Google Drive inside the Colab runtime; later cells read the dataset
# archives and the label CSV files from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Extract the training archive from Drive into the `train` directory
# (the loading cell below reads the images from /content/train/train_set).
!unzip "/content/drive/MyDrive/VIPM/Dataset/train_set.zip" -d train

In [None]:
import math
import operator
import os
import shutil

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import concatenate_datasets, load_dataset, load_from_disk
from datasets import DatasetDict
from datasets import Image as ImageFeature
from skimage.color import rgb2gray
from skimage.feature import local_binary_pattern
from tqdm import tqdm

# Data Import

In [None]:
# Load the extracted training images with the Hugging Face `imagefolder`
# builder. Later cells use the resulting `train` split and its `image` and
# `y` (class label) columns.
dataset = load_dataset('imagefolder', data_dir="/content/train/train_set")

Resolving data files:   0%|          | 0/118475 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

# Support Functions

In [None]:
def coOrdinateTransform(arr, width, height):
    """Return a float copy of a Lab image with the a/b channels shifted by +127.

    Channel 0 is copied unchanged. Channels 1 and 2 are shifted by 127.0 and
    then limited: any shifted value >= 254 becomes 253, any shifted value
    below 0 becomes 0 (values strictly between 253 and 254 are left as they
    are, exactly like the original per-pixel implementation).

    Parameters
    ----------
    arr : array-like
        Image indexed as ``arr[i][j][c]`` with at least ``width`` entries on
        the first axis, ``height`` on the second and 3 channels.
    width, height : int
        Extent of the FIRST and SECOND axis to process (note that, despite
        the names, ``width`` indexes axis 0).

    Returns
    -------
    numpy.ndarray
        New ``float64`` array of shape ``(width, height, 3)``; the input is
        not modified.

    Raises
    ------
    IndexError
        If ``arr`` is smaller than the requested ``(width, height, 3)`` region.
    """
    source = np.asarray(arr, dtype=np.float64)
    if (source.ndim < 3 or source.shape[0] < width
            or source.shape[1] < height or source.shape[2] < 3):
        raise IndexError("arr is smaller than the requested (width, height, 3) region")

    # Work on a private copy so the caller's array is never touched.
    Lab = source[:width, :height, :3].copy()

    # Shift both chroma channels at once, then apply the two limits in the
    # same order as the scalar code did (upper limit first, then lower).
    chroma = Lab[:, :, 1:] + 127.0
    chroma[chroma >= 254.0] = 254.0 - 1.0
    chroma[chroma < 0] = 0
    Lab[:, :, 1:] = chroma
    return Lab

def maxgrad_and_mingrad_Lab(arr,num,wid,hei):
    """Quantise the dominant colour-gradient orientation of every interior pixel.

    For each pixel that is not on the border, 3x3 Sobel-style differences are
    taken on the three channels of ``arr``, combined into the terms
    ``gxx``, ``gyy`` and ``gxy``, and the angle ``theta`` (rounded to four
    decimals) is derived from them. The gradient magnitude is evaluated at
    ``theta`` and at ``theta + pi/2``; the angle with the larger magnitude is
    converted to degrees and mapped to one of ``num`` bins.

    Parameters
    ----------
    arr : indexable as ``arr[i, j, c]`` with three channels
    num : number of orientation bins
    wid, hei : extent of the first and second axis of ``arr``

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(wid, hei)`` holding the bin index of each
        interior pixel; border pixels are left at 0.
    """
    quarter_turn = math.pi / 2.0
    ori = np.zeros(wid * hei).reshape(wid, hei)

    def channel_gradients(i, j, c):
        # Difference of the right and left columns, then of the lower and upper rows.
        across = (arr[i-1,j+1,c] + 2*arr[i,j + 1,c] + arr[i+1, j+1,c]
                  - (arr[i-1, j - 1, c] + 2 * arr[i,j-1, c] + arr[i + 1, j - 1, c]))
        down = (arr[i+1,j-1,c] + 2*arr[i+1, j, c] + arr[i+ 1,j+1,c]
                - (arr[i-1, j - 1, c] + 2 * arr[i-1,j, c] + arr[i - 1, j + 1, c]))
        return across, down

    for i in range(1, wid - 1):
        for j in range(1, hei - 1):
            rh, rv = channel_gradients(i, j, 0)
            gh, gv = channel_gradients(i, j, 1)
            bh, bv = channel_gradients(i, j, 2)

            gxx = rh * rh + gh * gh + bh * bh
            gyy = rv * rv + gv * gv + bv * bv
            gxy = rh * rv + gh * gv + bh * bv

            # The small constant keeps the denominator away from zero when gxx == gyy.
            theta = round(math.atan(2.0 * gxy / (gxx - gyy + 0.00001))/ 2.0, 4)

            G1 = math.sqrt(abs(0.5 * ((gxx + gyy) + (gxx - gyy) * math.cos(2.0 * theta) + 2.0 * gxy * math.sin(2.0 * theta))))
            G2 = math.sqrt(abs(0.5 * ((gxx + gyy) + (gxx - gyy) * math.cos(2.0 * (theta + quarter_turn)) + 2.0 * gxy * math.sin(2.0 * (theta + quarter_turn)))))

            if max(G1, G2) == G1:
                angle_deg = 90.0 + theta * 180.0 / math.pi
            else:
                angle_deg = 180.0 + (theta + quarter_turn) * 180.0 / math.pi

            # Anything that would land past the last bin is folded into it.
            ori[i, j] = min(int(angle_deg * num / 360.0), num - 1)
    return ori

def compute(ColorX,ori,Lab,wid,hei,CSA,CSB,D):
    """Accumulate colour differences into a colour + orientation histogram.

    ``Lab`` is first passed through ``coOrdinateTransform``. The image is then
    swept four times, pairing every pixel ``(i, j)`` with the neighbour at
    offset ``(0, D)``, ``(D, 0)``, ``(D, D)`` and ``(-D, D)``. For each pair the
    Euclidean distance between the two transformed Lab triples is added

    * to bin ``ColorX[i, j]`` when both pixels share the same ``ori`` value, and
    * to bin ``ori[i, j] + CSA`` when both pixels share the same ``ColorX`` value.

    Parameters
    ----------
    ColorX : 2-D array of quantised colour indices (bins ``0 .. CSA-1``)
    ori    : 2-D array of quantised orientation indices (bins ``0 .. CSB-1``)
    Lab    : image forwarded to ``coOrdinateTransform``
    wid, hei : extent of the first and second axis
    CSA, CSB : number of colour bins and of orientation bins
    D      : pixel offset between the two pixels of a pair

    Returns
    -------
    numpy.ndarray
        Vector of length ``CSA + CSB``: the accumulated sums divided by 4.
    """
    Arr = coOrdinateTransform(Lab, wid, hei)
    Matrix = np.zeros(CSA + CSB).reshape(CSA + CSB)

    def colour_gap(i, j, ni, nj):
        # Euclidean distance between the neighbour and the current pixel.
        return math.sqrt(math.pow(Arr[ni, nj, 0] - Arr[i, j, 0], 2)
                         + math.pow(Arr[ni, nj, 1] - Arr[i, j, 1], 2)
                         + math.pow(Arr[ni, nj, 2] - Arr[i, j, 2], 2))

    # (rows to visit, columns to visit, row offset, column offset), listed in
    # the order of the original passes: 0, 90, 135 and 45 degrees.
    sweeps = (
        (range(wid), range(hei - D), 0, D),
        (range(wid - D), range(hei), D, 0),
        (range(wid - D), range(hei - D), D, D),
        (range(D, wid), range(hei - D), -D, D),
    )

    for row_span, col_span, di, dj in sweeps:
        for i in row_span:
            for j in col_span:
                ni, nj = i + di, j + dj
                if ori[ni, nj] == ori[i, j]:
                    Matrix[int(ColorX[i, j])] += colour_gap(i, j, ni, nj)
                if ColorX[ni, nj] == ColorX[i, j]:
                    Matrix[int(ori[i, j] + CSA)] += colour_gap(i, j, ni, nj)

    # Average over the four sweeps.
    return Matrix / 4.0


In [None]:
def img_center(arr, dim1, dim2):
    """Return the central crop of ``arr``.

    The requested crop is ``dim1 / 1.5`` wide (axis 1) and ``dim2 / 1.5`` tall
    (axis 0); each side is capped at the corresponding size of ``arr``. The
    crop spans ``int(size / 2)`` pixels on either side of the image centre,
    so its side length is always even.

    Parameters
    ----------
    arr : numpy.ndarray with at least two axes (rows, columns, ...)
    dim1, dim2 : reference width and height before the 1.5 reduction

    Returns
    -------
    numpy.ndarray
        A slice (view) of ``arr`` around its centre.
    """
    n_rows, n_cols = arr.shape[0], arr.shape[1]

    # Never ask for more pixels than the image actually has.
    crop_cols = min(n_cols, dim1 / 1.5)
    crop_rows = min(n_rows, dim2 / 1.5)

    half_cols = int(crop_cols / 2)
    half_rows = int(crop_rows / 2)
    centre_col = int(n_cols / 2)
    centre_row = int(n_rows / 2)

    row_window = slice(centre_row - half_rows, centre_row + half_rows)
    col_window = slice(centre_col - half_cols, centre_col + half_cols)
    return arr[row_window, col_window]

In [None]:
def extractLBP(img):
    """Compute the uniform LBP map of ``img`` and its normalised histogram.

    The local binary pattern uses 24 sampling points on a circle of radius 3
    with ``method="uniform"``. The codes are counted into 26 unit-wide bins
    (edges 0..26) and the counts are divided by their sum; a tiny constant in
    the denominator avoids a division by zero.

    Returns
    -------
    tuple
        ``(lbp, hist)``: the LBP code image and the 26-element float histogram.
    """
    n_points, radius = 24, 3
    lbp = local_binary_pattern(img, n_points, radius, method="uniform")

    counts, _edges = np.histogram(lbp.ravel(), bins=np.arange(0, 27), range=(0, 26))
    counts = counts.astype("float")
    hist = counts / (counts.sum() + (1e-7))
    return lbp, hist

# Clean train

In [None]:
# Class identifiers to keep: every integer from 0 to 250 inclusive.
classi = [*range(251)]

In [None]:
# Keep only the rows whose label `y` is one of the classes listed in `classi`.
# `input_columns='y'` hands the predicate just the label value instead of the
# full example dict, so the `image` column is not needed to evaluate it.
dataset = DatasetDict({
    'train': dataset['train'].filter(lambda y: y in classi, input_columns='y')
})

In [None]:
# Distinct class labels present in the training split, in ascending order so
# that the per-class loop below (and the row order of the dataset it builds)
# is the same on every run.
labels = sorted(set(dataset['train']['y']))

In [None]:
# Outlier-based cleaning of the training split, one class at a time.
#
# For every class, each image gets two descriptors: a colour/orientation
# histogram (built with coOrdinateTransform / maxgrad_and_mingrad_Lab /
# compute) and an LBP texture histogram (extractLBP). Within the class, the
# images whose descriptor is farthest from the class median are flagged; an
# image flagged by EITHER descriptor is removed. The surviving rows of all
# classes end up in `unione`.
#
# Differences from the previous version of this cell:
#   * each image is decoded once (indexing the whole `image` column inside the
#     loop re-materialised every image of the class at every iteration);
#   * rows are picked per class and kept by INDEX via `Dataset.select`, instead
#     of filtering with a predicate and comparing PIL images for equality
#     (quadratic, and it also kept rejected images that happened to be
#     pixel-identical to a kept one);
#   * the b channel is clamped with its own bin count (it used `anum`);
#   * the quantisation is done in floating point, so `uint8 * int` can no
#     longer wrap around under NumPy 2 promotion rules;
#   * images are converted to RGB first, so grey-scale / RGBA files no longer
#     break the colour conversion.

# Quantisation of the colour space and of the edge orientation.
L_BINS = 5
A_BINS = 2
B_BINS = 2
COLOR_BINS = L_BINS * A_BINS * B_BINS
ORIENTATION_BINS = 18
NEIGHBOUR_DISTANCE = 1      # pixel offset `D` handed to compute()

PATCH_SIZE = (64, 64)       # every image is resized to this before cropping
KEEP_FRACTION = 0.75        # closest 75 % are kept; the farthest 25 % are flagged


def _load_patch(pil_image):
    """Decode one image into the RGB patch both descriptors are computed on."""
    rgb = np.array(pil_image.convert("RGB"))
    rgb = cv2.resize(rgb, PATCH_SIZE)
    return img_center(rgb, PATCH_SIZE[0], PATCH_SIZE[1])


def _color_histogram(patch):
    """Colour/orientation histogram (COLOR_BINS + ORIENTATION_BINS values) of a patch."""
    lab = cv2.cvtColor(patch, cv2.COLOR_RGB2LAB)
    n_rows, n_cols = lab.shape[:2]

    # NOTE(review): for 8-bit input OpenCV documents the Lab output as
    # L*255/100, a+128, b+128. The /100 and /254 scalings below (and the +127
    # shift in coOrdinateTransform) look like they expect unscaled Lab values.
    # They are kept as they were so the selection of images does not change;
    # confirm whether this is intended.
    lab_f = lab.astype(np.float64)
    l_bin = np.clip((lab_f[..., 0] * L_BINS / 100.0).astype(int), 0, L_BINS - 1)
    a_bin = np.clip((lab_f[..., 1] * A_BINS / 254.0).astype(int), 0, A_BINS - 1)
    b_bin = np.clip((lab_f[..., 2] * B_BINS / 254.0).astype(int), 0, B_BINS - 1)
    quantized = (B_BINS * A_BINS * l_bin + B_BINS * a_bin + b_bin).astype(np.float64)

    shifted = coOrdinateTransform(lab, n_rows, n_cols)
    orientation = maxgrad_and_mingrad_Lab(shifted, ORIENTATION_BINS, n_rows, n_cols)
    return compute(quantized, orientation, lab, n_rows, n_cols,
                   COLOR_BINS, ORIENTATION_BINS, NEIGHBOUR_DISTANCE)


def _texture_histogram(patch):
    """Normalised LBP histogram of the grey-scale version of a patch."""
    gray = cv2.cvtColor(patch, cv2.COLOR_RGB2GRAY)
    _, histogram = extractLBP(gray)
    return np.array(histogram)


def _outlier_positions(histograms):
    """Positions (0-based, within the class) of the histograms farthest from the median.

    The distance of histogram h from the element-wise median m is
    sum_j |h_j - m_j| / (|h_j + sum(h)| + |m_j + sum(m)|).
    Histograms are ranked by ascending distance (ties keep their original
    order) and everything from rank int(n * KEEP_FRACTION) onwards is returned.
    """
    values = np.asarray(histograms, dtype=np.float64)
    median_hist = np.median(values, axis=0)
    row_mass = values.sum(axis=1, keepdims=True)
    median_mass = median_hist.sum()

    distance = (np.abs(values - median_hist)
                / (np.abs(values + row_mass) + np.abs(median_hist + median_mass))).sum(axis=1)

    ranking = np.argsort(distance, kind="stable")
    first_rejected = int(len(ranking) * KEEP_FRACTION)
    return set(ranking[first_rejected:].tolist())


train_split = dataset['train']
# Read the label column once; the rows of each class are then located by index.
all_labels = np.asarray(list(train_split['y']))

kept_indices = []
for label in tqdm(labels):
    class_indices = np.flatnonzero(all_labels == label).tolist()
    if not class_indices:
        continue

    color_hists = []
    texture_hists = []
    for row_index in tqdm(class_indices, desc=f"class {label}", leave=False):
        patch = _load_patch(train_split[row_index]['image'])
        color_hists.append(_color_histogram(patch))
        texture_hists.append(_texture_histogram(patch))

    # An image is dropped when it is an outlier for colour OR for texture.
    rejected = _outlier_positions(color_hists) | _outlier_positions(texture_hists)
    kept_indices.extend(
        row_index
        for position, row_index in enumerate(class_indices)
        if position not in rejected
    )

# Cleaned training set: surviving rows, grouped by class in `labels` order and
# in their original order within each class.
unione = train_split.select(kept_indices)

  0%|          | 0/1 [00:00<?, ?it/s]

Filter:   0%|          | 0/549 [00:00<?, ? examples/s]

100%|██████████| 549/549 [07:15<00:00,  1.26it/s]
100%|██████████| 549/549 [05:53<00:00,  1.55it/s]


Filter:   0%|          | 0/549 [00:00<?, ? examples/s]

100%|██████████| 1/1 [13:20<00:00, 800.80s/it]


In [None]:
# Persist the cleaned training dataset (`unione`) under /content/train/Union
# so it can be reloaded later without repeating the filtering above.
unione.save_to_disk("/content/train/Union")

Saving the dataset (0/1 shards):   0%|          | 0/323 [00:00<?, ? examples/s]

# Data preparation for models

## Train

Create a new folder containing only the training images that survived the cleaning step.

In [None]:
source_folder = "/content/train/train_set"
destination_folder = "/content/train/train_set_filtered"

# Iterating over a Dataset yields one dict per row, not a file name, so the
# names are recovered from the image column instead: with decoding disabled
# every entry is a dict holding the source `path` of the image.
# NOTE(review): assumes each row still references its original file (expected
# for an in-memory `imagefolder` dataset) - confirm if `unione` is ever
# reloaded from disk before this cell runs.
cleaned_file_names = [
    os.path.basename(image_ref["path"])
    for image_ref in unione.cast_column("image", ImageFeature(decode=False))["image"]
]

# shutil.copy2 does not create missing directories.
os.makedirs(destination_folder, exist_ok=True)

for file_name in cleaned_file_names:
    source_file_path = os.path.join(source_folder, file_name)
    destination_file_path = os.path.join(destination_folder, file_name)

    # Check if the file exists before copying
    if os.path.exists(source_file_path):
        shutil.copy2(source_file_path, destination_file_path)
    else:
        print(f"File not found: {file_name}")

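Build the label CSV for the cleaned training set: keep only the rows of the original label file whose image survived the cleaning step.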

In [None]:
df = pd.read_csv("/content/drive/MyDrive/VIPM/Label/train_info_dirty.csv")

# `isin` needs the file names of the kept images, not the Dataset itself.
# They are read from the image column with decoding disabled, where every
# entry is a dict holding the source `path` of the image.
# NOTE(review): assumes each row still references its original file and that
# `file_name` in the CSV is a bare file name - confirm.
cleaned_name_set = {
    os.path.basename(image_ref["path"])
    for image_ref in unione.cast_column("image", ImageFeature(decode=False))["image"]
}

# .copy() makes the selection an independent frame, so adding a column does
# not trigger pandas' SettingWithCopyWarning.
filtered_df = df[df['file_name'].isin(cleaned_name_set)].copy()
filtered_df['nome'] = filtered_df['file_name']
filtered_df.to_csv("/content/train/label.csv", index=False)

Create the folder by placing the images in subfolders indicating the classes they belong to

In [None]:
source_folder = "/content/train/train_set"
destination_folder = "/content/train/train_filtered_nest"

# Copy every cleaned training image into <destination>/<class label>/.
for file_name, class_label in zip(filtered_df['file_name'], filtered_df['y']):
    classe_folder = str(class_label)

    # One sub-folder per class, created on first use.
    class_folder_path = os.path.join(destination_folder, classe_folder)
    os.makedirs(class_folder_path, exist_ok=True)

    source_file_path = os.path.join(source_folder, file_name)
    destination_file_path = os.path.join(class_folder_path, file_name)

    # Report (and skip) label rows whose image is missing on disk.
    if not os.path.exists(source_file_path):
        print(f"File not found: {file_name}")
        continue
    shutil.copy2(source_file_path, destination_file_path)

## Test

In [None]:
# Extract the validation archive from Drive into the `val_set` directory.
!unzip "/content/drive/MyDrive/VIPM/Dataset/val_set.zip" -d val_set

In [None]:
# Extract the degraded validation archive from Drive into the `val_set_degraded` directory.
!unzip "/content/drive/MyDrive/VIPM/Dataset/val_set_degraded.zip" -d val_set_degraded

Add a `nome` column (a copy of `file_name`) to the validation labels and save the result as `metadata.csv` for both validation sets.

In [None]:
# Validation labels: the CSV is read with explicit column names, then the
# file name is duplicated into a `nome` column.
df = pd.read_csv("/content/drive/MyDrive/VIPM/Label/val_info.csv", names=['file_name','y'])
df = df.assign(nome=df['file_name'])

# The same table is written next to both validation image sets.
for metadata_path in ("/content/val_set/metadata.csv",
                      "/content/val_set_degraded/metadata.csv"):
    df.to_csv(metadata_path, index=False)

Create the folder by placing the images in subfolders indicating the classes they belong to

### Clean test

In [None]:
source_folder = "/content/val_set/val_set"
destination_folder = "/content/val_set/val_set_nest"

# Copy every validation image into <destination>/<class label>/.
for file_name, class_label in zip(df['file_name'], df['y']):
    classe_folder = str(class_label)

    # One sub-folder per class, created on first use.
    class_folder_path = os.path.join(destination_folder, classe_folder)
    os.makedirs(class_folder_path, exist_ok=True)

    source_file_path = os.path.join(source_folder, file_name)
    destination_file_path = os.path.join(class_folder_path, file_name)

    # Report (and skip) label rows whose image is missing on disk.
    if not os.path.exists(source_file_path):
        print(f"File not found: {file_name}")
        continue
    shutil.copy2(source_file_path, destination_file_path)

### Test degraded

In [None]:
source_folder = "/content/val_set_degraded/val_set_degraded"
destination_folder = "/content/val_set_degraded/val_set_degraded_nest"

# Copy every degraded validation image into <destination>/<class label>/.
for file_name, class_label in zip(df['file_name'], df['y']):
    classe_folder = str(class_label)

    # One sub-folder per class, created on first use.
    class_folder_path = os.path.join(destination_folder, classe_folder)
    os.makedirs(class_folder_path, exist_ok=True)

    source_file_path = os.path.join(source_folder, file_name)
    destination_file_path = os.path.join(class_folder_path, file_name)

    # Report (and skip) label rows whose image is missing on disk.
    if not os.path.exists(source_file_path):
        print(f"File not found: {file_name}")
        continue
    shutil.copy2(source_file_path, destination_file_path)


**Reference**

- CDH: [CBIR-Using-CDH](https://github.com/AdityaShaha/CBIR-Using-CDH)
- Texture: [Texture-Shape-And-Color-Extraction](https://github.com/faoezanf/Texture-Shape-And-Color-Extraction/blob/master/Texture%20Feature%20Extraction%20Using%20LBP.ipynb)