In [1]:
import sys

sys.path.insert(0, "..\\Scripts")

from ImageDatastore import ImageDatastore
from Utils import create_or_clear_directory, transform
from NeuralFeatureExtractor import MobileNetFeatureExtractor
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
from sklearn.cluster import DBSCAN
from scipy.spatial.distance import pdist, squareform
from scipy.stats import zscore
import numpy as np
import os
import shutil
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [2]:
train_data = ImageDatastore('train', transform=transform(256))

batch_size = 512
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

In [3]:
base_feature_path = '../Features/features/'
base_labels_path = '../Features/labels/'

In [10]:
# extractor = MobileNetFeatureExtractor()
features = np.load(base_feature_path + 'mobilenet_v3_classifier.npy')
labels = np.load(base_labels_path + 'mobilenet_v3_classifier.npy')
# features, labels = extractor.compute_features(train_dataloader)

new_features = []
new_labels = []

In [5]:
scaler = StandardScaler()

In [6]:
df_small = pd.read_csv('../Dataset/train_small.csv')

In [7]:
path_outliers = '../Results/outliers2'
create_or_clear_directory(path_outliers)

valid_image_idx = list(range(len(features)))

range_labels = len(features) // 20
outliers = []

for i in range(range_labels):
    
    current_features = features[i*20:(i+1)*20, :]

    features_scaled = scaler.fit_transform(current_features)

    distanze = pdist(features_scaled, 'euclidean')
    
    epsilon = np.median(distanze)
    min_pts = 2
    
    dbscan = DBSCAN(eps=epsilon, min_samples=min_pts)
    labels = dbscan.fit_predict(features_scaled)
    
    outlier_idx = np.where(labels == -1)[0]

    if outlier_idx.size == 0:
        outlier_idx = np.array([0])
    else:
        for idx in outlier_idx:
            img_idx = i * 20 + idx
            img_name = train_data.labels.iloc[img_idx][0]
            img_path = os.path.join(train_data.images_directory, img_name)

            df_small.loc[img_idx, 'Label'] = -1

            valid_image_idx.remove(img_idx)
            
            file_name, file_ext = os.path.splitext(img_name)
            outlier_file_name = os.path.join(path_outliers, f"{i}_class_{file_name}{file_ext}")
            
            shutil.copy2(img_path, outlier_file_name)
    outliers.extend(outlier_idx + i * 20)

  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_idx][0]
  img_name = train_data.labels.iloc[img_

In [11]:
for idx in valid_image_idx:
    new_features.append(features[idx])
    new_labels.append(labels[idx])

In [10]:
df_small_without_ouliers = df_small[df_small['Label'] != -1]

In [11]:
df_small_without_ouliers.to_csv('../Dataset/train_small_without_outliers.csv', index=False)

In [12]:
new_features = np.array(new_features)
new_labels = np.array(new_labels)

In [15]:
np.save(base_feature_path + 'mobilenet_v3_classifier_without_outliers.npy', new_features)
np.save(base_labels_path + 'mobilenet_v3_classifier_without_outliers.npy', new_labels)