In [2]:
import torch
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, WeightedRandomSampler
from torchvision import transforms
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import cumfreq
import pickle
import pandas as pd
from scipy.stats import entropy
from collections import Counter

from b_weightedimagedataset import WeightedImageDataset

from sklearn.metrics.pairwise import cosine_similarity
import hdbscan


# Lookup from category name to its integer id (two entries, matching num_categories = 2 below).
CATEGORY_MAPPING = dict(
    _non_distracted=0,
    distracted=1,
)

In [3]:
# Pickled inputs for split 0 of the Kinect RGB data: features, labels, image paths (in that order)
feature_file_path, label_file_path, img_path_file_path = (
    '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b/clustering_experiments/features_split_0_kinect_rgb/all_split_0_rgb_features.pkl',
    '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b/clustering_experiments/features_split_0_kinect_rgb/all_split_0_rgb_labels.pkl',
    '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b/clustering_experiments/features_split_0_kinect_rgb/all_split_0_rgb_imagepaths.pkl',
)
# Run parameters: number of categories and DataLoader batch size
num_categories, batch_size = 2, 1024
# Directory that receives the pickled results
save_path = '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b/KL_results'

In [4]:
def load_data(feature_file_path, label_file_path, img_path_file_path):
    """Unpickle the three input files and return their contents.

    The files are read in the order features, labels, image paths.

    Args:
        feature_file_path: Path of the pickle holding the features.
        label_file_path: Path of the pickle holding the labels.
        img_path_file_path: Path of the pickle holding the image paths.

    Returns:
        Tuple ``(features, labels, img_paths)`` with whatever objects the
        pickles contain.
    """
    loaded = []
    for pickle_path in (feature_file_path, label_file_path, img_path_file_path):
        with open(pickle_path, 'rb') as handle:
            loaded.append(pickle.load(handle))
    features, labels, img_paths = loaded
    return features, labels, img_paths

In [5]:
# Unpickle the features, labels and image paths from the files configured above
# (load_data reads them in that order and returns them as a 3-tuple).
features, labels, img_paths = load_data(feature_file_path, label_file_path, img_path_file_path)

# The RGB Kinect DAA dataset (split_0) contains 259865 samples

In [6]:
len(features)

259865

# Each extracted feature vector has 1280 dimensions

In [12]:
len(features[0])

1280

In [7]:
len(labels)

259865

# The image paths are stored in 254 batches

In [8]:
len(img_paths)

254

# All batches contain 1024 images except the last batch

In [9]:
len(img_paths[0])

1024

# Last Batch Contains Only 793 Images

In [11]:
len(img_paths[253])

793

In [13]:
# Cut the precomputed features and their labels into consecutive chunks of 1024 entries
# (254 chunks for this split; each feature has 1280 values).
features_loader = [
    features[start:start + 1024]
    for start in range(0, len(features), 1024)
]
gt_labels_loader = [
    labels[start:start + 1024]
    for start in range(0, len(labels), 1024)
]

In [14]:
len(features_loader)

254

In [15]:
len(features_loader[253])

793

In [16]:
import torch
from torch.utils.data import DataLoader, WeightedRandomSampler
import matplotlib.pyplot as plt
import numpy as np
import pickle

from b_weightedimagedataset import WeightedImageDataset
from sklearn.metrics.pairwise import cosine_similarity
import hdbscan

import os
import getpass
import sys

class DataLoaderEpochsCalculation:
    """Build a cluster-weighted sampling DataLoader and count epochs until every image was drawn.

    Features are clustered chunk by chunk with HDBSCAN on cosine distances; each
    sample receives a sampling weight derived from its cluster, and the weights
    drive a ``WeightedRandomSampler`` (with replacement) over a
    ``WeightedImageDataset``.
    """

    # Length of the consecutive feature/label chunks that are clustered independently.
    # NOTE(review): the image-path pickle inspected earlier in this notebook is stored
    # in chunks of 1024 as well - presumably the two must stay aligned; confirm before changing.
    CLUSTER_CHUNK_SIZE = 1024
    # Sampling weight given to every point HDBSCAN marks as noise (Exp9 value).
    NOISE_WEIGHT = 0.001
    # Noise points are relabelled into synthetic clusters holding at most this many points.
    MAX_OUTLIER_CLUSTER_SIZE = 50

    def __init__(self, feature_file_path, label_file_path, img_path_file_path, num_categories, batch_size):
        """Load the pickled inputs, cluster them and build the weighted DataLoader.

        Args:
            feature_file_path: Pickle with the features.
            label_file_path: Pickle with the labels.
            img_path_file_path: Pickle with the image paths (a list of lists of paths).
            num_categories: Stored on the instance; not used by the methods of this class.
            batch_size: Batch size of the resulting DataLoader.
        """
        self.batch_size = batch_size
        self.num_categories = num_categories
        self.features, self.labels, self.img_paths_list = self.load_data(feature_file_path, label_file_path, img_path_file_path)
        # Flatten the per-chunk path lists into one list of all image paths.
        self.image_paths_all = [path for sublist in self.img_paths_list for path in sublist]
        self.dataloader_b, self.weights_list, self.all_labels, self.all_cluster_counts = self.initialize_dataloaders()

    def load_data(self, feature_file_path, label_file_path, img_path_file_path):
        """Unpickle and return ``(features, labels, img_paths)``.

        NOTE: ``pickle.load`` executes arbitrary code from the file; only use
        pickles produced by a trusted pipeline.
        """
        with open(feature_file_path, 'rb') as file:
            features = pickle.load(file)
        with open(label_file_path, 'rb') as file:
            labels = pickle.load(file)
        with open(img_path_file_path, 'rb') as file:
            img_paths = pickle.load(file)
        return features, labels, img_paths

    def initialize_dataloaders(self):
        """Cluster the features chunk-wise and wrap the weighted dataset in a DataLoader.

        Returns:
            Tuple ``(dataloader_b, weights_list, all_labels, all_cluster_counts)``
            where the three lists hold one entry per chunk.
        """
        chunk = self.CLUSTER_CHUNK_SIZE
        features_loader = [self.features[i:i + chunk] for i in range(0, len(self.features), chunk)]
        gt_labels_loader = [self.labels[i:i + chunk] for i in range(0, len(self.labels), chunk)]

        # Get the weights
        weights_list, all_labels, all_cluster_counts = self.process_batches(features_loader)
        dataset_b = WeightedImageDataset(self.img_paths_list, weights_list, gt_labels_loader)
        sampler_b = WeightedRandomSampler(dataset_b.weights, num_samples=len(dataset_b.weights), replacement=True)
        dataloader_b = DataLoader(dataset_b, batch_size=self.batch_size, sampler=sampler_b, num_workers=10)

        return dataloader_b, weights_list, all_labels, all_cluster_counts

    def compute_weights_cosine_dist(self, features):
        """Cluster one chunk of features and derive a sampling weight per sample.

        Args:
            features: The feature vectors of one chunk, as accepted by
                ``cosine_similarity``.

        Returns:
            Tuple ``(weights, labels, total_clusters)``:
            ``weights`` is a float array with ``1 / cluster_size`` for clustered
            points and ``NOISE_WEIGHT`` for noise points; ``labels`` is the
            HDBSCAN label array with every noise point (-1) relabelled into
            synthetic clusters of at most ``MAX_OUTLIER_CLUSTER_SIZE`` points;
            ``total_clusters`` is the number of distinct labels after relabelling.
        """
        cosine_dist_matrix = 1 - cosine_similarity(features).astype(np.float64)
        # Using Updated HDBSCAN for clustering with tuned Hyperparameters
        clusterer = hdbscan.HDBSCAN(min_cluster_size=25,
                                    min_samples=1,
                                    cluster_selection_epsilon=0.0,
                                    metric='precomputed',
                                    cluster_selection_method='eom',
                                    allow_single_cluster=False)
        labels = clusterer.fit_predict(cosine_dist_matrix)
        return self._weights_from_cluster_labels(labels)

    def _weights_from_cluster_labels(self, labels):
        """Turn a cluster-label array (-1 = noise) into ``(weights, labels, total_clusters)``."""
        noise_label = -1
        weights = np.zeros_like(labels, dtype=float)
        noise_indices = np.where(labels == noise_label)[0]

        # Real clusters: the members of a cluster share a total weight of 1.
        for label in np.unique(labels):
            if label == noise_label:
                continue
            indices = np.where(labels == label)[0]
            weights[indices] = 1.0 / len(indices)

        if noise_indices.size:
            # Synthetic labels start right after the largest real label; the k-th
            # noise point (in index order) goes to bucket k // MAX_OUTLIER_CLUSTER_SIZE.
            first_outlier_label = labels.max() + 1
            labels[noise_indices] = first_outlier_label + np.arange(noise_indices.size) // self.MAX_OUTLIER_CLUSTER_SIZE
            weights[noise_indices] = self.NOISE_WEIGHT

        # Every -1 has been relabelled above, so no label has to be discounted here
        # (subtracting 1 would under-count the clusters by one).
        total_clusters = len(np.unique(labels))

        return weights, labels, total_clusters

    def process_batches(self, dataloader):
        """Run ``compute_weights_cosine_dist`` on every chunk and collect the results.

        Returns:
            Tuple of three lists ``(all_weights, all_labels, all_cluster_counts)``
            with one entry per chunk, in input order.
        """
        all_weights = []
        all_labels = []
        all_cluster_counts = []
        for batch_features in dataloader:
            weights, labels, total_clusters = self.compute_weights_cosine_dist(batch_features)
            all_weights.append(weights)
            all_labels.append(labels)
            all_cluster_counts.append(total_clusters)

        return all_weights, all_labels, all_cluster_counts

    ##############################################################################

    def calculate_epochs_to_see_all_samples(self, dataloader, total_unique_samples, max_epochs=1000):
        """Iterate the dataloader until all distinct image paths have been drawn.

        Args:
            dataloader: Iterable yielding 4-tuples whose last element is the
                collection of image paths of the batch.
            total_unique_samples: Number of distinct paths to wait for.
            max_epochs: Safety cap; the loop stops once more than this many
                epochs were run, even if samples are still missing.

        Returns:
            Number of epochs that were run. When the cap was hit this is
            ``max_epochs + 1`` and not all samples were seen.
        """
        unique_samples_seen = set()
        unique_counts_per_epoch = []  # List to store counts after each epoch
        epochs = 0
        while len(unique_samples_seen) < total_unique_samples:
            for _, _, _, paths in dataloader:
                unique_samples_seen.update(paths)
            unique_counts_per_epoch.append(len(unique_samples_seen))  # Store the count
            print(f'After epoch {epochs}, unique samples seen: {len(unique_samples_seen)}')

            epochs += 1

            if epochs > max_epochs:  # Safety check to avoid infinite loop
                if len(unique_samples_seen) < total_unique_samples:
                    # Make the truncated run visible instead of returning silently.
                    print(f'Stopped after {epochs} epochs: only {len(unique_samples_seen)} of '
                          f'{total_unique_samples} unique samples were seen.')
                break

        # Plot the results
        self.plot_unique_samples_per_epoch(unique_counts_per_epoch, epochs)

        return epochs

    def plot_unique_samples_per_epoch(self, unique_counts_per_epoch, epochs):
        """Plot the cumulative number of unique samples against the epoch number (1-based)."""
        plt.figure(figsize=(10, 6))
        plt.plot(range(1, epochs + 1), unique_counts_per_epoch, marker='o')
        plt.title('Unique Samples Seen Over Epochs')
        plt.xlabel('Epochs')
        plt.ylabel('Cumulative Unique Samples Seen')
        plt.grid(True)
        plt.show()

    ################### Save the Results ######################################

    def save_results_to_pickle(self, data, file_name):
        """Pickle ``data`` into ``file_name`` (overwrites an existing file)."""
        with open(file_name, 'wb') as file:
            pickle.dump(data, file)

    def loop_dataloader_and_save(self, save_path):
        """Count the epochs needed to draw every image and pickle the outcome.

        Writes ``epochs_needed.pkl`` and ``weights_list_dataloader_b.pkl`` into
        ``save_path`` (created if missing).

        Returns:
            The epoch count returned by ``calculate_epochs_to_see_all_samples``.
        """
        # Calculate the number of epochs to see all unique samples
        total_unique_samples = len(set(self.image_paths_all))  # all image paths are unique
        print(f"Calculating Total Epochs needed to see all :{total_unique_samples} image samples.")
        epochs_needed = self.calculate_epochs_to_see_all_samples(self.dataloader_b, total_unique_samples)

        # Save the result; make sure the target directory exists so the (long)
        # computation above is not lost to a FileNotFoundError.
        os.makedirs(save_path, exist_ok=True)
        self.save_results_to_pickle(epochs_needed, os.path.join(save_path, 'epochs_needed.pkl'))
        self.save_results_to_pickle(self.weights_list, os.path.join(save_path, 'weights_list_dataloader_b.pkl'))

        return epochs_needed

In [None]:
# Input pickles, run parameters and output directory for the comparison below.
# save_path points at the component directory itself (no KL_results sub-directory).
(
    feature_file_path,
    label_file_path,
    img_path_file_path,
    num_categories,
    batch_size,
    save_path,
) = (
    '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b/clustering_experiments/features_split_0_kinect_rgb/all_split_0_rgb_features.pkl',
    '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b/clustering_experiments/features_split_0_kinect_rgb/all_split_0_rgb_labels.pkl',
    '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b/clustering_experiments/features_split_0_kinect_rgb/all_split_0_rgb_imagepaths.pkl',
    2,
    1024,
    '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b',
)

In [None]:
class DataLoaderComparisonKL:
    """Build a plain shuffled DataLoader (A) and a cluster-weighted DataLoader (B) over the same data.

    Loader A reads images through ``ImageFolder``; loader B samples a
    ``WeightedImageDataset`` with weights derived from per-chunk HDBSCAN
    clustering of the pickled features.
    """

    # Length of the consecutive feature/label chunks that are clustered independently.
    CLUSTER_CHUNK_SIZE = 1024
    # Image root handed to ImageFolder for loader A.
    TRAIN_ROOT = '/net/polaris/storage/deeplearning/sur_data/binary_rgb_daa/split_0/train'

    def __init__(self, feature_file_path, label_file_path, img_path_file_path, num_categories, batch_size):
        """Load the pickled inputs and build both DataLoaders.

        Args:
            feature_file_path: Pickle with the features.
            label_file_path: Pickle with the labels.
            img_path_file_path: Pickle with the image paths.
            num_categories: Stored on the instance; not used by the methods shown here.
            batch_size: Batch size of both DataLoaders.
        """
        self.batch_size = batch_size
        self.num_categories = num_categories
        self.features, self.labels, self.img_paths = self.load_data(feature_file_path, label_file_path, img_path_file_path)
        self.dataloader_a, self.dataloader_b, self.weights_list, self.pred_labels_b, self.pred_cluster_counts, self.n_clusters_b_noise = self.initialize_dataloaders()

    def load_data(self, feature_file_path, label_file_path, img_path_file_path):
        """Unpickle and return ``(features, labels, img_paths)``.

        NOTE: ``pickle.load`` executes arbitrary code from the file; only use
        pickles produced by a trusted pipeline.
        """
        with open(feature_file_path, 'rb') as file:
            features = pickle.load(file)
        with open(label_file_path, 'rb') as file:
            labels = pickle.load(file)
        with open(img_path_file_path, 'rb') as file:
            img_paths = pickle.load(file)
        return features, labels, img_paths

    def initialize_dataloaders(self):
        """Create loader A (shuffled ImageFolder) and loader B (weighted sampling).

        Also stores the per-chunk noise-point counts in ``self.n_noise_points``.

        Returns:
            Tuple ``(dataloader_a, dataloader_b, weights_list, pred_labels_b,
            pred_cluster_counts, n_clusters_b_noise)``; the four lists hold one
            entry per chunk.
        """
        # CustomImageDataset and WeightedImageDataset initialization
        transform_a = transforms.Compose([
                transforms.Resize((224, 224)),  # Resize the image to a fixed size (224x224)
                transforms.ToTensor(),          # Convert the image to a PyTorch tensor
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406],  # Normalize the image channels (mean)
                    std=[0.229, 0.224, 0.225]    # Normalize the image channels (std)
                )
            ])
        # ImageFolder's directory parameter is named `root`; `root_dir` raises a TypeError.
        dataset_a = ImageFolder(root=self.TRAIN_ROOT, transform=transform_a)
        dataloader_a = DataLoader(dataset_a, batch_size=self.batch_size, shuffle=True, num_workers=10, drop_last=False)

        # Slice the precomputed features and labels into consecutive chunks.
        chunk = self.CLUSTER_CHUNK_SIZE
        features_loader = [self.features[i:i + chunk] for i in range(0, len(self.features), chunk)]
        gt_labels_loader = [self.labels[i:i + chunk] for i in range(0, len(self.labels), chunk)]

        # Get the weights
        weights_list, pred_labels_b, pred_cluster_counts, n_clusters_b_noise, n_noise = self.process_batches(features_loader)
        # Keep the noise counts reachable; they are not part of the return tuple.
        self.n_noise_points = n_noise
        dataset_b = WeightedImageDataset(self.img_paths, weights_list, gt_labels_loader)
        sampler_b = WeightedRandomSampler(dataset_b.weights, num_samples=len(dataset_b.weights), replacement=True)
        dataloader_b = DataLoader(dataset_b, batch_size=self.batch_size, sampler=sampler_b, num_workers=10)

        return dataloader_a, dataloader_b, weights_list, pred_labels_b, pred_cluster_counts, n_clusters_b_noise

    def compute_weights_cosine_dist(self, features):
        """Cluster one chunk of features and derive a sampling weight per sample.

        Args:
            features: The feature vectors of one chunk, as accepted by
                ``cosine_similarity``.

        Returns:
            Tuple ``(weights, labels, total_clusters, num_clusters_before_noise,
            num_noise_points)``. Clustered points get ``1 / cluster_size``; every
            noise point (-1) becomes its own cluster with weight 1 and a fresh
            label above the largest HDBSCAN label. ``total_clusters`` counts the
            distinct labels after that relabelling.
        """
        cosine_dist_matrix = 1 - cosine_similarity(features).astype(np.float64)
        clusterer = hdbscan.HDBSCAN(min_cluster_size=2, metric='precomputed', cluster_selection_method='eom')
        labels = clusterer.fit_predict(cosine_dist_matrix)

        noise_label = -1
        weights = np.zeros_like(labels, dtype=float)
        unique_labels = np.unique(labels)
        cluster_labels = unique_labels[unique_labels != noise_label]
        noise_indices = np.where(labels == noise_label)[0]

        num_clusters_before_noise = len(cluster_labels)
        num_noise_points = len(noise_indices)

        # Real clusters: the members of a cluster share a total weight of 1.
        for label in cluster_labels:
            indices = np.where(labels == label)[0]
            weights[indices] = 1.0 / len(indices)

        if num_noise_points:
            # Each noise point becomes a singleton cluster: max+1, max+2, ... in index order.
            first_new_label = unique_labels.max() + 1
            labels[noise_indices] = first_new_label + np.arange(num_noise_points)
            weights[noise_indices] = 1

        total_clusters = len(np.unique(labels))
        return weights, labels, total_clusters, num_clusters_before_noise, num_noise_points


    def process_batches(self, dataloader):
        """Run ``compute_weights_cosine_dist`` on every chunk and collect the results.

        Returns:
            Tuple of five lists ``(all_weights, all_labels, all_cluster_counts,
            n_clusters_b_noise, n_noise)`` with one entry per chunk, in input order.
        """
        all_weights = []
        all_labels = []
        all_cluster_counts = []
        n_clusters_b_noise = []
        n_noise = []

        for batch_features in dataloader:
            weights, labels, total_clusters, num_clusters_before_noise, num_noise_points = self.compute_weights_cosine_dist(batch_features)
            all_weights.append(weights)
            all_labels.append(labels)
            all_cluster_counts.append(total_clusters)
            n_clusters_b_noise.append(num_clusters_before_noise)
            n_noise.append(num_noise_points)

        return all_weights, all_labels, all_cluster_counts, n_clusters_b_noise, n_noise

In [None]:


# Create an instance of DataLoaderComparisonKL and perform the comparison
# NOTE(review): the DataLoaderComparisonKL class defined in this notebook does not define
# compare_dataloaders_and_save; confirm where that method comes from before running this cell.
comparison_kl = DataLoaderComparisonKL(feature_file_path, label_file_path, img_path_file_path, num_categories, batch_size)
counts_a, counts_b, kl_divergences_a, kl_divergences_b, b_unique_images_per_batch, b_total_unique_samples, b_most_picked_per_batch, tuple_most_common_sample = comparison_kl.compare_dataloaders_and_save(save_path)


In [None]:
import torch
from torch.utils.data import DataLoader, WeightedRandomSampler
import matplotlib.pyplot as plt
import numpy as np
import pickle

from b_weightedimagedataset import WeightedImageDataset
from sklearn.metrics.pairwise import cosine_similarity
import hdbscan

import os
import getpass
import sys

def setup_ccname():
    """Export ``KRB5CCNAME`` from the per-user cache file under ``/tmp``.

    Step 1 reads a pid from ``/tmp/k5pid_<user>`` and probes it with signal 0.
    Step 2 reads ``/tmp/kccache_<user>``, takes the second ``=``-separated
    field, strips it and stores it in ``os.environ['KRB5CCNAME']``.

    If either step fails, a message is written to stderr and the interpreter
    exits with status 1 (``SystemExit``). ``KeyboardInterrupt`` is not
    intercepted.
    """
    user = getpass.getuser()
    # check if k5start is running, exit otherwise
    try:
        with open("/tmp/k5pid_" + user) as pid_file:
            pid = pid_file.read().strip()
        os.kill(int(pid), 0)  # signal 0 sends nothing; it only checks the pid
    except (OSError, ValueError, OverflowError):
        # OSError: file missing/unreadable or pid not signalable; ValueError/OverflowError: bad pid text
        sys.stderr.write("Unable to setup KRB5CCNAME!\nk5start not running!\n")
        sys.exit(1)
    try:
        with open("/tmp/kccache_" + user) as cache_file:
            ccname = cache_file.read().split("=")[1].strip()
        os.environ['KRB5CCNAME'] = ccname
    except (OSError, IndexError, ValueError):
        # IndexError: no '=' in the cache file; ValueError: value not storable in the environment
        sys.stderr.write("Unable to setup KRB5CCNAME!\nmaybe k5start not running?\n")
        sys.exit(1)

class DataLoaderEpochsCalculation:
    """Build a cluster-weighted sampling DataLoader and count epochs until every image was drawn.

    Features are clustered chunk by chunk with HDBSCAN on cosine distances; each
    sample receives a sampling weight derived from its cluster, and the weights
    drive a ``WeightedRandomSampler`` (with replacement) over a
    ``WeightedImageDataset``. ``setup_ccname()`` is called before every epoch
    of the counting loop.
    """

    # Length of the consecutive feature/label chunks that are clustered independently.
    CLUSTER_CHUNK_SIZE = 1024
    # Sampling weight given to every point HDBSCAN marks as noise.
    NOISE_WEIGHT = 0.01
    # Noise points are relabelled into synthetic clusters holding at most this many points.
    MAX_OUTLIER_CLUSTER_SIZE = 50

    def __init__(self, feature_file_path, label_file_path, img_path_file_path, num_categories, batch_size):
        """Load the pickled inputs, cluster them and build the weighted DataLoader.

        Args:
            feature_file_path: Pickle with the features.
            label_file_path: Pickle with the labels.
            img_path_file_path: Pickle with the image paths (a list of lists of paths).
            num_categories: Stored on the instance; not used by the methods of this class.
            batch_size: Batch size of the resulting DataLoader.
        """
        self.batch_size = batch_size
        self.num_categories = num_categories
        self.features, self.labels, self.img_paths_list = self.load_data(feature_file_path, label_file_path, img_path_file_path)
        # Flatten the per-chunk path lists into one list of all image paths.
        self.image_paths_all = [path for sublist in self.img_paths_list for path in sublist]
        self.dataloader_b, self.weights_list, self.all_labels, self.all_cluster_counts = self.initialize_dataloaders()

    def load_data(self, feature_file_path, label_file_path, img_path_file_path):
        """Unpickle and return ``(features, labels, img_paths)``.

        NOTE: ``pickle.load`` executes arbitrary code from the file; only use
        pickles produced by a trusted pipeline.
        """
        with open(feature_file_path, 'rb') as file:
            features = pickle.load(file)
        with open(label_file_path, 'rb') as file:
            labels = pickle.load(file)
        with open(img_path_file_path, 'rb') as file:
            img_paths = pickle.load(file)
        return features, labels, img_paths

    def initialize_dataloaders(self):
        """Cluster the features chunk-wise and wrap the weighted dataset in a DataLoader.

        Returns:
            Tuple ``(dataloader_b, weights_list, all_labels, all_cluster_counts)``
            where the three lists hold one entry per chunk.
        """
        chunk = self.CLUSTER_CHUNK_SIZE
        features_loader = [self.features[i:i + chunk] for i in range(0, len(self.features), chunk)]
        gt_labels_loader = [self.labels[i:i + chunk] for i in range(0, len(self.labels), chunk)]

        # Get the weights
        weights_list, all_labels, all_cluster_counts = self.process_batches(features_loader)
        dataset_b = WeightedImageDataset(self.img_paths_list, weights_list, gt_labels_loader)
        sampler_b = WeightedRandomSampler(dataset_b.weights, num_samples=len(dataset_b.weights), replacement=True)
        dataloader_b = DataLoader(dataset_b, batch_size=self.batch_size, sampler=sampler_b, num_workers=10)

        return dataloader_b, weights_list, all_labels, all_cluster_counts

    def compute_weights_cosine_dist(self, features):
        """Cluster one chunk of features and derive a sampling weight per sample.

        Args:
            features: The feature vectors of one chunk, as accepted by
                ``cosine_similarity``.

        Returns:
            Tuple ``(weights, labels, total_clusters)``:
            ``weights`` is a float array with ``1 / cluster_size`` for clustered
            points and ``NOISE_WEIGHT`` for noise points; ``labels`` is the
            HDBSCAN label array with every noise point (-1) relabelled into
            synthetic clusters of at most ``MAX_OUTLIER_CLUSTER_SIZE`` points;
            ``total_clusters`` is the number of distinct labels after relabelling.
        """
        cosine_dist_matrix = 1 - cosine_similarity(features).astype(np.float64)
        # Using Updated HDBSCAN for clustering with tuned Hyperparameters
        clusterer = hdbscan.HDBSCAN(min_cluster_size=25,
                                    min_samples=1,
                                    cluster_selection_epsilon=0.0,
                                    metric='precomputed',
                                    cluster_selection_method='eom',
                                    allow_single_cluster=False)
        labels = clusterer.fit_predict(cosine_dist_matrix)
        return self._weights_from_cluster_labels(labels)

    def _weights_from_cluster_labels(self, labels):
        """Turn a cluster-label array (-1 = noise) into ``(weights, labels, total_clusters)``."""
        noise_label = -1
        weights = np.zeros_like(labels, dtype=float)
        noise_indices = np.where(labels == noise_label)[0]

        # Real clusters: the members of a cluster share a total weight of 1.
        for label in np.unique(labels):
            if label == noise_label:
                continue
            indices = np.where(labels == label)[0]
            weights[indices] = 1.0 / len(indices)

        if noise_indices.size:
            # Synthetic labels start right after the largest real label; the k-th
            # noise point (in index order) goes to bucket k // MAX_OUTLIER_CLUSTER_SIZE.
            first_outlier_label = labels.max() + 1
            labels[noise_indices] = first_outlier_label + np.arange(noise_indices.size) // self.MAX_OUTLIER_CLUSTER_SIZE
            weights[noise_indices] = self.NOISE_WEIGHT

        # Every -1 has been relabelled above, so no label has to be discounted here
        # (subtracting 1 would under-count the clusters by one).
        total_clusters = len(np.unique(labels))

        return weights, labels, total_clusters

    def process_batches(self, dataloader):
        """Run ``compute_weights_cosine_dist`` on every chunk and collect the results.

        Returns:
            Tuple of three lists ``(all_weights, all_labels, all_cluster_counts)``
            with one entry per chunk, in input order.
        """
        all_weights = []
        all_labels = []
        all_cluster_counts = []
        for batch_features in dataloader:
            weights, labels, total_clusters = self.compute_weights_cosine_dist(batch_features)
            all_weights.append(weights)
            all_labels.append(labels)
            all_cluster_counts.append(total_clusters)

        return all_weights, all_labels, all_cluster_counts

    ##############################################################################

    def calculate_epochs_to_see_all_samples(self, dataloader, total_unique_samples, max_epochs=1000):
        """Iterate the dataloader until all distinct image paths have been drawn.

        ``setup_ccname()`` is called at the start of every epoch; it exits the
        interpreter when the Kerberos cache cannot be set up.

        Args:
            dataloader: Iterable yielding 4-tuples whose last element is the
                collection of image paths of the batch.
            total_unique_samples: Number of distinct paths to wait for.
            max_epochs: Safety cap; the loop stops once more than this many
                epochs were run, even if samples are still missing.

        Returns:
            Number of epochs that were run. When the cap was hit this is
            ``max_epochs + 1`` and not all samples were seen.
        """
        unique_samples_seen = set()
        unique_counts_per_epoch = []  # List to store counts after each epoch
        epochs = 0
        while len(unique_samples_seen) < total_unique_samples:
            setup_ccname()
            for _, _, _, paths in dataloader:
                unique_samples_seen.update(paths)
            unique_counts_per_epoch.append(len(unique_samples_seen))  # Store the count
            print(f'After epoch {epochs}, unique samples seen: {len(unique_samples_seen)}')

            epochs += 1

            if epochs > max_epochs:  # Safety check to avoid infinite loop
                if len(unique_samples_seen) < total_unique_samples:
                    # Make the truncated run visible instead of returning silently.
                    print(f'Stopped after {epochs} epochs: only {len(unique_samples_seen)} of '
                          f'{total_unique_samples} unique samples were seen.')
                break

        # Plot the results
        self.plot_unique_samples_per_epoch(unique_counts_per_epoch, epochs)

        return epochs

    def plot_unique_samples_per_epoch(self, unique_counts_per_epoch, epochs):
        """Plot the cumulative number of unique samples against the epoch number (1-based)."""
        plt.figure(figsize=(10, 6))
        plt.plot(range(1, epochs + 1), unique_counts_per_epoch, marker='o')
        plt.title('Unique Samples Seen Over Epochs')
        plt.xlabel('Epochs')
        plt.ylabel('Cumulative Unique Samples Seen')
        plt.grid(True)
        plt.show()

    ################### Save the Results ######################################

    def save_results_to_pickle(self, data, file_name):
        """Pickle ``data`` into ``file_name`` (overwrites an existing file)."""
        with open(file_name, 'wb') as file:
            pickle.dump(data, file)

    def loop_dataloader_and_save(self, save_path):
        """Count the epochs needed to draw every image and pickle the outcome.

        Writes ``epochs_needed.pkl`` and ``weights_list_dataloader_b.pkl`` into
        ``save_path`` (created if missing).

        Returns:
            The epoch count returned by ``calculate_epochs_to_see_all_samples``.
        """
        # Calculate the number of epochs to see all unique samples
        total_unique_samples = len(set(self.image_paths_all))  # all image paths are unique
        print(f"Calculating Total Epochs needed to see all :{total_unique_samples} image samples.")
        epochs_needed = self.calculate_epochs_to_see_all_samples(self.dataloader_b, total_unique_samples)

        # Save the result; make sure the target directory exists so the (long)
        # computation above is not lost to a FileNotFoundError.
        os.makedirs(save_path, exist_ok=True)
        self.save_results_to_pickle(epochs_needed, os.path.join(save_path, 'epochs_needed.pkl'))
        self.save_results_to_pickle(self.weights_list, os.path.join(save_path, 'weights_list_dataloader_b.pkl'))

        return epochs_needed