In [1]:
import torch
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, WeightedRandomSampler
from torchvision import transforms
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import cumfreq
import pickle
import pandas as pd
from scipy.stats import entropy
from collections import Counter

from b_weightedimagedataset import WeightedImageDataset

from sklearn.metrics.pairwise import cosine_similarity
import hdbscan


# Lookup from each category name to its integer label.
CATEGORY_MAPPING = {
    '_non_distracted': 0,
    'distracted': 1,
}

In [2]:
# ---- Run parameters -------------------------------------------------------
num_categories = 2
batch_size = 1024

# ---- Input pickles (split 0, Kinect RGB): features, labels, image paths ----
feature_file_path = '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b/clustering_experiments/features_split_0_kinect_rgb/all_split_0_rgb_features.pkl'
label_file_path = '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b/clustering_experiments/features_split_0_kinect_rgb/all_split_0_rgb_labels.pkl'
img_path_file_path = '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b/clustering_experiments/features_split_0_kinect_rgb/all_split_0_rgb_imagepaths.pkl'

# ---- Output location -------------------------------------------------------
save_path = '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b/KL_results'

In [3]:
def load_data(feature_file_path, label_file_path, img_path_file_path):
    """Unpickle the features, labels and image paths from their three files.

    Args:
        feature_file_path: Path of the pickle holding the features.
        label_file_path: Path of the pickle holding the labels.
        img_path_file_path: Path of the pickle holding the image paths.

    Returns:
        A ``(features, labels, img_paths)`` tuple with the three unpickled
        objects, in that order.
    """
    unpickled = []
    # The files are read one after another, each closed before the next opens.
    for pickle_path in (feature_file_path, label_file_path, img_path_file_path):
        with open(pickle_path, 'rb') as handle:
            unpickled.append(pickle.load(handle))
    features, labels, img_paths = unpickled
    return features, labels, img_paths

In [4]:
# Read the precomputed features, the labels and the batched image paths from disk.
features, labels, img_paths = load_data(
    feature_file_path,
    label_file_path,
    img_path_file_path,
)

# The size of RGB Kinect DAA dataset (split_0) is 259865

In [5]:
len(features)

259865

# Each extracted feature vector has 1280 dimensions

In [6]:
len(features[0])

1280

In [7]:
len(labels)

259865

# The image paths are stored pre-batched: there are 254 batches

In [8]:
len(img_paths)

254

# All batches contain 1024 images except last batch

In [9]:
len(img_paths[0])

1024

# Last Batch Contains Only 793 Images

In [10]:
# Index from the end so this always inspects the final batch, whatever the batch count is.
len(img_paths[-1])

793

In [11]:
# Re-chunk the flat, precomputed feature and label sequences into consecutive batches.
# With this split that yields 254 batches; each feature vector has 1280 entries.
FEATURE_CHUNK_SIZE = 1024  # single source of truth for the chunk length used below
features_loader = [
    features[start:start + FEATURE_CHUNK_SIZE]
    for start in range(0, len(features), FEATURE_CHUNK_SIZE)
]
gt_labels_loader = [
    labels[start:start + FEATURE_CHUNK_SIZE]
    for start in range(0, len(labels), FEATURE_CHUNK_SIZE)
]

In [12]:
len(features_loader)

254

In [13]:
# Index from the end so this always inspects the final batch, whatever the batch count is.
len(features_loader[-1])

793

# Experiment : Outliers weight = 0.02

In [14]:
import torch
from torch.utils.data import DataLoader, WeightedRandomSampler
import matplotlib.pyplot as plt
import numpy as np
import pickle

from b_weightedimagedataset import WeightedImageDataset
from sklearn.metrics.pairwise import cosine_similarity
import hdbscan

import os
import getpass
import sys

def setup_ccname():
    """Export ``KRB5CCNAME`` from the per-user files written under ``/tmp``.

    Steps:
      1. Read a PID from ``/tmp/k5pid_<user>`` and probe it with signal 0 to
         check that the process (k5start) is still alive.
      2. Read ``/tmp/kccache_<user>``, take the second ``=``-separated field,
         strip it, and store it in ``os.environ['KRB5CCNAME']``.

    Raises:
        SystemExit: with status 1, after writing a message to stderr, when
            either file cannot be read or parsed or the PID probe fails.
            Other exceptions (e.g. ``KeyboardInterrupt``) propagate unchanged.
    """
    user = getpass.getuser()

    # Check if k5start is running, exit otherwise.
    try:
        with open("/tmp/k5pid_" + user) as pid_file:
            pid = int(pid_file.read().strip())
        # Signal 0 delivers nothing; the call only fails if the PID cannot be signalled.
        os.kill(pid, 0)
    except (OSError, ValueError, OverflowError):
        # OSError: file missing/unreadable or process not signallable;
        # ValueError/OverflowError: file content is not a usable PID.
        sys.stderr.write("Unable to setup KRB5CCNAME!\nk5start not running!\n")
        sys.exit(1)

    try:
        with open("/tmp/kccache_" + user) as cache_file:
            ccname = cache_file.read().split("=")[1].strip()
        os.environ['KRB5CCNAME'] = ccname
    except (OSError, IndexError, ValueError):
        # IndexError: the file contains no "=" at all.
        sys.stderr.write("Unable to setup KRB5CCNAME!\nmaybe k5start not running?\n")
        sys.exit(1)

class DataLoaderEpochsCalculation:
    """Build a cluster-weighted sampling loader and count the epochs it needs to cover the data.

    On construction the instance:
      1. unpickles features, labels and the pre-batched image-path list;
      2. re-chunks features/labels and clusters every feature chunk with HDBSCAN
         on a cosine-distance matrix;
      3. converts cluster membership into per-sample sampling weights;
      4. wraps everything in a ``WeightedImageDataset`` driven by a
         ``WeightedRandomSampler`` (sampling with replacement).

    ``loop_dataloader_and_save`` then iterates that loader until every image
    path has been drawn at least once and pickles the result.

    Attributes set in ``__init__``: ``batch_size``, ``num_categories`` (stored,
    not used by the methods of this class), ``features``, ``labels``,
    ``img_paths_list``, ``image_paths_all``, ``dataloader_b``, ``weights_list``,
    ``all_labels``, ``all_cluster_counts``.
    """

    # Label the clusterer gives to points it treats as noise.
    NOISE_LABEL = -1
    # Chunk length used to re-batch features/labels before clustering.
    # NOTE(review): presumably must match the batch length of the pickled
    # image-path list (1024 in this notebook) — confirm before changing.
    feature_chunk_size = 1024
    # Sampling weight given to every noise point.
    outlier_weight = 0.02
    # Maximum number of noise points grouped under one synthetic outlier label.
    outlier_cluster_size = 50

    def __init__(self, feature_file_path, label_file_path, img_path_file_path, num_categories, batch_size,
                 outlier_weight=None, outlier_cluster_size=None):
        """Load the pickles and build the weighted dataloader.

        Args:
            feature_file_path: Pickle file with the features.
            label_file_path: Pickle file with the labels.
            img_path_file_path: Pickle file with the batched image paths.
            num_categories: Stored on the instance.
            batch_size: Batch size passed to the ``DataLoader``.
            outlier_weight: Optional override of the class default (0.02).
            outlier_cluster_size: Optional override of the class default (50).

        Raises:
            ValueError: if ``outlier_cluster_size`` is given and is below 1.
        """
        if outlier_weight is not None:
            self.outlier_weight = outlier_weight
        if outlier_cluster_size is not None:
            if outlier_cluster_size < 1:
                raise ValueError("outlier_cluster_size must be a positive integer")
            self.outlier_cluster_size = outlier_cluster_size

        self.batch_size = batch_size
        self.num_categories = num_categories
        self.features, self.labels, self.img_paths_list = self.load_data(
            feature_file_path, label_file_path, img_path_file_path)
        # Flatten the list of path batches into one list of paths.
        self.image_paths_all = [path for sublist in self.img_paths_list for path in sublist]
        (self.dataloader_b, self.weights_list,
         self.all_labels, self.all_cluster_counts) = self.initialize_dataloaders()

    def load_data(self, feature_file_path, label_file_path, img_path_file_path):
        """Unpickle and return ``(features, labels, img_paths)``.

        SECURITY: ``pickle.load`` can execute arbitrary code embedded in the
        file; only point this at files you produced yourself.
        """
        loaded = []
        for pickle_path in (feature_file_path, label_file_path, img_path_file_path):
            with open(pickle_path, 'rb') as file:
                loaded.append(pickle.load(file))
        features, labels, img_paths = loaded
        return features, labels, img_paths

    def initialize_dataloaders(self):
        """Cluster all feature chunks and build the weighted dataloader.

        Returns:
            ``(dataloader_b, weights_list, all_labels, all_cluster_counts)``;
            the last three are per-chunk lists from ``process_batches``.
        """
        chunk = self.feature_chunk_size
        features_loader = [self.features[i:i + chunk] for i in range(0, len(self.features), chunk)]
        gt_labels_loader = [self.labels[i:i + chunk] for i in range(0, len(self.labels), chunk)]

        # Per-chunk sampling weights derived from the clustering.
        weights_list, all_labels, all_cluster_counts = self.process_batches(features_loader)
        dataset_b = WeightedImageDataset(self.img_paths_list, weights_list, gt_labels_loader)
        sampler_b = WeightedRandomSampler(dataset_b.weights, num_samples=len(dataset_b.weights), replacement=True)
        dataloader_b = DataLoader(dataset_b, batch_size=self.batch_size, sampler=sampler_b, num_workers=10)

        return dataloader_b, weights_list, all_labels, all_cluster_counts

    def compute_weights_cosine_dist(self, features):
        """Cluster one chunk of features and derive a sampling weight per sample.

        Weighting scheme:
          * each member of a real cluster gets ``1 / cluster_size``, so every
            real cluster carries a total weight of 1;
          * each noise point gets ``self.outlier_weight`` and is relabelled into
            synthetic outlier clusters of at most ``self.outlier_cluster_size``
            points, numbered upward from ``max(real label) + 1``.

        Args:
            features: The feature vectors of one chunk (input to
                ``cosine_similarity``).

        Returns:
            ``(weights, labels, total_clusters)`` where ``weights`` is a float
            array, ``labels`` is the relabelled cluster-id array (no noise label
            left) and ``total_clusters`` is the number of distinct labels, i.e.
            real clusters plus synthetic outlier clusters.
        """
        cosine_dist_matrix = 1 - cosine_similarity(features).astype(np.float64)
        clusterer = hdbscan.HDBSCAN(min_cluster_size=25,
                                    min_samples=1,
                                    cluster_selection_epsilon=0.0,
                                    metric='precomputed',
                                    cluster_selection_method='eom',
                                    allow_single_cluster=False)
        labels = clusterer.fit_predict(cosine_dist_matrix)
        weights = np.zeros_like(labels, dtype=float)

        noise_mask = labels == self.NOISE_LABEL

        # Real clusters: spread a total weight of 1 evenly over the members.
        for cluster_id in np.unique(labels[~noise_mask]):
            members = labels == cluster_id
            weights[members] = 1.0 / np.count_nonzero(members)

        # Noise points: fixed weight, relabelled in consecutive fixed-size groups.
        noise_indices = np.flatnonzero(noise_mask)
        first_outlier_label = labels.max() + 1  # taken before any relabelling
        labels[noise_indices] = first_outlier_label + np.arange(noise_indices.size) // self.outlier_cluster_size
        weights[noise_indices] = self.outlier_weight

        # Every noise point has been relabelled above, so the noise label no
        # longer occurs in `labels`; subtracting 1 here would undercount.
        total_clusters = len(np.unique(labels))

        return weights, labels, total_clusters

    def process_batches(self, dataloader):
        """Run ``compute_weights_cosine_dist`` on every chunk of ``dataloader``.

        Returns:
            ``(all_weights, all_labels, all_cluster_counts)``, three lists with
            one entry per chunk.
        """
        all_weights = []
        all_labels = []
        all_cluster_counts = []
        for batch_features in dataloader:
            weights, labels, total_clusters = self.compute_weights_cosine_dist(batch_features)
            all_weights.append(weights)
            all_labels.append(labels)
            all_cluster_counts.append(total_clusters)

        return all_weights, all_labels, all_cluster_counts

    ##############################################################################

    def calculate_epochs_to_see_all_samples(self, dataloader, total_unique_samples, max_epochs=1000):
        """Iterate ``dataloader`` until ``total_unique_samples`` distinct paths were drawn.

        Args:
            dataloader: Iterable yielding 4-tuples whose last item is an
                iterable of image paths.
            total_unique_samples: Number of distinct paths to reach.
            max_epochs: Safety cap; the loop stops once the epoch counter
                exceeds it, even if coverage is incomplete.

        Returns:
            The number of completed epochs. The cumulative coverage per epoch
            is printed and plotted as a side effect.
        """
        unique_samples_seen = set()
        unique_counts_per_epoch = []  # cumulative distinct-path count after each epoch
        epochs = 0
        while len(unique_samples_seen) < total_unique_samples:
            # Re-run the credential-cache setup before every pass over the loader.
            setup_ccname()
            for _, _, _, paths in dataloader:
                unique_samples_seen.update(paths)
            unique_counts_per_epoch.append(len(unique_samples_seen))
            print(f'After epoch {epochs}, unique samples seen: {len(unique_samples_seen)}')

            epochs += 1

            if epochs > max_epochs:  # Safety check to avoid an infinite loop
                break

        self.plot_unique_samples_per_epoch(unique_counts_per_epoch, epochs)

        return epochs

    def plot_unique_samples_per_epoch(self, unique_counts_per_epoch, epochs):
        """Plot the cumulative number of distinct samples seen against the epoch number (1-based)."""
        plt.figure(figsize=(10, 6))
        plt.plot(range(1, epochs + 1), unique_counts_per_epoch, marker='o')
        plt.title('Unique Samples Seen Over Epochs')
        plt.xlabel('Epochs')
        plt.ylabel('Cumulative Unique Samples Seen')
        plt.grid(True)
        plt.show()

    ################### Save the Results ######################################

    def save_results_to_pickle(self, data, file_name):
        """Pickle ``data`` to ``file_name`` (overwriting any existing file)."""
        with open(file_name, 'wb') as file:
            pickle.dump(data, file)

    def loop_dataloader_and_save(self, save_path):
        """Measure the epochs needed for full coverage and pickle the results.

        Writes ``weights_list_dataloader_b.pkl`` and ``epochs_needed.pkl`` into
        ``save_path`` (created if missing).

        Returns:
            The epoch count from ``calculate_epochs_to_see_all_samples``.
        """
        os.makedirs(save_path, exist_ok=True)

        # The weights already exist; persist them first so an interrupted
        # (potentially very long) epoch loop does not lose them.
        self.save_results_to_pickle(self.weights_list, os.path.join(save_path, 'weights_list_dataloader_b.pkl'))

        total_unique_samples = len(set(self.image_paths_all))
        print(f"Calculating Total Epochs needed to see all :{total_unique_samples} image samples.")
        epochs_needed = self.calculate_epochs_to_see_all_samples(self.dataloader_b, total_unique_samples)

        self.save_results_to_pickle(epochs_needed, os.path.join(save_path, 'epochs_needed.pkl'))

        return epochs_needed

In [15]:
# ---- Run parameters -------------------------------------------------------
num_categories = 2
batch_size = 1024

# ---- Input pickles (split 0, Kinect RGB): features, labels, image paths ----
feature_file_path = '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b/clustering_experiments/features_split_0_kinect_rgb/all_split_0_rgb_features.pkl'
label_file_path = '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b/clustering_experiments/features_split_0_kinect_rgb/all_split_0_rgb_labels.pkl'
img_path_file_path = '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b/clustering_experiments/features_split_0_kinect_rgb/all_split_0_rgb_imagepaths.pkl'

# ---- Output location -------------------------------------------------------
# NOTE(review): the earlier configuration cell in this notebook sets save_path to a
# 'KL_results' sub-directory; this cell points at the component directory itself —
# confirm which one is intended.
save_path = '/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/components/distraction_detection_d_b'

In [16]:
# Build the weighted dataloader for this experiment, then count the epochs
# required until every image path has been sampled at least once.
epoch_dataloader_b = DataLoaderEpochsCalculation(
    feature_file_path,
    label_file_path,
    img_path_file_path,
    num_categories,
    batch_size,
)
epochs_needed_b = epoch_dataloader_b.loop_dataloader_and_save(save_path)
print("Epochs needed to see all samples:", epochs_needed_b)

Calculating Total Epochs needed to see all :259865 image samples.
After epoch 0, unique samples seen: 158865
After epoch 1, unique samples seen: 216403
After epoch 2, unique samples seen: 239527
After epoch 3, unique samples seen: 249328
After epoch 4, unique samples seen: 253882
After epoch 5, unique samples seen: 256088
After epoch 6, unique samples seen: 257227
After epoch 7, unique samples seen: 257906
After epoch 8, unique samples seen: 258319
After epoch 9, unique samples seen: 258584
After epoch 10, unique samples seen: 258764
After epoch 11, unique samples seen: 258915
After epoch 12, unique samples seen: 259022
After epoch 13, unique samples seen: 259091
After epoch 14, unique samples seen: 259171
After epoch 15, unique samples seen: 259248
After epoch 16, unique samples seen: 259293
After epoch 17, unique samples seen: 259347
After epoch 18, unique samples seen: 259385
After epoch 19, unique samples seen: 259415
After epoch 20, unique samples seen: 259445
After epoch 21, uniq

KeyboardInterrupt: 

Calculating Total Epochs needed to see all :259865 image samples.
After epoch 0, unique samples seen: 158865
After epoch 1, unique samples seen: 216403
After epoch 2, unique samples seen: 239527
After epoch 3, unique samples seen: 249328
After epoch 4, unique samples seen: 253882
After epoch 5, unique samples seen: 256088
After epoch 6, unique samples seen: 257227
After epoch 7, unique samples seen: 257906
After epoch 8, unique samples seen: 258319
After epoch 9, unique samples seen: 258584
After epoch 10, unique samples seen: 258764
After epoch 11, unique samples seen: 258915
After epoch 12, unique samples seen: 259022
After epoch 13, unique samples seen: 259091
After epoch 14, unique samples seen: 259171
After epoch 15, unique samples seen: 259248
After epoch 16, unique samples seen: 259293
After epoch 17, unique samples seen: 259347
After epoch 18, unique samples seen: 259385
After epoch 19, unique samples seen: 259415
After epoch 20, unique samples seen: 259445
After epoch 21, unique samples seen: 259470
After epoch 22, unique samples seen: 259492
After epoch 23, unique samples seen: 259517
After epoch 24, unique samples seen: 259535
After epoch 25, unique samples seen: 259562
After epoch 26, unique samples seen: 259587
After epoch 27, unique samples seen: 259603
After epoch 28, unique samples seen: 259628
After epoch 29, unique samples seen: 259638
After epoch 30, unique samples seen: 259648
After epoch 31, unique samples seen: 259663
After epoch 32, unique samples seen: 259673
After epoch 33, unique samples seen: 259683
After epoch 34, unique samples seen: 259691
After epoch 35, unique samples seen: 259704
After epoch 36, unique samples seen: 259715
After epoch 37, unique samples seen: 259726
After epoch 38, unique samples seen: 259739
After epoch 39, unique samples seen: 259746
After epoch 40, unique samples seen: 259752
After epoch 41, unique samples seen: 259765
After epoch 42, unique samples seen: 259769
After epoch 43, unique samples seen: 259780
After epoch 44, unique samples seen: 259783
After epoch 45, unique samples seen: 259791
After epoch 46, unique samples seen: 259797
After epoch 47, unique samples seen: 259802
After epoch 48, unique samples seen: 259805
After epoch 49, unique samples seen: 259808
After epoch 50, unique samples seen: 259809
After epoch 51, unique samples seen: 259812
After epoch 52, unique samples seen: 259812
After epoch 53, unique samples seen: 259817
After epoch 54, unique samples seen: 259824
After epoch 55, unique samples seen: 259828
After epoch 56, unique samples seen: 259831
After epoch 57, unique samples seen: 259834
After epoch 58, unique samples seen: 259837
After epoch 59, unique samples seen: 259837
After epoch 60, unique samples seen: 259838
After epoch 61, unique samples seen: 259840
After epoch 62, unique samples seen: 259843
After epoch 63, unique samples seen: 259844
After epoch 64, unique samples seen: 259845
After epoch 65, unique samples seen: 259846
After epoch 66, unique samples seen: 259850
After epoch 67, unique samples seen: 259851
