## Input data: Model outputs
This data comes from scratch/XXXXXXX/Vis/output/

It contains a folder for each skill (e.g., M2, M4, etc). Each folder contains multiple .npz files that represents the sessions. ``<video_id>``-features.npz files contain the features extracted from each video (session). 
Features contain the following keys: ``['fps', 'window_limit', 'omnivore', 'slowfast', 'avion', 'objects_conf', 'objects_bbox', 'hls_l_avg', 'window_medoid']``

## Generate a reduced version of the data. Taking into account 10 sessions per skill

To extract all .npz file names from our manually created output folder and use them for the files_to_copy variable, the following Python script recursively search through the output folder, collect all .npz file names, and print or save them to a list.

In [2]:
import os

def extract_npz_files(source_dir):
    """
    Extracts all .npz file names from the source_dir and its subdirectories.
    
    :param source_dir: Path to the source directory containing all skills.
    :return: List of .npz file names.
    """
    npz_files = []
    
    for root, dirs, files in os.walk(source_dir):
        for file in files:
            if file.endswith('.npz'):
                # Store the relative path to maintain folder structure
                relative_path = os.path.relpath(os.path.join(root, file), source_dir)
                npz_files.append(relative_path)
    
    return npz_files

# Example usage
source_directory = '/Users/soniacq/PTG/BBN_data/updated_models/output_sample'  # Replace with your actual source directory
npz_files = extract_npz_files(source_directory)
print(npz_files)
# Print the list of .npz files
# for npz_file in npz_files:
#     print(npz_file)

['M1/M1-4-features.npz', 'M1/M1-3-features.npz', 'M1/M1-2-features.npz', 'M1/M1-5-features.npz', 'M1/M1-8-features.npz', 'M1/assessment_2-features.npz', 'M1/assessment_1-features.npz', 'M1/M1-7-features.npz', 'M1/M1-1-features.npz', 'M1/M1-6-features.npz', 'M5/M5-8-features.npz', 'M5/20230712_100327_HoloLens-features.npz', 'M5/M5-3-features.npz', 'M5/M5-4-features.npz', 'M5/M5-5-features.npz', 'M5/M5-2-features.npz', 'M5/20230712_100156_HoloLens-features.npz', 'M5/M5-7-features.npz', 'M5/M5-6-features.npz', 'M5/M5-1-features.npz', 'R19/R19-5-features.npz', 'R19/R19-2-features.npz', 'R19/R19-10-features.npz', 'R19/R19-3-features.npz', 'R19/R19-4-features.npz', 'R19/R19-9-features.npz', 'R19/R19-8-features.npz', 'R19/R19-6-features.npz', 'R19/R19-1-features.npz', 'R19/R19-7-features.npz', 'M2/3_tourns_2-features.npz', 'M2/20230403_094350_HoloLens-features.npz', 'M2/M2-4-features.npz', 'M2/M2-3-features.npz', 'M2/M2-2-features.npz', 'M2/M2-5-features.npz', 'M2/20230403_094735_HoloLens-fea

#### Generating reduced versions
This script will maintain the folder structure and ensure that each .npz file is copied to the appropriate location in the target directory.

In [34]:
import os
import shutil

def copy_npz_files_with_structure(source_dir, target_dir, file_patterns):
    """
    Copy specific .npz files from source_dir to target_dir while preserving the directory structure.
    
    :param source_dir: Path to the source directory containing all skills.
    :param target_dir: Path to the target directory where files will be copied.
    :param file_patterns: List of file patterns to match (e.g., ['M1/M1-1-features.npz', 'M1/M1-2-features.npz']).
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    for file_pattern in file_patterns:
        # Construct source and destination paths
        src_file = os.path.join(source_dir, file_pattern)
        if os.path.exists(src_file):
            # Create the target subdirectory if it doesn't exist
            target_file_path = os.path.join(target_dir, file_pattern)
            target_file_dir = os.path.dirname(target_file_path)
            if not os.path.exists(target_file_dir):
                os.makedirs(target_file_dir)
            
            # Copy the file
            shutil.copy(src_file, target_file_path)
            print(f'Copied {src_file} to {target_file_path}')
        else:
            print(f'File {src_file} does not exist.')

# Example usage
source_directory_window_1s = '/Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_1s'
target_directory_window_1s = '/Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_1s_sample'
files_to_copy = npz_files  # Your list of npz files with paths

copy_npz_files_with_structure(source_directory_window_1s, target_directory_window_1s, files_to_copy)

# Example usage
source_directory_window_2s = '/Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_2s'
target_directory_window_2s = '/Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_2s_sample'
files_to_copy = npz_files  # Your list of npz files with paths

copy_npz_files_with_structure(source_directory_window_2s, target_directory_window_2s, files_to_copy)

# Example usage
source_directory_window_4s = '/Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_4s'
target_directory_window_4s = '/Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_4s_sample'
files_to_copy = npz_files  # Your list of npz files with paths

copy_npz_files_with_structure(source_directory_window_4s, target_directory_window_4s, files_to_copy)

Copied /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_1s/M1/M1-4-features.npz to /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_1s_sample/M1/M1-4-features.npz
Copied /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_1s/M1/M1-3-features.npz to /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_1s_sample/M1/M1-3-features.npz
Copied /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_1s/M1/M1-2-features.npz to /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_1s_sample/M1/M1-2-features.npz
Copied /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_1s/M1/M1-5-features.npz to /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_1s_sample/M1/M1-5-features.npz
Copied /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_1s/M1/M1-8-fe

### Testing

In [60]:
import numpy as np
import math

user_path = target_directory_window_4s + "/R19/"
# scp -r scqXXX@greene.hpc.nyu.edu:/scratch/XXXXXXX/data/BBN/features/M2/aug_img_perc0.8_NEW-YOLO_weighted_loss .
path_original_data = user_path # + 'BBN_data/RNN/aug_img_perc0.8_NEW-YOLO_weighted_loss/'
# Load the .npy file with allow_pickle=True
features_M2_1 = np.load(path_original_data + 'R19-10-features.npz', allow_pickle=True)

# Load the .npy file with allow_pickle=True
# labels_M2_1 = np.load(path_original_data + '3_tourns_2-window_label.npz', allow_pickle=True)

In [61]:
# Get the keys and convert them to a list
keys = list(features_M2_1.keys())
# Print the keys
print(keys)
# # Get the keys and convert them to a list
# keys_labels = list(labels_M2_1.keys())
# # Print the keys
# print(keys_labels)

['fps', 'window_limit', 'window_medoid', 'omnivore', 'slowfast', 'avion', 'objects_conf', 'objects_bbox', 'label', 'label_desc', 'hls_l_avg', 'objects_class']



We should have the following data (12 in total): ['fps', 'window_limit', 'omnivore', 'slowfast', 'avion', 'objects_conf', 'objects_bbox', 'hls_l_avg', 'window_medoid', 'objects_class', 'label', 'label_desc']

In [5]:
temp = ['No Step'] * features_M2_1['window_medoid'].shape[0] if features_M2_1['label'].shape[0] == 0 else features_M2_1['label']
max(np.unique(temp))

6

In [62]:
features_M2_1['window_medoid']

array([   1,   27,   70,   72,  148,  274,  281,  403,  403,  471,  571,
        607,  611,  738,  738,  826,  899,  952,  991, 1057, 1154, 1180,
       1264])

In [7]:
omnivore_features = features_M2_1['omnivore']
print(features_M2_1['omnivore'].shape)
print(features_M2_1['slowfast'].shape)
print(features_M2_1['avion'].shape)

# array_labels = labels_M2_1['label_desc']
# print(len(array_labels))

(51, 1024)
(51, 1600)
(51, 3806)


In [8]:
window_limit_data = features_M2_1['window_limit']
n_window_frames = window_limit_data.shape[0]-1
last_IDframe = window_limit_data[n_window_frames][1]
video_duration = round(last_IDframe/features_M2_1['fps'].item()) # Use round() to round a number to the nearest integer
print("n_window_frames:", n_window_frames)
print("last_IDframe:", last_IDframe)
print("video_duration:", video_duration)


n_window_frames: 50
last_IDframe: 1495
video_duration: 50


In [9]:
window_limit_data.shape[0]

51

# Data Generation

In [105]:
# path_raw_data = target_directory_window_1s
# path_raw_data = target_directory_window_2s
path_raw_data = target_directory_window_4s

## Generate session metadata

In [53]:
import os
import numpy as np
import json

def compute_video_metadata(npz_file_path):
    features = np.load(npz_file_path, allow_pickle=True)
    window_limit_data = features['window_limit']
    n_window_frames = window_limit_data.shape[0] - 1
    last_IDframe = window_limit_data[n_window_frames][1]
    label = [0] * features['window_medoid'].shape[0] if features['label'].shape[0] == 0 else features['label']
    label_desc = ['No step'] * features['window_medoid'].shape[0] if features['label_desc'].shape[0] == 0 else features['label_desc']
    fps = features['fps'].item()
    duration_seconds = round(last_IDframe / fps)
    
    
    return {
        "duration_seconds": duration_seconds,
        "numberof_window_frames": n_window_frames,
        "last_IDframe": int(last_IDframe),
        "total_labels": int(len(np.unique(label))),
        "max_id_labels": int(max(np.unique(label))),
        "unique_labels": str(np.unique(label_desc)),
        "unique_id_labels": str(np.unique(label))
    }

def generate_json(directory):
    data = {}
    
    for skill_name in os.listdir(directory):
        skill_path = os.path.join(directory, skill_name)
        if os.path.isdir(skill_path):
            skill_data = {}
            for file_name in os.listdir(skill_path):
                if file_name.endswith('-features.npz'):
                    video_id = file_name.split('-features.npz')[0]
                    file_path = os.path.join(skill_path, file_name)
                    video_metadata = compute_video_metadata(file_path)
                    skill_data[video_id] = video_metadata
            if skill_data:
                data[skill_name] = skill_data

    return data

def save_json(data, output_file):
    with open(output_file, 'w') as json_file:
        json.dump(data, json_file, indent=4)

# Example usage in a Jupyter notebook
directory = path_raw_data  # Replace with the path to the directory containing the skill folders
output_file = path_raw_data + "/sessions_metadata.json"            # Replace with the desired output file path
result = generate_json(directory)
save_json(result, output_file)
print("JSON file generated successfully.")


JSON file generated successfully.


## Generate data for each window_frame for each corresponging session

In [54]:
import os
import numpy as np
import json


def generate_detailed_json(directory):
    data = []
    
    for skill_name in os.listdir(directory):
        skill_path = os.path.join(directory, skill_name)
        if os.path.isdir(skill_path):
            for file_name in os.listdir(skill_path):
                if file_name.endswith('-features.npz'):
                    video_id = file_name.split('-features.npz')[0]
                    file_path = os.path.join(skill_path, file_name)
                    features = np.load(file_path, allow_pickle=True)
                    
                    keys = list(features.keys())
                    # Print the keys
                    hls_l_avg = features['hls_l_avg']
                    label = [0] * features['window_medoid'].shape[0] if features['label'].shape[0] == 0 else features['label']
                    label_desc = ['No step'] * features['window_medoid'].shape[0] if features['label_desc'].shape[0] == 0 else features['label_desc']
                    objects_bbox = features['objects_bbox']
                    objects_conf = features['objects_conf']
                    frame_id = features['window_medoid']
                    
                    trial_data = []
                    
                    for i in range(len(hls_l_avg)):
                        window_data = {
                            "seconds": i,
                            "hls_lightness_avg": float(hls_l_avg[i]),
                            "step": float(label[i]),
                            "step_desc": label_desc[i],
                            "objects_bbox": objects_bbox[i].tolist(),
                            "objects_conf": objects_conf[i].tolist(),
                            # "objects_conf_avg": float(np.mean(objects_conf[i]))
                            "objects_conf_avg": float(np.mean(objects_conf[i])) if len(objects_conf[i]) > 0 else -1,
                            "frame_id": str(frame_id[i])
                        }
                        trial_data.append(window_data)
                    
                    subject_data = {
                        "skill_id": skill_name,
                        "session_id": video_id,
                        "data": trial_data
                    }
                    
                    data.append(subject_data)
    
    return data

def save_json(data, output_file):
    with open(output_file, 'w') as json_file:
        json.dump(data, json_file, indent=4)

# Example usage in a Jupyter notebook
directory = path_raw_data  # Replace with the path to the directory containing the skill folders
output_file = path_raw_data + "/sessions_window_frame_data.json"            # Replace with the desired output file path
result = generate_detailed_json(directory)
save_json(result, output_file)
print("JSON file generated successfully.")


JSON file generated successfully.


### Save t-SNE resutls

#### tsne position for all sessions,l using PCA, tSNE, and UMAP


In [106]:
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap  # Import UMAP

def load_features(file_path):
    features = np.load(file_path, allow_pickle=True)
    omnivore = features['omnivore']
    slowfast = features['slowfast']
    avion = features['avion']
    window_medoid = features['window_medoid']
    objects_class = features['objects_class']
    objects_conf = features['objects_conf']
    step_label = [0] * features['window_medoid'].shape[0] if features['label'].shape[0] == 0 else features['label']
    step_label_desc = ['No step'] * features['window_medoid'].shape[0] if features['label_desc'].shape[0] == 0 else features['label_desc']
    
    return omnivore, slowfast, avion, window_medoid, objects_class, objects_conf, step_label, step_label_desc

def apply_pca(features, metadata):
    pca = PCA(n_components=2, random_state=42)
    pca_results = pca.fit_transform(features)
    return format_data(pca_results, metadata)

def apply_tsne(features, metadata):
    tsne = TSNE(n_components=2, random_state=42)
    tsne_results = tsne.fit_transform(features)
    return format_data(tsne_results, metadata)

def apply_umap(features, metadata):
    umap_model = umap.UMAP(n_components=2, random_state=42)
    umap_results = umap_model.fit_transform(features)
    return format_data(umap_results, metadata)

def format_data(results, metadata):
    data = []
    for i, (x, y) in enumerate(results):
        data.append({
            "id": metadata[i]['id'],
            "session_id": metadata[i]['session_id'],
            "frame_id": metadata[i]['frame_id'],
            "objects_class": metadata[i]['objects_class'],
            "objects_conf": metadata[i]['objects_conf'],
            "step_label": metadata[i]['step_label'],
            "step_label_desc": metadata[i]['step_label_desc'],
            "x": float(x),  # Convert to native Python float
            "y": float(y),  # Convert to native Python float
            "method": metadata[i]['method'],
            "session": metadata[i]['session'],
            "skill": metadata[i]['skill']
        })
    return data

def generate_csv(directory, pca_output_file, tsne_output_file, umap_output_file):
    all_omnivore_features = []
    all_slowfast_features = []
    all_avion_features = []
    
    omnivore_labels = []
    slowfast_labels = []
    avion_labels = []
    
    for skill_name in os.listdir(directory):
        skill_path = os.path.join(directory, skill_name)
        if os.path.isdir(skill_path):
            for file_name in os.listdir(skill_path):
                if file_name.endswith('-features.npz'):
                    video_id = file_name.split('-features.npz')[0]
                    file_path = os.path.join(skill_path, file_name)
                    omnivore, slowfast, avion, window_medoid, objects_class, objects_conf, step_label, step_label_desc = load_features(file_path)
                    
                    session_length = len(omnivore) + len(slowfast) + len(avion)
                    current_session_id = 0
                    for i in range(len(omnivore)):
                        omnivore_labels.append({"id": len(omnivore_labels), "session_id": current_session_id, "frame_id": window_medoid[i], "objects_class": objects_class[i], "objects_conf": objects_conf[i], "step_label": step_label[i], "step_label_desc": step_label_desc[i], "method": 'omnivore', "session": video_id, "skill": skill_name})
                        current_session_id += 1
                    current_session_id = 0
                    for i in range(len(slowfast)):
                        slowfast_labels.append({"id": len(slowfast_labels), "session_id": current_session_id, "frame_id": window_medoid[i], "objects_class": objects_class[i], "objects_conf": objects_conf[i], "step_label": step_label[i], "step_label_desc": step_label_desc[i], "method": 'slowfast', "session": video_id, "skill": skill_name})
                        current_session_id += 1
                    current_session_id = 0
                    for i in range(len(avion)):
                        avion_labels.append({"id": len(avion_labels), "session_id": current_session_id, "frame_id": window_medoid[i], "objects_class": objects_class[i], "objects_conf": objects_conf[i], "step_label": step_label[i], "step_label_desc": step_label_desc[i], "method": 'avion', "session": video_id, "skill": skill_name})
                        current_session_id += 1
                    
                    all_omnivore_features.append(omnivore)
                    all_slowfast_features.append(slowfast)
                    all_avion_features.append(avion)
    
    # Concatenate all features
    all_omnivore_features = np.concatenate(all_omnivore_features)
    all_slowfast_features = np.concatenate(all_slowfast_features)
    all_avion_features = np.concatenate(all_avion_features)
    
    # Apply PCA, t-SNE, and UMAP on the concatenated features
    omnivore_pca_data = apply_pca(all_omnivore_features, omnivore_labels)
    slowfast_pca_data = apply_pca(all_slowfast_features, slowfast_labels)
    avion_pca_data = apply_pca(all_avion_features, avion_labels)
    
    omnivore_tsne_data = apply_tsne(all_omnivore_features, omnivore_labels)
    slowfast_tsne_data = apply_tsne(all_slowfast_features, slowfast_labels)
    avion_tsne_data = apply_tsne(all_avion_features, avion_labels)
    
    omnivore_umap_data = apply_umap(all_omnivore_features, omnivore_labels)
    slowfast_umap_data = apply_umap(all_slowfast_features, slowfast_labels)
    avion_umap_data = apply_umap(all_avion_features, avion_labels)
    
    # Combine all PCA data
    all_pca_data = omnivore_pca_data + slowfast_pca_data + avion_pca_data
    
    # Combine all t-SNE data
    all_tsne_data = omnivore_tsne_data + slowfast_tsne_data + avion_tsne_data
    
    # Combine all UMAP data
    all_umap_data = omnivore_umap_data + slowfast_umap_data + avion_umap_data
    
    # Save to CSV
    pca_df = pd.DataFrame(all_pca_data)
    pca_df.to_csv(pca_output_file, index=False)
    print(f"PCA CSV file generated successfully: {pca_output_file}")
    
    tsne_df = pd.DataFrame(all_tsne_data)
    tsne_df.to_csv(tsne_output_file, index=False)
    print(f"t-SNE CSV file generated successfully: {tsne_output_file}")
    
    umap_df = pd.DataFrame(all_umap_data)
    umap_df.to_csv(umap_output_file, index=False)
    print(f"UMAP CSV file generated successfully: {umap_output_file}")
    
    return all_pca_data, all_tsne_data, all_umap_data

# Example usage in a Jupyter notebook
directory = path_raw_data  # Replace with the path to the directory containing the skill folders
pca_output_file = os.path.join(directory, "all_sessions_pca_results_10p.csv")
tsne_output_file = os.path.join(directory, "all_sessions_tsne_results_10p.csv")
umap_output_file = os.path.join(directory, "all_sessions_umap_results_10p.csv")
all_pca_data, all_tsne_data, all_umap_data = generate_csv(directory, pca_output_file, tsne_output_file, umap_output_file)


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


PCA CSV file generated successfully: /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_4s_sample/all_sessions_pca_results_10p.csv
t-SNE CSV file generated successfully: /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_4s_sample/all_sessions_tsne_results_10p.csv
UMAP CSV file generated successfully: /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/output/window_4s_sample/all_sessions_umap_results_10p.csv


## Compute Similarity scores

#### Using kmeans (2 clusters) to calculate silhouette score for each skill (individual class)

In [56]:
import numpy as np
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans
import json

def compute_silhouette_scores(data):
    print("Hello")
    # Initialize results dictionary
    results = {}
    
    # Get unique methods
    methods = set(d['method'] for d in data)
    
    for method in methods:
        # Filter data based on the method
        method_data = [d for d in data if d['method'] == method]
        
        # Extract x, y positions and skill labels
        positions = np.array([(d['x'], d['y']) for d in method_data])
        labels = np.array([d['skill'] for d in method_data])
        print(labels)
        # Ensure there are at least two unique labels
        if len(set(labels)) > 1:
            # Compute overall silhouette score and boxplot statistics
            silhouette_values = silhouette_samples(positions, labels, metric='euclidean')
            average_score = silhouette_score(positions, labels, metric='euclidean')
            boxplot_stats = calculate_boxplot_stats(silhouette_values)
            
            # Initialize method results
            method_results = {
                'average_score': average_score,
                'boxplot': boxplot_stats,
                'skills': {}
            }
            
            # Calculate silhouette score for each skill (individual class)
            skills = set(labels)
            for skill in skills:
                # Filter data for the specific skill
                skill_data = np.array([pos for pos, lbl in zip(positions, labels) if lbl == skill])
                
                # Apply KMeans clustering to get cluster labels for skill data
                if len(skill_data) > 1:
                    kmeans = KMeans(n_clusters=2, random_state=0).fit(skill_data)
                    skill_labels = kmeans.labels_
                    
                    # Compute silhouette score for skill data
                    skill_silhouette_values = silhouette_samples(skill_data, skill_labels, metric='euclidean')
                    skill_average_score = silhouette_score(skill_data, skill_labels, metric='euclidean')
                    skill_boxplot_stats = calculate_boxplot_stats(skill_silhouette_values)
                    
                    # Store skill results
                    method_results['skills'][skill] = {
                        'average_score': skill_average_score,
                        'boxplot': skill_boxplot_stats
                    }
            
            # Store method results in the main results dictionary
            results[method] = method_results
    
    # Save results to a JSON file
    silhoutte_path = path_raw_data + "/silhouette_scores.json"
    with open(silhoutte_path, 'w') as outfile:
        json.dump(results, outfile, indent=4)
    
    return results

def calculate_boxplot_stats(data):
    # Sort data
    sorted_data = np.sort(data)
    
    # Calculate quartiles
    q1 = np.percentile(sorted_data, 25)
    median = np.percentile(sorted_data, 50)
    q3 = np.percentile(sorted_data, 75)
    min_value = sorted_data[0]
    max_value = sorted_data[-1]
    
    return {
        'min': min_value,
        'q1': q1,
        'median': median,
        'q3': q3,
        'max': max_value
    }

# Compute silhouette scores and save to JSON
results = compute_silhouette_scores(all_data)

results


Hello
['M1' 'M1' 'M1' ... 'R16' 'R16' 'R16']
['M1' 'M1' 'M1' ... 'R16' 'R16' 'R16']
['M1' 'M1' 'M1' ... 'R16' 'R16' 'R16']


{'omnivore': {'average_score': 0.11586185767699353,
  'boxplot': {'min': -0.7089620427280873,
   'q1': -0.18062965983748563,
   'median': 0.16584202113233062,
   'q3': 0.45000095982587845,
   'max': 0.6578222000579298},
  'skills': {'M5': {'average_score': 0.7183624096198824,
    'boxplot': {'min': 0.1808708342673149,
     'q1': 0.652535803013684,
     'median': 0.7806542520518555,
     'q3': 0.8085438433133809,
     'max': 0.8496638791762264}},
   'M2': {'average_score': 0.8524349428474006,
    'boxplot': {'min': 0.07206482223398054,
     'q1': 0.8662045823149115,
     'median': 0.8803178960688576,
     'q3': 0.8936254411955391,
     'max': 0.9189645979096883}},
   'R16': {'average_score': 0.8191038524435319,
    'boxplot': {'min': 0.6505884331575554,
     'q1': 0.7920248275323194,
     'median': 0.8275653362709775,
     'q3': 0.8499536880385701,
     'max': 0.910048818677284}},
   'M1': {'average_score': 0.7273887179438435,
    'boxplot': {'min': 0.29954747558489037,
     'q1': 0.659

#### Using Density clustering (dbscan) to calculate silhouette score for each skill (individual class)[USED on PAPER]

In [107]:
import numpy as np
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN
import json

def compute_silhouette_scores(data, method_name):
    # Initialize results dictionary
    results = {}
    
    # Get unique methods
    methods = set(d['method'] for d in data)
    
    for method in methods:
        # Filter data based on the method
        method_data = [d for d in data if d['method'] == method]
        
        # Extract x, y positions and skill labels
        positions = np.array([(d['x'], d['y']) for d in method_data])
        labels = np.array([d['skill'] for d in method_data])
        
        # Ensure there are at least two unique labels (skills)
        if len(set(labels)) > 1:
            # Compute overall silhouette score using skill labels as clusters
            silhouette_values = silhouette_samples(positions, labels, metric='euclidean')
            average_score = silhouette_score(positions, labels, metric='euclidean')
            boxplot_stats = calculate_boxplot_stats(silhouette_values)
            
            # Initialize method results
            method_results = {
                'average_score': average_score,
                'boxplot': boxplot_stats,
                'skills': {}
            }
            
            # Calculate silhouette score for each skill (individual class) using DBSCAN
            skills = set(labels)
            for skill in skills:
                # Filter data for the specific skill
                skill_data = np.array([pos for pos, lbl in zip(positions, labels) if lbl == skill])
                
                # Apply DBSCAN clustering to get cluster labels for skill data
                if len(skill_data) > 1:
                    dbscan = DBSCAN(eps=0.5, min_samples=5).fit(skill_data)
                    skill_labels = dbscan.labels_
                    
                    # Check if DBSCAN found at least two clusters
                    unique_labels = set(skill_labels)
                    if len(unique_labels) > 1 and -1 in unique_labels:
                        unique_labels.remove(-1)  # Ignore noise label
                    
                    if len(unique_labels) > 1:  # Only proceed if there are at least two clusters
                        # Filter out noise points (label -1)
                        non_noise_indices = skill_labels != -1
                        filtered_skill_data = skill_data[non_noise_indices]
                        filtered_skill_labels = skill_labels[non_noise_indices]
                        
                        # Compute silhouette score for skill data without noise
                        skill_silhouette_values = silhouette_samples(filtered_skill_data, filtered_skill_labels, metric='euclidean')
                        skill_average_score = silhouette_score(filtered_skill_data, filtered_skill_labels, metric='euclidean')
                        skill_boxplot_stats = calculate_boxplot_stats(skill_silhouette_values)
                        
                        # Store skill results
                        method_results['skills'][skill] = {
                            'average_score': skill_average_score,
                            'boxplot': skill_boxplot_stats
                        }
            
            # Store method results in the main results dictionary
            results[method] = method_results
    
    # Save results to a JSON file
    silhouette_path = path_raw_data + f"/silhouette_scores_dbscan_{method_name}.json"
    with open(silhouette_path, 'w') as outfile:
        json.dump(results, outfile, indent=4)
    
    return results

def calculate_boxplot_stats(data):
    # Sort data
    sorted_data = np.sort(data)
    
    # Calculate quartiles
    q1 = np.percentile(sorted_data, 25)
    median = np.percentile(sorted_data, 50)
    q3 = np.percentile(sorted_data, 75)
    min_value = sorted_data[0]
    max_value = sorted_data[-1]
    
    return {
        'min': min_value,
        'q1': q1,
        'median': median,
        'q3': q3,
        'max': max_value
    }

def compute_for_all_projections(all_pca_data, all_tsne_data, all_umap_data):
    compute_silhouette_scores(all_pca_data, 'pca')
    compute_silhouette_scores(all_tsne_data, 'tsne')
    compute_silhouette_scores(all_umap_data, 'umap')

# Example of calling the function with data
compute_for_all_projections(all_pca_data, all_tsne_data, all_umap_data)


## Similarity measurements

#### Using projected space (ex. x and y positions based on tSNE)

In [59]:
import os
import pandas as pd

# Define the root directory containing the CSV folders
root_dir = "/Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/similarity/csv"
output_csv = root_dir + "/combined_jaccard_index.csv"
import os
import pandas as pd

# Define the root directory containing the CSV folders
root_dir = "/Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/similarity/csv"

# Define the output file paths for each of the CSVs
output_files = {
    "1s": root_dir + "/combined_jaccard_index_1s.csv",
    "2s": root_dir + "/combined_jaccard_index_2s.csv",
    "4s": root_dir + "/combined_jaccard_index_4s.csv"
}

# Iterate over each subfolder (1s, 2s, 4s)
for subfolder, output_csv in output_files.items():
    # Initialize an empty DataFrame to hold all the data for the current subfolder
    combined_df = pd.DataFrame()

    # Construct the full path to the subfolder
    subfolder_path = os.path.join(root_dir, subfolder)

    # Iterate over files in the subfolder
    for file in os.listdir(subfolder_path):
        if file.endswith("_jaccard_index.csv"):
            # Extract skill from the filename
            skill = file.split('_')[0]
            
            # Read the CSV file into a DataFrame
            file_path = os.path.join(subfolder_path, file)
            df = pd.read_csv(file_path)
            
            # Add the 'skill' column to the DataFrame
            df.insert(0, 'skill', skill)
            
            # Append the DataFrame to the combined DataFrame
            combined_df = pd.concat([combined_df, df], ignore_index=True)

    # Save the combined DataFrame to a new CSV file for the current subfolder
    combined_df.to_csv(output_csv, index=False)

    print(f"Combined CSV file for {subfolder} saved to {output_csv}")


Combined CSV file for 1s saved to /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/similarity/csv/combined_jaccard_index_1s.csv
Combined CSV file for 2s saved to /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/similarity/csv/combined_jaccard_index_2s.csv
Combined CSV file for 4s saved to /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/similarity/csv/combined_jaccard_index_4s.csv


#### Using original high dimensional space [USED on PAPER]

In [83]:
import os
import pandas as pd

# Define the root directory containing the CSV folders
root_dir = "/Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/highdimensional_space_similarity/csv"
output_csv = root_dir + "/combined_jaccard_index.csv"
import os
import pandas as pd

# Define the root directory containing the CSV folders
root_dir = "/Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/highdimensional_space_similarity/csv"

# Define the output file paths for each of the CSVs
output_files = {
    "1s": root_dir + "/combined_jaccard_original_space_index_1s.csv",
    "2s": root_dir + "/combined_jaccard_original_space_index_2s.csv",
    "4s": root_dir + "/combined_jaccard_original_space_index_4s.csv"
}

# Iterate over each subfolder (1s, 2s, 4s)
for subfolder, output_csv in output_files.items():
    # Initialize an empty DataFrame to hold all the data for the current subfolder
    combined_df = pd.DataFrame()

    # Construct the full path to the subfolder
    subfolder_path = os.path.join(root_dir, subfolder)

    # Iterate over files in the subfolder
    for file in os.listdir(subfolder_path):
        if file.endswith("jaccard_index_original_space.csv"):
            # Extract skill from the filename
            skill = file.split('_')[0]
            
            # Read the CSV file into a DataFrame
            file_path = os.path.join(subfolder_path, file)
            df = pd.read_csv(file_path)
            
            # Add the 'skill' column to the DataFrame
            df.insert(0, 'skill', skill)
            
            # Append the DataFrame to the combined DataFrame
            combined_df = pd.concat([combined_df, df], ignore_index=True)

    # Save the combined DataFrame to a new CSV file for the current subfolder
    combined_df.to_csv(output_csv, index=False)

    print(f"Combined CSV file for {subfolder} saved to {output_csv}")


Combined CSV file for 1s saved to /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/highdimensional_space_similarity/csv/combined_jaccard_original_space_index_1s.csv
Combined CSV file for 2s saved to /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/highdimensional_space_similarity/csv/combined_jaccard_original_space_index_2s.csv
Combined CSV file for 4s saved to /Users/soniacq/PTG/BBN_data/updated_models/different_windows_size/highdimensional_space_similarity/csv/combined_jaccard_original_space_index_4s.csv


In [66]:
# !pip install umap-learn


Collecting umap-learn
  Using cached umap_learn-0.5.6-py3-none-any.whl.metadata (21 kB)
Collecting numba>=0.51.2 (from umap-learn)
  Using cached numba-0.60.0-cp312-cp312-macosx_10_9_x86_64.whl.metadata (2.7 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Using cached pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting tqdm (from umap-learn)
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llvmlite<0.44,>=0.43.0dev0 (from numba>=0.51.2->umap-learn)
  Using cached llvmlite-0.43.0-cp312-cp312-macosx_10_9_x86_64.whl.metadata (4.8 kB)
Using cached umap_learn-0.5.6-py3-none-any.whl (85 kB)
Using cached numba-0.60.0-cp312-cp312-macosx_10_9_x86_64.whl (2.6 MB)
Using cached pynndescent-0.5.13-py3-none-any.whl (56 kB)
Downloading tqdm-4.66.5-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/