In [7]:
# Import UMAP (the umap-learn package must already be installed in this environment)
import umap

In [8]:
#@title Import packages
import os # for handling files and directories
import librosa # for audio processing
import tensorflow as tf # for machine learning
import tensorflow_hub as hub # for machine learning
import numpy as np # for numerical processing
import pandas as pd # for handling dataframes
from tqdm import tqdm # for progress bar
import matplotlib.pyplot as plt # for plotting
from sklearn.cluster import KMeans # for clustering

# To get reproducible results, we set the seed
# NOTE(review): the UMAP grid-search cell later in this notebook assigns
# random_seed = 42, so the value seen by a cell depends on whether it runs
# before or after that one — confirm which seed is intended.
random_seed = 0

In [9]:
#@title Set all filepaths

# Directory holding every CSV this notebook reads or writes. Edit this single
# value to point the notebook at a different data location.
output_dir = '/home/os/aqoustics/Aqoustics-Surfperch/data/output_dir'

# Path where the csv of extracted features was saved
features_df_path = f'{output_dir}/surfperch_fish_feature_embeddings.csv'

# Path where we will save an updated version of features_df, with added metadata
features_metadata_path = f'{output_dir}/reduced_fish_feature_embeddings.csv'

# Path where we will save cluster results
cluster_results_path = f'{output_dir}/cluster_results.csv'

In [None]:
# Read the extracted-feature CSV (location given by features_df_path) into a dataframe
features_df = pd.read_csv(filepath_or_buffer=features_df_path)

# Leave the frame as the cell's last expression so the notebook renders it
features_df

In [None]:
def extract_metadata_from_filename(file):
    """Return the one-character class code embedded in a clip filename.

    The class code is the first character that follows the first
    occurrence of the 'clip_ind_' marker in the name.

    Args:
        file: Filename to parse. Non-string values (for example a NaN
            cell in a dataframe column) are tolerated.

    Returns:
        The single character right after 'clip_ind_', or None when the
        value is not a string, the marker is absent, or nothing follows
        the marker.
    """
    # A missing/non-text filename carries no class information
    if not isinstance(file, str):
        return None

    # partition() splits on the first occurrence only; `remainder` is ''
    # both when the marker is missing and when the name ends right after it
    _, _, remainder = file.partition('clip_ind_')

    # Slicing (instead of indexing) cannot raise IndexError on an empty
    # remainder; the empty string is then normalised to None
    return remainder[:1] or None


# Derive the class code for every filename. The helper returns a plain scalar,
# so it is applied directly; wrapping each result in a pd.Series would build one
# Series object per row just to fill a single column. Filenames without a class
# code end up as null entries in 'class_type'.
features_df['class_type'] = features_df['filename'].apply(extract_metadata_from_filename)

# Arrange columns in desired order: identifiers first, then every feature column
feature_columns = [col for col in features_df.columns if col.startswith('feature_')]
column_order = ['filename', 'class_type'] + feature_columns

# .copy() makes features_metadata_df an independent frame, so later edits to it
# neither touch features_df nor trigger SettingWithCopyWarning
features_metadata_df = features_df[column_order].copy()

# Save df
features_metadata_df.to_csv(features_metadata_path, index=False)

# Take a look
features_metadata_df

In [None]:
import itertools
import umap
import matplotlib.pyplot as plt

# Hyperparameter grid for the UMAP sweep (every combination is fitted and plotted)
n_neighbors = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
min_dist = [0.1, 0.2, 0.3, 0.4, 0.5]
n = [16, 32, 64]  # candidate n_components for the intermediate embedding
# NOTE(review): this rebinds the notebook-level random_seed (set to 0 in the
# import cell), so any cell run after this one sees 42 — confirm that is intended.
random_seed = 42  # Example random seed for reproducibility

# Class and color mappings
class_mapping = {'H': 'Healthy', 'D': 'Degraded', 'R': 'Restored', 'N': 'Newly-Restored'}
color_mapping = {'Healthy': 'green', 'Degraded': 'red', 'Restored': 'blue', 'Newly-Restored': 'yellow'}

# Loop-invariant inputs, computed once instead of on every grid point.
# Columns 0 and 1 are 'filename' and 'class_type'; everything after is features.
feature_matrix = features_metadata_df.iloc[:, 2:]
# One boolean row mask per class code; rows whose class_type is not a key of
# class_mapping match no mask and are therefore not plotted.
class_masks = {
    code: (features_metadata_df['class_type'] == code).to_numpy()
    for code in class_mapping
}

# Loop over all combinations of n_neighbors, min_dist, and n
for n_neighbor, min_d, n_components in itertools.product(n_neighbors, min_dist, n):

    # Step 1: reduce the raw features to n_components dimensions.
    # (The "_128" suffix is historical: the width is n_components, not 128.)
    umap_reducer_1 = umap.UMAP(n_components=n_components, random_state=random_seed,
                               n_neighbors=n_neighbor, min_dist=min_d)
    reduced_features_128 = umap_reducer_1.fit_transform(feature_matrix)

    # Step 2: reduce the intermediate embedding to 2 dimensions for plotting
    umap_reducer_3 = umap.UMAP(n_components=2, random_state=random_seed,
                               n_neighbors=n_neighbor, min_dist=min_d)
    umap_embeddings = umap_reducer_3.fit_transform(reduced_features_128)

    # Set up the plot on an explicit figure/axes pair so it can be closed below
    fig, ax = plt.subplots(figsize=(10, 10))

    # Plot each class with its own color and label using the mapping
    for class_type, label in class_mapping.items():
        indices = class_masks[class_type]
        ax.scatter(umap_embeddings[indices, 0], umap_embeddings[indices, 1], label=label,
                   color=color_mapping[label], alpha=0.5)

    # Set title and labels
    ax.set_title(f'UMAP Projection: n_neighbors={n_neighbor}, min_dist={min_d}, n_components={n_components}')
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.legend(title='Class Label')

    # To keep the plot on disk, uncomment the next line. It is placed before
    # plt.show() and uses the figure object so that the populated figure is
    # what gets written.
    # fig.savefig(f'umap_projection_n_neighbors_{n_neighbor}_min_dist_{min_d}_n_components_{n_components}.png')

    # Show the plot
    plt.show()

    # Release the figure explicitly: the sweep creates one figure per grid point
    # and open figures otherwise accumulate in memory.
    plt.close(fig)
