In [None]:
import torch
import pandas as pd
from dl.loaders import ultrasound_dataset
from torch.utils.data import DataLoader
import  numpy as np
import pickle
import os
import plotly.express as px

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import random
import SimpleITK as sitk

In [None]:
# Root directory of the analysis data and the run directory (relative to it)
# that contains the prediction outputs loaded below.
mount_point = '/mnt/raid/C1_ML_Analysis'

out_dir = "test_output/classification/extract_frames_Dataset_C_masked_resampled_256_spc075_wscores_meta_noflyto_100K_train/epoch=9-val_loss=0.27"

# Prediction table for this run.
df = pd.read_parquet(os.path.join(mount_point, out_dir, 'extract_frames_Dataset_C_masked_resampled_256_spc075_wscores_meta_noflyto_100K_train_prediction.parquet'))

# Features saved alongside the predictions. The context manager guarantees the
# file handle is closed once the pickle has been read.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open(os.path.join(mount_point, out_dir, 'extract_frames_Dataset_C_masked_resampled_256_spc075_wscores_meta_noflyto_100K_train_prediction.pickle'), 'rb') as feat_file:
    feat = pickle.load(feat_file)
# Remove singleton dimensions from the loaded features.
feat = feat.squeeze()

# The embeddings computed from `feat` are later written into `df` column-wise,
# so both must have the same number of rows; fail early if they do not.
assert len(feat) == len(df), f"feature rows ({len(feat)}) != prediction rows ({len(df)})"

In [None]:
# Display the column index of the prediction table.
df.columns

In [None]:

# Reduce the feature array to two dimensions with t-SNE (fixed seed for
# repeatable embeddings).
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` -
# confirm against the installed version.
tsne = TSNE(
    n_components=2,
    perplexity=500,
    n_iter=300,
    random_state=42,
)
tsne_results = tsne.fit_transform(feat)

In [None]:
def generate_random_color():
    """Return a random colour as a lowercase ``#rrggbb`` hex string.

    The value is drawn from the module-level ``random`` generator, so the
    result is reproducible only if ``random.seed`` was called beforehand.
    """
    rgb_value = random.randint(0, 0xFFFFFF)
    return f"#{rgb_value:06x}"

def generate_random_colors(num_colors):
    """Return a list of ``num_colors`` random ``#rrggbb`` colour strings.

    Each entry comes from a separate ``generate_random_color()`` call, so
    duplicates are possible.
    """
    palette = []
    for _ in range(num_colors):
        palette.append(generate_random_color())
    return palette


# Give every distinct predicted class its own randomly generated colour.
unique_categories = df['pred_class'].unique()
random_colors = generate_random_colors(len(unique_categories))
# Pair classes and colours positionally: class value -> hex colour string.
color_map = dict(zip(unique_categories, random_colors))


In [None]:
# Store the two t-SNE coordinates next to the predictions.
df['tsne_0'] = tsne_results[:, 0]
df['tsne_1'] = tsne_results[:, 1]

# Colour the points per predicted class through `color_discrete_map`.
# The previous `update_traces(marker=dict(color=[...]))` call pushed one
# full-length colour list (in `df` row order) onto every trace, which
# mis-colours points whenever plotly splits the data into per-class traces and
# leaves the legend / colour bar out of sync with the colours shown.
# Casting the class to `str` makes plotly treat it as a discrete category; the
# cast is applied to a plotting copy only, `df['pred_class']` is unchanged.
fig = px.scatter(
    df.assign(pred_class=df['pred_class'].astype(str)),
    x='tsne_0',
    y='tsne_1',
    color='pred_class',
    color_discrete_map={str(category): color for category, color in color_map.items()},
)
fig

In [None]:
# Project the features onto their first two principal components.
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(feat)

# Store the two PCA coordinates next to the predictions.
df['pca_0'] = reduced_features[:, 0]
df['pca_1'] = reduced_features[:, 1]

# Colour the points per predicted class through `color_discrete_map`.
# The previous `update_traces(marker=dict(color=[...]))` call pushed one
# full-length colour list (in `df` row order) onto every trace, which
# mis-colours points whenever plotly splits the data into per-class traces and
# leaves the legend / colour bar out of sync with the colours shown.
# Casting the class to `str` makes plotly treat it as a discrete category; the
# cast is applied to a plotting copy only, `df['pred_class']` is unchanged.
fig = px.scatter(
    df.assign(pred_class=df['pred_class'].astype(str)),
    x='pca_0',
    y='pca_1',
    color='pred_class',
    color_discrete_map={str(category): color for category, color in color_map.items()},
)
fig

In [None]:
# Point `out_dir` at a second run directory and load its prediction table.
# Note that this rebinds the `out_dir` name set by the first loading cell.
out_dir = "test_output/classification/extract_frames_Dataset_C_masked_resampled_256_spc075_wscores_meta_noflyto/epoch=9-val_loss=0.27"

df_noflyto = pd.read_parquet(
    os.path.join(
        mount_point,
        out_dir,
        'extract_frames_Dataset_C_masked_resampled_256_spc075_wscores_meta_noflyto_prediction.parquet',
    )
)


In [None]:
# Summary statistics of the describable columns, one row per predicted class.
desc = (
    df_noflyto
    .groupby('pred_class')
    .describe()
)
desc

In [None]:
# Number of non-null `score` values per predicted class, largest first.
(
    desc[('score', 'count')]
    .sort_values(ascending=False)
)

In [None]:
# Mean `score` per predicted class, highest first.
(
    desc[('score', 'mean')]
    .sort_values(ascending=False)
)


In [None]:
# Pick one random image path among the rows predicted as class 0 (unseeded, so
# a different image is shown on every run) ...
img_fn = df_noflyto.query('pred_class == 0')['img_path'].sample(1).values[0]
# ... and resolve it against the data root.
img_fn = os.path.join(mount_point, img_fn)
# Alternative kept for reference: sample from all classes instead.
# img_fn = os.path.join(mount_point, df_noflyto['img_path'].sample(1).values[0])

# Read the image with SimpleITK and convert it to a NumPy array.
img_np = sitk.GetArrayFromImage(sitk.ReadImage(img_fn))

# Drop singleton dimensions before handing the array to plotly.
px.imshow(img_np.squeeze())


In [None]:
n_samples_per_class = 1000
# Draw the same number of rows from every predicted class.
# The groups are sampled explicitly instead of via `groupby(...).apply(...)`:
# pandas >= 2.2 deprecates `apply` operating on the grouping column and later
# releases exclude it, which would drop `pred_class` from the result.
# Every class is still sampled with its own `random_state=42` and the pieces
# are concatenated in group order, so the selected rows are the same as before.
# `DataFrame.sample` raises ValueError if a class has fewer rows than requested.
sampled_df = pd.concat(
    [
        class_rows.sample(n_samples_per_class, random_state=42)
        for _, class_rows in df_noflyto.groupby('pred_class')
    ]
)

In [None]:
# Export of the sampled frame, kept disabled:
# sampled_df.to_parquet(os.path.join(mount_point, out_dir, 'extract_frames_Dataset_C_masked_resampled_256_spc075_wscores_meta_noflyto_sampledwsimnorth.parquet'), index=False)

# Load the Voluson blind-sweep frame table from the CSV_files directory.
df_voluson = pd.read_parquet(
    os.path.join(
        mount_point,
        'CSV_files/extract_frames_blind_sweeps_voluson_simnorth.parquet',
    )
)

In [None]:

# Summary statistics per predicted class for the Voluson table.
# This rebinds `desc`, replacing the summary computed for `df_noflyto`.
desc = (
    df_voluson
    .groupby('pred_class')
    .describe()
)
desc

In [None]:
# Number of non-null `ga_boe` values per predicted class, largest first.
(
    desc[('ga_boe', 'count')]
    .sort_values(ascending=False)
)

In [None]:
n_samples_per_class = 25000
# Draw the same number of rows from every predicted class of the Voluson table.
# The groups are sampled explicitly instead of via `groupby(...).apply(...)`:
# pandas >= 2.2 deprecates `apply` operating on the grouping column and later
# releases exclude it, which would drop `pred_class` from the result.
# Every class is still sampled with its own `random_state=42` and the pieces
# are concatenated in group order, so the selected rows are the same as before.
# `DataFrame.sample` raises ValueError if a class has fewer rows than requested.
sampled_df = pd.concat(
    [
        class_rows.sample(n_samples_per_class, random_state=42)
        for _, class_rows in df_voluson.groupby('pred_class')
    ]
)
# Export of the sampled frame, kept disabled:
# sampled_df.to_parquet(os.path.join(mount_point, 'CSV_files/extract_frames_blind_sweeps_voluson_sampledwsimnorth.parquet'), index=False)