## Code to perform cluster analysis for GENE data in DB 
March 17, 2025

In [35]:
import numpy as np
import matplotlib.pyplot as plt
import sys
import pandas as pd 
import itertools

In [44]:
from ipywidgets import *
%matplotlib widget

In [2]:
from umap import UMAP
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [3]:
## Dimensionality reduction
def f_dim_reduction(data, pdict, vis_type='UMAP'):
    '''
    Project sample arrays onto two components with UMAP, tSNE or PCA.

    Parameters
    ----------
    data : 2D array, or list of 2D arrays
        A single array, or a list of arrays that are reduced together.
    pdict : dict
        Hyper-parameters. Reads 'n_neighbors' and 'min_dist' for UMAP,
        'perplexity' and 'n_iter' for tSNE; nothing is read for PCA.
    vis_type : str
        One of 'UMAP', 'tSNE', 'PCA'.

    Returns
    -------
    list
        One 2-component embedding per input array (a one-element list
        when a single array is passed, an empty list for an empty list).

    Raises
    ------
    ValueError
        If vis_type is not one of the supported methods.

    Notes
    -----
    For a list input, UMAP and PCA are fitted on the first array only and
    every array (including the first) is then projected with that fitted
    model. tSNE is instead fitted separately on each array, so its
    embeddings are independent of one another.
    '''
    supported = ['UMAP', 'tSNE', 'PCA']
    # Explicit raise instead of assert: asserts vanish under `python -O`,
    # which would leave the reducer undefined further down.
    if vis_type not in supported:
        raise ValueError("vis_type must be one of %s" % (supported))

    is_list = isinstance(data, list)
    if is_list and not data:
        # Nothing to embed; avoids indexing data[0] below.
        return []

    verbose = 0
    if vis_type == 'UMAP':
        func_2d = UMAP(n_components=2,
                       n_neighbors=pdict['n_neighbors'],
                       min_dist=pdict['min_dist'],
                       init='random', random_state=0, verbose=verbose)

    elif vis_type == 'tSNE':
        # scikit-learn renamed TSNE's `n_iter` to `max_iter`; pick whichever
        # keyword the installed version accepts. The pdict key stays 'n_iter'.
        import inspect
        tsne_params = inspect.signature(TSNE).parameters
        iter_kwarg = 'max_iter' if 'max_iter' in tsne_params else 'n_iter'
        func_2d = TSNE(n_components=2, verbose=verbose,
                       perplexity=pdict['perplexity'],
                       **{iter_kwarg: pdict['n_iter']})

    else:  # 'PCA'
        func_2d = PCA(n_components=2)

    if not is_list:
        return [func_2d.fit_transform(data)]

    if vis_type == 'tSNE':
        # Each array gets its own independent fit.
        return [func_2d.fit_transform(d) for d in data]

    # Fit once on the first array, then project every array with that model.
    func_2d.fit(data[0])
    return [func_2d.transform(d) for d in data]

In [48]:
def f_plot_dim_red(arr_list, vis_type):
    '''
    Reduce the given array(s) to 2D and scatter-plot the embeddings.

    Parameters
    ----------
    arr_list : 2D array, or list of 2D arrays
        Data forwarded unchanged to f_dim_reduction.
    vis_type : str
        Reduction method passed to f_dim_reduction
        ('UMAP', 'tSNE' or 'PCA').

    Each embedding is drawn on the same figure with its own marker and is
    labelled by its position in the result list.
    '''
    # Only used to size the perplexity; arr_list itself is forwarded as-is so
    # f_dim_reduction keeps its own single-array vs. list handling.
    arrays = arr_list if isinstance(arr_list, list) else [arr_list]

    # tSNE is fitted on every array separately, so the perplexity has to stay
    # below the sample count of the smallest array, not just the first one.
    perplexity = min([30] + [arr.shape[0] - 1 for arr in arrays])

    pdict = {'n_neighbors': 50, 'min_dist': 0.5,
             'perplexity': perplexity, 'n_iter': 1000}

    results = f_dim_reduction(arr_list, pdict, vis_type)

    plt.figure()
    # Marker shapes repeat if there are more embeddings than markers.
    for count, (result, m) in enumerate(zip(results, itertools.cycle('sDhx*o'))):
        plt.scatter(result[:, 0], result[:, 1], marker=m, label=count)
    plt.title('test')
    plt.legend()
    plt.show()

In [53]:
def f_gen_mock_data(seed=42):
    '''
    Build a shuffled mock dataset: 200 points in 5 dimensions, drawn
    around 4 randomly placed cluster centres.

    Seeds NumPy's global random state with `seed` (a side effect on the
    global RNG), so the same seed always produces the same array.

    Returns
    -------
    numpy array of shape (200, 5)
    '''
    total_points, dims, groups = 200, 5, 4

    # Fix the global RNG so the output is reproducible for a given seed
    np.random.seed(seed)

    # One centre per cluster, uniform in [-10, 10) along every axis
    centres = np.random.uniform(-10, 10, (groups, dims))

    # Equal share per cluster; the last cluster absorbs any remainder
    share = total_points // groups
    sizes = [share] * (groups - 1) + [total_points - share * (groups - 1)]

    # Gaussian scatter (sigma = 1.5) around each centre, stacked in cluster order
    blobs = [
        centre + np.random.normal(0, 1.5, (size, dims))
        for centre, size in zip(centres, sizes)
    ]
    points = np.vstack(blobs)

    # Shuffle rows in place so clusters are not contiguous
    np.random.shuffle(points)

    return points

# Two mock datasets generated from different seeds
d1, d2 = (f_gen_mock_data(seed) for seed in (24, 44))

# Display both shapes as the cell output
d1.shape, d2.shape

((200, 5), (200, 5))

In [55]:
# f_plot_dim_red(d1,'tSNE')

In [54]:
# Interactive plot: the two datasets are held fixed, and the list passed as
# vis_type provides the selectable reduction methods.
interact(
    f_plot_dim_red,
    arr_list=fixed([d1, d2]),
    vis_type=['UMAP', 'tSNE', 'PCA'],
)

interactive(children=(Dropdown(description='vis_type', options=('UMAP', 'tSNE', 'PCA'), value='UMAP'), Output(…

<function __main__.f_plot_dim_red(arr_list, vis_type)>

(200, 5)