# Implementación de UMAP: Datos de E=0,14

In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import re
import umap.umap_ as umap
import umap.plot
from sklearn.preprocessing import StandardScaler

## Carga de datos

In [None]:
path = "/home/lcampos/zeus/proyecto_fondecyt/CODE/"


def _read_indexed(csv_name):
    """Read a CSV from ``csv_creados/`` and index it by the ('R', 'Time') pair.

    The second index column is stored in the file under the header
    'Unnamed: 1'; it is renamed to 'Time' after indexing.
    """
    frame = pd.read_csv(path + 'csv_creados/' + csv_name)
    frame = frame.set_index(['R', 'Unnamed: 1'])
    return frame.rename_axis(index={'Unnamed: 1': 'Time'})


df_pca = _read_indexed('df_pca_E014.csv')
df_energy = _read_indexed('df_energy_E014.csv')

In [None]:
# Combine the energy and PCA tables column-wise; DataFrame.join aligns the
# frames on their index (both were indexed by 'R' and 'Time' when loaded).
data = df_energy.join([df_pca])
# Bare expression: shown as the cell output in the notebook.
data

## Funciones para implementar UMAP

Se requiere que los datos estén normalizados; sin embargo, hay 2 posibles normalizaciones a considerar:

normalizador type 1: Los estandariza de manera que tengan media $0$ y std $1$.

normalizador type 2: Reescala cada columna al intervalo $[0,1]$.

In [None]:
def min_max_normalize(column):
    """Rescale ``column`` to (approximately) the interval [0, 1].

    The minimum maps to 0 and the maximum to just under 1: a small constant
    (1e-8) is added to the range so that constant columns (max == min) give
    zeros instead of a division by zero.
    """
    lowest = column.min()
    value_range = column.max() - lowest + 1e-8
    return (column - lowest) / value_range

def implementacion_umap(dataset, normalizador='type 1', min_dist=0.1, n_neighbors=15, n_components=2):
    """Normalize ``dataset`` and fit a UMAP embedding on it.

    Parameters
    ----------
    dataset
        DataFrame-like object; ``.values`` and ``.apply`` are used on it.
    normalizador : {'type 1', 'type 2'}
        'type 1' standardizes the columns with ``StandardScaler``;
        'type 2' rescales each column with ``min_max_normalize``.
    min_dist, n_neighbors, n_components
        Passed unchanged to ``umap.UMAP`` (the metric is fixed to
        'euclidean').

    Returns
    -------
    tuple
        ``(reducer, embedding)``: the fitted ``umap.UMAP`` object and the
        array returned by its ``fit_transform``.

    Raises
    ------
    ValueError
        If ``normalizador`` is not 'type 1' or 'type 2'.
    """
    # Validate first: an unknown name used to skip both branches and only
    # fail later with an unbound ``X_scaled``.
    if normalizador not in ('type 1', 'type 2'):
        raise ValueError(
            f"normalizador must be 'type 1' or 'type 2', got {normalizador!r}"
        )

    if normalizador == 'type 1':
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(dataset.values)
    else:
        X_scaled = dataset.apply(min_max_normalize).values

    reducer = umap.UMAP(n_neighbors=n_neighbors,
                        min_dist=min_dist,
                        n_components=n_components,
                        metric='euclidean')
    embedding = reducer.fit_transform(X_scaled)

    return reducer, embedding

def grafico_umap(embedding, color, title='Umap Projection', s=5):
    """Scatter-plot a 2- or 3-column embedding with matplotlib.

    ``embedding`` must expose a two-element ``shape``; its second entry
    selects a 2D or a 3D scatter. Any other number of columns draws
    nothing. ``color`` is passed as ``c`` and ``s`` as the marker size.
    """
    _, n_dims = embedding.shape

    if n_dims not in (2, 3):
        # Unsupported dimensionality: nothing is plotted.
        return

    if n_dims == 3:
        fig = plt.figure(figsize=(8, 6))
        ax = fig.add_subplot(111, projection='3d')
        xs, ys, zs = embedding[:, 0], embedding[:, 1], embedding[:, 2]
        ax.scatter(xs, ys, zs, c=color, cmap='viridis', s=s)
        ax.set_title(title)
        ax.set_xlabel('Dimension 1')
        ax.set_ylabel('Dimension 2')
        ax.set_zlabel('Dimension 3')
        plt.show()
        return

    # Two columns: plain pyplot scatter.
    plt.figure(figsize=(8, 6))
    xs, ys = embedding[:, 0], embedding[:, 1]
    plt.scatter(xs, ys, c=color, s=s)
    plt.title(title)
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.show()

## Umap a cada repetición
Considera n\_neighbors=100 y min\_dist=0.1

In [None]:
# 2D embedding, one UMAP fit per repetition
# Normalizer type 1: zero mean and unit standard deviation
rep_labels = data.index.get_level_values('R')
time_colors = np.linspace(0, 50000, 5001)  # assumes 5001 rows per repetition -- TODO confirm
for rep in range(1, 101):
    df_rep = data.loc[rep_labels == rep]
    reducer, embedding = implementacion_umap(
        df_rep, normalizador='type 1', min_dist=0.1, n_neighbors=100, n_components=2
    )
    grafico_umap(embedding, time_colors, title=f'Umap Projection- Repetition{rep}', s=5)

In [None]:
# 3D embedding, one UMAP fit per repetition
# Normalizer type 1: zero mean and unit standard deviation
rep_labels = data.index.get_level_values('R')
time_colors = np.linspace(0, 50000, 5001)  # assumes 5001 rows per repetition -- TODO confirm
for rep in range(1, 101):
    df_rep = data.loc[rep_labels == rep]
    reducer, embedding = implementacion_umap(
        df_rep, normalizador='type 1', min_dist=0.1, n_neighbors=100, n_components=3
    )
    grafico_umap(embedding, time_colors, title=f'Umap Projection- Repetition{rep}', s=5)

In [None]:
# 2D embedding, one UMAP fit per repetition
# Normalizer type 2: columns rescaled to [0, 1]
rep_labels = data.index.get_level_values('R')
time_colors = np.linspace(0, 50000, 5001)  # assumes 5001 rows per repetition -- TODO confirm
for rep in range(1, 101):
    df_rep = data.loc[rep_labels == rep]
    reducer, embedding = implementacion_umap(
        df_rep, normalizador='type 2', min_dist=0.1, n_neighbors=100, n_components=2
    )
    grafico_umap(embedding, time_colors, title=f'Umap Projection- Repetition{rep}', s=5)

In [None]:
# 3D embedding, one UMAP fit per repetition
# Normalizer type 2: columns rescaled to [0, 1]
rep_labels = data.index.get_level_values('R')
time_colors = np.linspace(0, 50000, 5001)  # assumes 5001 rows per repetition -- TODO confirm
for rep in range(1, 101):
    df_rep = data.loc[rep_labels == rep]
    reducer, embedding = implementacion_umap(
        df_rep, normalizador='type 2', min_dist=0.1, n_neighbors=100, n_components=3
    )
    grafico_umap(embedding, time_colors, title=f'Umap Projection- Repetition{rep}', s=5)

## Umap sobre (casi) todos los datos
Dada la cantidad de datos y lo lento que es UMAP, consideraremos un conjunto de datos reducido.

In [None]:
# Reduced data set: keep only the times 0, 10 and every multiple of 100
# from 100 up to 50000.
time_values = np.append(np.array([0, 10]), np.arange(100, 50010, 100))
df_ = data.loc[data.index.get_level_values('Time').isin(time_values)]
# Colour vector for plots over df_. It is read from the selected rows
# themselves (instead of tiling time_values a hard-coded 100 times) so it
# always has one entry per row of df_, in the same order.
tiempo = df_.index.get_level_values('Time').to_numpy()
# Bare expression: shown as the cell output in the notebook.
df_

In [None]:
# 2D embedding of the reduced data set
# Normalizer type 1, n_neighbors=15
reducer, embedding = implementacion_umap(
    df_,
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    normalizador='type 1',
)
grafico_umap(embedding, color=tiempo, title='Umap Projection 2D all data', s=1)

In [None]:
# 2D embedding of the reduced data set
# Normalizer type 1, n_neighbors=100
reducer, embedding = implementacion_umap(
    df_,
    n_components=2,
    n_neighbors=100,
    min_dist=0.1,
    normalizador='type 1',
)
grafico_umap(embedding, color=tiempo, title='Umap Projection 2D all data', s=1)

In [None]:
# One 2D scatter per repetition, sliced out of the current `embedding`.
# The slicing relies on the rows being ordered by repetition, with one row
# per entry of time_values in each repetition.
rows_per_rep = len(time_values)  # replaces the hard-coded 502
for rep in range(100):
    start = rep * rows_per_rep
    subset_embedding = embedding[start:start + rows_per_rep]
    plt.figure(figsize=(8, 6))
    plt.scatter(subset_embedding[:, 0], subset_embedding[:, 1], c=time_values, s=1)
    plt.title(f'UMAP Projection 2D of all repetition, repetition {rep + 1}')
    plt.xlabel('UMAP Dimension 1')
    plt.ylabel('UMAP Dimension 2')
    plt.show()

In [None]:
# 3D embedding of the reduced data set
# Normalizer type 1, n_neighbors=15
reducer, embedding = implementacion_umap(df_, normalizador='type 1', min_dist=0.1, n_neighbors=15, n_components=3)
# Title fixed: this is a 3-component embedding, it was labelled "2D".
grafico_umap(embedding, tiempo, title='Umap Projection 3D all data', s=1)

In [None]:
# 3D embedding of the reduced data set
# Normalizer type 1, n_neighbors=100
reducer, embedding = implementacion_umap(
    df_,
    n_components=3,
    n_neighbors=100,
    min_dist=0.1,
    normalizador='type 1',
)
grafico_umap(embedding, color=tiempo, title='Umap Projection 3D all data', s=1)

In [None]:
# One 3D scatter per repetition, sliced out of the current `embedding`.
# The slicing relies on the rows being ordered by repetition, with one row
# per entry of time_values in each repetition.
rows_per_rep = len(time_values)  # replaces the hard-coded 502
for rep in range(100):
    start = rep * rows_per_rep
    subset_embedding = embedding[start:start + rows_per_rep]
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(subset_embedding[:, 0], subset_embedding[:, 1], subset_embedding[:, 2],
               c=time_values, cmap='viridis', s=1)
    ax.set_title(f'UMAP projection 3D of all repetition, repetition {rep + 1}')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_zlabel('Dimension 3')
    plt.show()

In [None]:
# 2D embedding of the reduced data set
# Normalizer type 2, n_neighbors=15
reducer, embedding = implementacion_umap(
    df_,
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    normalizador='type 2',
)
grafico_umap(embedding, color=tiempo, title='Umap Projection 2D all data', s=1)

In [None]:
# 2D embedding of the reduced data set
# Normalizer type 2, n_neighbors=100
reducer, embedding = implementacion_umap(df_, normalizador='type 2', min_dist=0.1, n_neighbors=100, n_components=2)
# Title fixed: this is a 2-component embedding, it was labelled "3D".
grafico_umap(embedding, tiempo, title='Umap Projection 2D all data', s=1)

In [None]:
# One 2D scatter per repetition of the current `embedding`; axis limits
# are left to matplotlib's defaults.
# The slicing relies on the rows being ordered by repetition, with one row
# per entry of time_values in each repetition.
rows_per_rep = len(time_values)  # replaces the hard-coded 502
for rep in range(100):
    start = rep * rows_per_rep
    subset_embedding = embedding[start:start + rows_per_rep]
    plt.figure(figsize=(8, 6))
    plt.scatter(subset_embedding[:, 0], subset_embedding[:, 1], c=time_values, s=1)
    plt.title(f'UMAP Projection 2D of all repetition, repetition {rep + 1}')
    plt.xlabel('UMAP Dimension 1')
    plt.ylabel('UMAP Dimension 2')
    plt.show()

In [None]:
# 3D embedding of the reduced data set
# Normalizer type 2, n_neighbors=15
# Fixed: the call passed 'type 1' although this cell is the type-2 run
# (per its own header comment), and the title said "2D" for a 3D plot.
reducer, embedding = implementacion_umap(df_, normalizador='type 2', min_dist=0.1, n_neighbors=15, n_components=3)
grafico_umap(embedding, tiempo, title='Umap Projection 3D all data', s=1)

In [None]:
# 3D embedding of the reduced data set
# Normalizer type 2, n_neighbors=100
reducer, embedding = implementacion_umap(
    df_,
    n_components=3,
    n_neighbors=100,
    min_dist=0.1,
    normalizador='type 2',
)
grafico_umap(embedding, color=tiempo, title='Umap Projection 3D all data', s=1)

In [None]:
# One 3D scatter per repetition, sliced out of the current `embedding`.
# The slicing relies on the rows being ordered by repetition, with one row
# per entry of time_values in each repetition.
rows_per_rep = len(time_values)  # replaces the hard-coded 502
for rep in range(100):
    start = rep * rows_per_rep
    subset_embedding = embedding[start:start + rows_per_rep]
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(subset_embedding[:, 0], subset_embedding[:, 1], subset_embedding[:, 2],
               c=time_values, cmap='viridis', s=1)
    ax.set_title(f'UMAP projection 3D of all repetition, repetition {rep + 1}')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_zlabel('Dimension 3')
    plt.show()

## Variar parámetros

In [None]:
# 2D embedding of the reduced data set
# Normalizer type 1, n_neighbors=500, min_dist=0.1
reducer, embedding = implementacion_umap(
    df_,
    n_components=2,
    n_neighbors=500,
    min_dist=0.1,
    normalizador='type 1',
)
grafico_umap(embedding, color=tiempo, title='Umap Projection 2D all data', s=1)

In [None]:
# 2D embedding of the reduced data set
# Normalizer type 1, n_neighbors=250, min_dist=0.5
reducer, embedding = implementacion_umap(
    df_,
    n_components=2,
    n_neighbors=250,
    min_dist=0.5,
    normalizador='type 1',
)
grafico_umap(embedding, color=tiempo, title='Umap Projection 2D all data', s=1)

In [None]:
# 2D embedding of the reduced data set
# Normalizer type 1, n_neighbors=500, min_dist=0.5
reducer, embedding = implementacion_umap(
    df_,
    n_components=2,
    n_neighbors=500,
    min_dist=0.5,
    normalizador='type 1',
)
grafico_umap(embedding, color=tiempo, title='Umap Projection 2D all data', s=1)

In [None]:
# 2D embedding of the reduced data set
# Normalizer type 2, n_neighbors=500, min_dist=0.1
reducer, embedding = implementacion_umap(
    df_,
    n_components=2,
    n_neighbors=500,
    min_dist=0.1,
    normalizador='type 2',
)
grafico_umap(embedding, color=tiempo, title='Umap Projection 2D all data', s=1)

In [None]:
# 2D embedding of the reduced data set
# Normalizer type 2, n_neighbors=250, min_dist=0.5
reducer, embedding = implementacion_umap(
    df_,
    n_components=2,
    n_neighbors=250,
    min_dist=0.5,
    normalizador='type 2',
)
grafico_umap(embedding, color=tiempo, title='Umap Projection 2D all data', s=1)

In [None]:
# 2D embedding of the reduced data set
# Normalizer type 2, n_neighbors=500, min_dist=0.5
reducer, embedding = implementacion_umap(
    df_,
    n_components=2,
    n_neighbors=500,
    min_dist=0.5,
    normalizador='type 2',
)
grafico_umap(embedding, color=tiempo, title='Umap Projection 2D all data', s=1)

In [None]:
# One 2D scatter per repetition of the current `embedding`; axis limits
# are left to matplotlib's defaults.
# The slicing relies on the rows being ordered by repetition, with one row
# per entry of time_values in each repetition.
rows_per_rep = len(time_values)  # replaces the hard-coded 502
for rep in range(100):
    start = rep * rows_per_rep
    subset_embedding = embedding[start:start + rows_per_rep]
    plt.figure(figsize=(8, 6))
    plt.scatter(subset_embedding[:, 0], subset_embedding[:, 1], c=time_values, s=1)
    plt.title(f'UMAP Projection 2D of all repetition, repetition {rep + 1}')
    plt.xlabel('UMAP Dimension 1')
    plt.ylabel('UMAP Dimension 2')
    plt.show()

In [None]:
# Single 2D scatter of the whole current embedding, coloured by `tiempo`.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(embedding[:, 0], embedding[:, 1], c=tiempo, s=1)
ax.set_title('UMAP Projection 2D of all repetition')
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
plt.show()

In [None]:
# Overlay a hand-picked set of repetitions (1-based numbers) in one 2D plot.
# The list is written once and each scatter carries its own label, so the
# legend cannot drift out of step with what is drawn.
selected_reps = (10, 21, 29, 32, 35, 40, 43, 51)
rows_per_rep = len(time_values)  # replaces the hard-coded 502

plt.figure(figsize=(8, 6))
for rep in selected_reps:
    subset_embedding = embedding[rows_per_rep * (rep - 1):rows_per_rep * rep]
    plt.scatter(subset_embedding[:, 0], subset_embedding[:, 1], s=5, label=str(rep))

plt.legend()
plt.title('UMAP projection 2D')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.show()

In [None]:
# Overlay a hand-picked set of repetitions (1-based numbers) in one 2D plot.
# The list is written once and each scatter carries its own label, so the
# legend cannot drift out of step with what is drawn.
selected_reps = (54, 73, 76, 82, 88, 91)
rows_per_rep = len(time_values)  # replaces the hard-coded 502

plt.figure(figsize=(8, 6))
for rep in selected_reps:
    subset_embedding = embedding[rows_per_rep * (rep - 1):rows_per_rep * rep]
    plt.scatter(subset_embedding[:, 0], subset_embedding[:, 1], s=5, label=str(rep))

plt.legend()
plt.title('UMAP projection 2D')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.show()