In [1]:
import pandas as pd

data = pd.read_csv('../db/dataset_tissue.txt', header=None, sep=' ')

labels = pd.read_csv('../db/clase.txt', header=None)

print(data)

                                                       0
0      ,"GSM11805.CEL.gz","GSM11814.CEL.gz","GSM11823...
1      1007_s_at,10.1912666822211,10.5091673351314,10...
2      1053_at,6.04046250272039,6.69607547976383,6.14...
3      117_at,7.44740927631719,7.77535403522073,7.696...
4      121_at,12.0250418580982,12.0078171043575,11.63...
...                                                  ...
22211  91703_at,6.43485083964546,7.6769887682576,6.42...
22212  91816_f_at,5.70044827295955,6.56647865392162,5...
22213  91826_at,9.21116303052509,9.41598030255139,8.1...
22214  91920_at,8.33913007018526,8.21442605889894,8.4...
22215  91952_at,7.36779705423511,7.9177537650076,7.59...

[22216 rows x 1 columns]


In [3]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler  # Agregamos la importación

skip_rows = 1  # omitir la primera fila
data = pd.read_csv('../db/dataset_tissue.txt', skiprows=skip_rows, header=None, sep=',')  # Cambiar el separador a ','

# Eliminar la primera columna que contiene identificadores de genes
data = data.iloc[:, 1:]


scaler = MinMaxScaler()

# Aplicar la normalización Min-Max a tus datos
normalized_data = scaler.fit_transform(data)

# Especificar el número de componentes principales a retener
n_components = 10

# Crear una instancia de PCA con el número de componentes
pca = PCA(n_components=n_components)

# Ajustar y transformar los datos con PCA
data_pca = pca.fit_transform(normalized_data)

In [15]:
import random
import numpy as np 
import math

class GMMComponent:

    def __init__(self, d):
        self.mean = [random.random() for _ in range(d)]
        self.cov = [[random.random() for _ in range(d)] for _ in range(d)]
        
    def score(self, point):
        mean = self.mean
        cov = self.cov 
        
        const = 1 / (np.sqrt((2*np.pi)**d * max(np.linalg.det(cov), 1e-6)))
        x_m = np.matrix(point - mean)
        exp = np.exp(-0.5 * (x_m * np.linalg.inv(cov) * x_m.T))
        return const * exp

    def update(self, resp, data, N, d):
        resp_k = sum(resp)
        
        mean_k = [r * data[i] for i,r in enumerate(resp)]
        self.mean = np.sum(mean_k, axis=0) / resp_k
        
        diff = [np.matrix(data[i] - self.mean) for i,r in enumerate(resp)]
        cov_k = [r * (d[i].T @ d[i]) for i, r in zip(diff, resp)]
        self.cov = np.sum(cov_k, axis=0) / resp_k

class GMM:  

    def __init__(self, k, d):
        self.k = k
        self.components = [GMMComponent(d) for _ in range(k)]
    
    def fit(self, data):
       N, d = data.shape
        
       for _ in range(10):
           resp = [self._e_step(point) for point in data]
           self._m_step(resp, data, N, d)
           
    def predict(self, point):
        resp = self._e_step(point)
        return max(range(self.k), key=lambda k: resp[k])

    def _e_step(self, data):
        scores = []
        for point in data:
            scores.append([c.score(point) for c in self.components])
        total_scores = np.sum(scores, axis=1)
        normalized_scores = [s / total for s, total in zip(scores, total_scores)]
        return normalized_scores


    def _m_step(self, resp, data, N, d):
        for k in range(self.k):
            resp_k = [r[k] for r in resp]
            self.components[k].update(resp_k, data, N, d)
       
# Uso:
        
d = 50 # dimensiones tras PCA
gmm = GMM(5, d) 
gmm.fit(data_pca)
clusters = [gmm.predict(point) for point in data_pca]

KeyboardInterrupt: 