In [4]:
import pandas as pd

data = pd.read_csv('../db/dataset_tissue.txt', header=None, sep=' ')

labels = pd.read_csv('../db/clase.txt', header=None)

print(data)

                                                       0
0      ,"GSM11805.CEL.gz","GSM11814.CEL.gz","GSM11823...
1      1007_s_at,10.1912666822211,10.5091673351314,10...
2      1053_at,6.04046250272039,6.69607547976383,6.14...
3      117_at,7.44740927631719,7.77535403522073,7.696...
4      121_at,12.0250418580982,12.0078171043575,11.63...
...                                                  ...
22211  91703_at,6.43485083964546,7.6769887682576,6.42...
22212  91816_f_at,5.70044827295955,6.56647865392162,5...
22213  91826_at,9.21116303052509,9.41598030255139,8.1...
22214  91920_at,8.33913007018526,8.21442605889894,8.4...
22215  91952_at,7.36779705423511,7.9177537650076,7.59...

[22216 rows x 1 columns]


In [5]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

data_tissue = pd.read_csv('../db/dataset_tissue.txt', sep=',')
data_no_id = data_tissue.drop(columns=["Unnamed: 0"])
scaler = StandardScaler()
data_standardized = scaler.fit_transform(data_no_id)

# Realiza PCA
pca_original = PCA()
data_pca_original = pca_original.fit_transform(data_standardized)


In [6]:
explained_variance_ratio_cumsum_original = pca_original.explained_variance_ratio_.cumsum()

In [7]:
num_components_95 = (explained_variance_ratio_cumsum_original >= 0.95).argmax() + 1
print(f"Número de componentes para obtener la data al 95%: {num_components_95}")

Número de componentes para obtener la data al 95%: 6


In [17]:
import numpy as np
from scipy.stats import multivariate_normal

class GMMComponent:

    def __init__(self, d):
        self.mean = np.random.rand(d)
        self.cov = np.eye(d) 
        self.cov_inv = np.linalg.inv(self.cov)

    def score(self, data):
        return multivariate_normal.pdf(data, self.mean, self.cov)

    def update(self, resp, data):
        N = resp.shape[0]
        
        # actualizar mean
        resp_mean = np.sum(resp[:,np.newaxis] * data, axis=0) 
        new_mean = resp_mean / N
        
        # actualizar covarianza
        diff = data - new_mean
        new_cov = np.dot((resp * diff).T, diff) / N
        
        self.mean = new_mean
        self.cov = new_cov + np.eye(data.shape[1]) * 1e-6 
        self.cov_inv = np.linalg.inv(self.cov)

        
class GMM:

    def __init__(self, k, d):
        self.components = [GMMComponent(d) for _ in range(k)]
        
    def fit(self, data):
        N = data.shape[0]
        for _ in range(25):
            resp = self._e_step(data)
            self._m_step(resp, data)
            
    def _e_step(self, data):
        scores = np.array([c.score(data) for c in self.components]) 
        tot_scores = scores.sum(axis=1)
        return (scores.T / tot_scores).T 

    def _m_step(self, resp, data):
        for k in range(len(self.components)):
            resp_k = resp[:,k]
            self.components[k].update(resp_k, data)
            
    def predict(self, data):
        resp = self._e_step(data)
        return np.argmax(resp, axis=1)
        
# Usar el modelo 
d = num_components_95       
gmm = GMM(5, d)
gmm.fit(data)
clusters = gmm.predict(data)

ValueError: could not convert string to float: ',"GSM11805.CEL.gz","GSM11814.CEL.gz","GSM11823.CEL.gz","GSM11830.CEL.gz","GSM12067.CEL.gz","GSM12075.CEL.gz","GSM12079.CEL.gz","GSM12098.CEL.gz","GSM12105.CEL.gz","GSM12268.CEL.gz","GSM12270.CEL.gz","GSM12283.CEL.gz","GSM12298.CEL.gz","GSM12300.CEL.gz","GSM12399.CEL.gz","GSM12444.CEL.gz","GSM21203.cel.gz","GSM21204.cel.gz","GSM21205.cel.gz","GSM21206.cel.gz","GSM21207.cel.gz","GSM21208.cel.gz","GSM21209.cel.gz","GSM21210.cel.gz","GSM21212.cel.gz","GSM21213.cel.gz","GSM21214.cel.gz","GSM21215.cel.gz","GSM21216.cel.gz","GSM21217.cel.gz","GSM21218.cel.gz","GSM21219.cel.gz","GSM21220.cel.gz","GSM21221.cel.gz","GSM21222.cel.gz","GSM21223.cel.gz","GSM21224.cel.gz","GSM21225.cel.gz","GSM21226.cel.gz","GSM21227.cel.gz","GSM21228.cel.gz","GSM21229.cel.gz","GSM21230.cel.gz","GSM21231.cel.gz","GSM21232.cel.gz","GSM21233.cel.gz","GSM87058.cel.gz","GSM87064.cel.gz","GSM87065.cel.gz","GSM87066.cel.gz","GSM87071.cel.gz","GSM87072.cel.gz","GSM87073.cel.gz","GSM87074.cel.gz","GSM87075.cel.gz","GSM87085.cel.gz","GSM87086.cel.gz","GSM87087.cel.gz","GSM87088.cel.gz","GSM87089.cel.gz","GSM87090.CEL.gz","GSM87091.cel.gz","GSM87092.cel.gz","GSM87093.cel.gz","GSM87094.cel.gz","GSM87095.cel.gz","GSM87096.cel.gz","GSM87097.cel.gz","GSM87098.cel.gz","GSM87099.cel.gz","GSM87100.cel.gz","GSM87101.cel.gz","GSM87102.cel.gz","GSM146778.CEL.gz","GSM146779.CEL.gz","GSM146781.CEL.gz","GSM146782.CEL.gz","GSM146783.CEL.gz","GSM146785.CEL.gz","GSM146787.CEL.gz","GSM146788.CEL.gz","GSM146789.CEL.gz","GSM146791.CEL.gz","GSM146793.CEL.gz","GSM146795.CEL.gz","GSM146797.CEL.gz","GSM92240.CEL.gz","GSM92241.CEL.gz","GSM92242.CEL.gz","GSM92243.CEL.gz","GSM92244.CEL.gz","GSM92245.CEL.gz","GSM92247.CEL.gz","GSM92248.CEL.gz","GSM92249.CEL.gz","GSM92250.CEL.gz","GSM92253.CEL.gz","GSM92254.CEL.gz","GSM92255.CEL.gz","GSM92256.CEL.gz","GSM92257.CEL.gz","GSM92258.CEL.gz","GSM92259.CEL.gz","GSM92260.CEL.gz","GSM92261.CEL.gz","GSM92262.CEL.gz","GSM92263.CEL.gz","GSM92264.CEL.gz","GSM92265.CEL.gz","GSM92266.CEL.gz","GSM92267.CEL.gz","GSM92268.CEL.gz","GSM92269.CEL.gz","GSM92270.CEL.gz","GSM92271.CEL.gz","GSM92272.CEL.gz","GSM92273.CEL.gz","GSM92274.CEL.gz","GSM92275.CEL.gz","GSM92276.CEL.gz","GSM35979.cel.gz","GSM35980.cel.gz","GSM35981.cel.gz","GSM35982.cel.gz","GSM35983.cel.gz","GSM35984.cel.gz","GSM35991.cel.gz","GSM35992.cel.gz","GSM35993.cel.gz","GSM35994.cel.gz","GSM35995.cel.gz","GSM35996.cel.gz","GSM36003.cel.gz","GSM44675.CEL.gz","GSM44689.CEL.gz","GSM44697.CEL.gz","GSM44702.CEL.gz","GSM181429.CEL.gz","GSM181430.CEL.gz","GSM181431.CEL.gz","GSM181432.CEL.gz","GSM181433.CEL.gz","GSM18917.CEL.gz","GSM18918.CEL.gz","GSM18953.CEL.gz","GSM18954.CEL.gz","GSM18955.CEL.gz","GSM18956.CEL.gz","GSM296875.CEL.gz","GSM296876.CEL.gz","GSM296878.CEL.gz","GSM296879.CEL.gz","GSM296880.CEL.gz","GSM296881.CEL.gz","GSM296882.CEL.gz","GSM296883.CEL.gz","GSM296886.CEL.gz","GSM296887.CEL.gz","GSM296888.CEL.gz","GSM296889.CEL.gz","GSM296890.CEL.gz","GSM296891.CEL.gz","GSM296892.CEL.gz","GSM298747.CEL.gz","GSM298748.CEL.gz","GSM298749.CEL.gz","GSM298750.CEL.gz","GSM299110.CEL.gz","GSM299111.CEL.gz","GSM299112.CEL.gz","GSM299113.CEL.gz","GSM299244.CEL.gz","GSM299245.CEL.gz","GSM299246.CEL.gz","GSM299247.CEL.gz","GSM322969.CEL.gz","GSM323054.CEL.gz","GSM323523.CEL.gz","GSM323524.CEL.gz","GSM323527.CEL.gz","GSM323565.CEL.gz","GSM323566.CEL.gz","GSM323567.CEL.gz","GSM246492.CEL.gz","GSM246493.CEL.gz","GSM246494.CEL.gz","GSM307639.CEL.gz","GSM307640.CEL.gz","GSM307641.CEL.gz"'

In [None]:
n=0