In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## Carregar os dados

In [3]:
# Read the cell dataset CSV (path relative to the notebook) into a DataFrame.
df_cells = pd.read_csv(
    filepath_or_buffer='./dataset/synthetic_cell_data.csv',
)

In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df_cells.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 51 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   gene_1     1200 non-null   float64
 1   gene_2     1200 non-null   float64
 2   gene_3     1200 non-null   float64
 3   gene_4     1200 non-null   float64
 4   gene_5     1200 non-null   float64
 5   gene_6     1200 non-null   float64
 6   gene_7     1200 non-null   float64
 7   gene_8     1200 non-null   float64
 8   gene_9     1200 non-null   float64
 9   gene_10    1200 non-null   float64
 10  gene_11    1200 non-null   float64
 11  gene_12    1200 non-null   float64
 12  gene_13    1200 non-null   float64
 13  gene_14    1200 non-null   float64
 14  gene_15    1200 non-null   float64
 15  gene_16    1200 non-null   float64
 16  gene_17    1200 non-null   float64
 17  gene_18    1200 non-null   float64
 18  gene_19    1200 non-null   float64
 19  gene_20    1200 non-null   float64
 20  gene_21 

## EDA

In [5]:
# Number of distinct values in `cell_type`; dropna=False keeps a missing value
# counted as its own category, exactly as len(Series.unique()) does.
df_cells['cell_type'].nunique(dropna=False)

5

In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
df_cells.describe()

Unnamed: 0,gene_1,gene_2,gene_3,gene_4,gene_5,gene_6,gene_7,gene_8,gene_9,gene_10,...,gene_41,gene_42,gene_43,gene_44,gene_45,gene_46,gene_47,gene_48,gene_49,gene_50
count,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,...,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0
mean,0.269324,2.026219,0.015786,-0.028116,-2.122801,-0.490857,1.284298,5.828234,0.362583,0.011576,...,0.365303,0.228197,-4.46738,0.869411,0.351452,0.024154,-0.021449,-0.29827,1.336862,0.386345
std,3.775422,3.047931,1.019325,1.010042,10.841627,3.609355,3.468614,10.981872,3.652475,0.973866,...,3.694792,3.585006,12.248655,9.529351,3.733268,1.019383,1.010276,3.681659,3.565798,3.550924
min,-10.141897,-9.22891,-3.482768,-3.242987,-41.44159,-10.793123,-8.99635,-27.318668,-11.678312,-3.003106,...,-11.9607,-10.366488,-38.545817,-36.738775,-12.783092,-3.246233,-2.999714,-13.125571,-10.672461,-10.133236
25%,-2.42113,-0.046719,-0.649693,-0.686873,-9.392613,-2.998869,-1.287978,-1.725341,-2.108999,-0.648617,...,-2.127106,-2.097078,-12.919963,-5.672383,-2.275796,-0.671541,-0.702535,-2.838792,-1.001999,-2.067543
50%,0.184105,2.071347,0.01529,-0.046073,-2.297948,-0.570461,1.404739,5.651884,0.191376,0.00706,...,0.385808,0.08386,-4.561076,0.866118,0.19884,0.01551,-0.022558,-0.379001,1.540704,0.241216
75%,2.889395,4.047296,0.744118,0.629383,4.491653,1.843312,3.737431,13.200223,2.845956,0.700536,...,2.876378,2.840972,3.336691,7.170686,3.000446,0.720197,0.688822,2.221177,3.875756,2.864945
max,12.232814,13.086474,3.475239,3.175204,41.637061,10.616334,11.972265,46.421128,11.404758,3.076787,...,12.015285,10.727785,34.017735,39.138959,11.778587,3.313657,3.374655,11.040306,11.612362,12.975577


In [7]:
# Share (in %) of rows belonging to each cell type. The Series is named
# explicitly so the plotted values are not labelled as raw counts.
percentual_cell_type = (
    df_cells['cell_type']
    .value_counts()
    .div(len(df_cells))
    .mul(100)
    .rename('percentual')
)

# Plot from an explicit long-form frame so the axes, legend and title are
# self-describing when the notebook is skimmed.
px.bar(
    percentual_cell_type.rename_axis('cell_type').reset_index(),
    x='cell_type',
    y='percentual',
    color='cell_type',
    title='Distribuição percentual por tipo de célula',
    labels={'cell_type': 'Tipo de célula', 'percentual': 'Percentual (%)'},
)

In [8]:
# Pairwise correlation between the numeric columns of the dataset.
matriz_correlacao_genes = df_cells.corr(numeric_only=True)

# Keep only the strict upper triangle (k=1): every pair appears once and the
# diagonal self-correlations are excluded. Masked cells become NaN.
upper_mask = np.triu(np.ones(matriz_correlacao_genes.shape, dtype=bool), k=1)
upper_tri = matriz_correlacao_genes.where(upper_mask)

# Flatten to one (gene_a, gene_b) -> correlation entry per pair, discard the
# masked cells, and rank by absolute strength while keeping the original sign.
top_corrs = (
    upper_tri.stack()
    .dropna()
    .sort_values(key=lambda corr: corr.abs(), ascending=False)
    .head(10)
)

print(top_corrs)

gene_38  gene_50   -0.499143
gene_5   gene_48   -0.492854
gene_20  gene_44   -0.482737
gene_8   gene_14    0.464962
gene_1   gene_43   -0.456296
gene_18  gene_45    0.453807
gene_17  gene_18   -0.453178
gene_9   gene_20    0.449876
gene_20  gene_35    0.444095
gene_18  gene_32   -0.436876
dtype: float64


In [9]:
# Histogram of the values in the gene_38 column.
px.histogram(
    df_cells['gene_38'],
    title='Distribuição da variável gene_38',
)

In [11]:
# gene_38 against gene_50, coloured by cell type.
px.scatter(
    df_cells,
    x='gene_38',
    y='gene_50',
    color='cell_type',
    hover_data=['cell_type', 'gene_38', 'gene_50'],
)

In [12]:
# gene_5 against gene_48, coloured by cell type.
px.scatter(
    df_cells,
    x='gene_5',
    y='gene_48',
    color='cell_type',
    hover_data=['cell_type', 'gene_5', 'gene_48'],
)

## Treinamento

In [13]:
# Feature matrix: every column except the `cell_type` label. DataFrame.drop
# returns a new frame, so df_cells is left untouched and the cell can be
# re-run safely without an intermediate copy or in-place mutation.
X = df_cells.drop(columns=['cell_type'])

In [14]:
# Columns whose name starts with 'gene_' are the ones to be standardized.
gene_mask = df_cells.columns.str.startswith('gene_')
numeric_features = df_cells.columns[gene_mask].tolist()

# Standardize the gene columns; any other column in X is passed through as is.
scaling_step = ('num', StandardScaler(), numeric_features)
preprocess = ColumnTransformer(
    transformers=[scaling_step],
    remainder='passthrough',
)

# Fit on X and transform it in one step; the result is displayed as cell output.
X_transformed = preprocess.fit_transform(X)
X_transformed

array([[-0.77243764, -0.30925347,  0.99911555, ...,  1.23138234,
         0.19406509,  0.25207215],
       [ 0.59158306, -1.52089217,  0.23387275, ...,  0.33949527,
         1.11666232, -0.91061889],
       [ 0.15862546,  1.31222629, -0.04039477, ...,  0.78976021,
         1.45998971, -1.58952741],
       ...,
       [-1.1511887 ,  0.29350813, -0.53687154, ..., -0.71829898,
         0.49844215,  0.31968906],
       [ 0.58742701, -0.31050335, -1.6682397 , ..., -0.47976208,
         0.4266841 , -0.01311075],
       [ 0.47010141, -1.13816167, -0.33354413, ...,  1.60336574,
         0.14627282, -0.31349598]], shape=(1200, 50))

In [17]:
def tsne_results_calc(number_components=2, perplexities=None):
    """Run t-SNE once per perplexity value and stack all embeddings.

    Parameters
    ----------
    number_components : int, default 2
        Dimension of the embedded space, forwarded to ``TSNE(n_components=...)``.
    perplexities : iterable of numbers, optional
        Perplexity values to evaluate. Defaults to ``range(2, 51)``.

    Returns
    -------
    pandas.DataFrame
        One row per sample and perplexity value, with the columns
        ``Componente 1`` .. ``Componente <number_components>``, ``Perplexity``
        and ``cell_type``. Each run keeps its own 0..n-1 row index, so index
        labels repeat across perplexity values. An empty frame is returned
        when ``perplexities`` is empty.

    Notes
    -----
    Reads the notebook-level ``X_transformed`` (features) and ``df_cells``
    (source of the ``cell_type`` labels).
    """
    if perplexities is None:
        perplexities = range(2, 51)

    # Loop-invariant pieces are computed once.
    component_columns = [f'Componente {i}' for i in range(1, number_components + 1)]
    cell_types = df_cells['cell_type'].values

    # Collect one frame per run and concatenate once at the end: growing a
    # DataFrame with pd.concat inside the loop re-copies all previous rows on
    # every iteration and starts from an empty frame.
    frames = []
    for perplexity in perplexities:
        tsne = TSNE(
            n_components=number_components,
            perplexity=perplexity,
            init="random",
            max_iter=250,
            random_state=51,
        )
        embedding = tsne.fit_transform(X_transformed)

        run_df = pd.DataFrame(embedding, columns=component_columns)
        run_df['Perplexity'] = perplexity
        run_df['cell_type'] = cell_types
        frames.append(run_df)

    if not frames:
        # pd.concat raises on an empty list; mirror the original empty result.
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

In [18]:
# 2-D t-SNE embeddings, stacked across the evaluated perplexity values.
df_results = tsne_results_calc(number_components=2)


In [19]:
# 3-D t-SNE embeddings, stacked across the evaluated perplexity values.
df_results_3d = tsne_results_calc(number_components=3)

In [20]:
# Preview the first rows of the stacked 2-D t-SNE results.
df_results.head()

Unnamed: 0,Componente 1,Componente 2,Perplexity,cell_type
0,0.967107,0.616711,2,type_4
1,0.105627,1.566604,2,type_0
2,-1.495191,-1.649051,2,type_1
3,0.905883,0.355864,2,type_4
4,0.03906,0.557629,2,type_4


In [21]:
# Preview the first rows of the stacked 3-D t-SNE results.
df_results_3d.head()

Unnamed: 0,Componente 1,Componente 2,Componente 3,Perplexity,cell_type
0,-0.269145,-0.327953,1.104423,2,type_4
1,0.116239,-1.235683,0.26999,2,type_0
2,0.629731,1.754703,0.467794,2,type_1
3,-0.08567,-0.097807,1.314393,2,type_4
4,0.305286,-0.333449,0.282959,2,type_4


In [22]:
# df_results holds one full embedding per value in the `Perplexity` column.
# Plotting it as a single cloud overlays unrelated embeddings on top of each
# other, so each perplexity is shown as its own animation frame instead.
px.scatter(
    df_results,
    x='Componente 1',
    y='Componente 2',
    color='cell_type',
    animation_frame='Perplexity',
    # Animated axes are not rescaled per frame; pin them to the global extent
    # so every frame stays fully visible.
    range_x=[df_results['Componente 1'].min(), df_results['Componente 1'].max()],
    range_y=[df_results['Componente 2'].min(), df_results['Componente 2'].max()],
    title='t-SNE (2D) por valor de perplexity',
)

In [23]:
# df_results_3d holds one full embedding per value in the `Perplexity` column.
# Plotting it as a single cloud overlays unrelated embeddings on top of each
# other, so each perplexity is shown as its own animation frame instead.
px.scatter_3d(
    df_results_3d,
    x='Componente 1',
    y='Componente 2',
    z='Componente 3',
    color='cell_type',
    animation_frame='Perplexity',
    # Animated axes are not rescaled per frame; pin them to the global extent
    # so every frame stays fully visible.
    range_x=[df_results_3d['Componente 1'].min(), df_results_3d['Componente 1'].max()],
    range_y=[df_results_3d['Componente 2'].min(), df_results_3d['Componente 2'].max()],
    range_z=[df_results_3d['Componente 3'].min(), df_results_3d['Componente 3'].max()],
    title='t-SNE (3D) por valor de perplexity',
)

## Análise

A aplicação do algoritmo t-SNE demonstrou que os dados de expressão gênica são altamente informativos para a classificação celular. Observou-se a formação de agrupamentos claros e distintos para a maioria dos tipos, com destaque para o type_2, que se posicionou em regiões mais isoladas do espaço latente. Embora os grupos type_0 e type_4 não apresentem um padrão muito claro no gráfico 2D, a visualização em 3D revelou uma separação latente que confirma a distinção entre todas as classes estudadas. Portanto, as 50 dimensões originais foram reduzidas com sucesso, preservando a estrutura local dos dados.