<a href="https://colab.research.google.com/github/Vjfrib/vue-sdheal/blob/main/GSE234729_e_GSE7501.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Export

In [1]:
# Instalar dependências
!pip install GEOparse pandas

Collecting GEOparse
  Downloading GEOparse-2.0.4-py3-none-any.whl.metadata (6.5 kB)
Downloading GEOparse-2.0.4-py3-none-any.whl (29 kB)
Installing collected packages: GEOparse
Successfully installed GEOparse-2.0.4


In [2]:
import GEOparse
import pandas as pd
import matplotlib.pyplot as plt


In [6]:
# cel 1 -
def load_gse(gse_id, dest_dir="."):
    """Baixa e carrega um dataset GEO."""
    print(f"🔽 Baixando {gse_id}...")
    gse = GEOparse.get_GEO(geo=gse_id, destdir=dest_dir, silent=True)
    print(f"✅ {gse_id} carregado com sucesso.")
    return gse

def extract_clinical_data(gse):
    """Extrai dados clínicos do GSE para DataFrame."""
    data = []
    for gsm_id, gsm in gse.gsms.items():
        row = {"sample_id": gsm_id}
        for k, v in gsm.metadata.items():
            row[k] = "; ".join(v)
        data.append(row)
    return pd.DataFrame(data)

def extract_expression_data(gse):
    """Extrai matriz de expressão (genes x amostras) de forma segura."""
    # Escolhe a primeira plataforma
    platform_id = list(gse.gpls.keys())[0]
    platform_table = gse.gpls[platform_id].table

    # Verifica qual coluna usar como identificador
    if 'ID_REF' in platform_table.columns:
        id_col = 'ID_REF'
    elif 'gene_id' in platform_table.columns:
        id_col = 'gene_id'
    elif 'Gene Symbol' in platform_table.columns:
        id_col = 'Gene Symbol'
    else:
        id_col = platform_table.columns[0]  # pega a primeira coluna disponível

    print(f"Usando '{id_col}' como identificador de genes para pivotar.")

    expr = gse.pivot_samples('VALUE')  # pivot normal
    expr.index = gse.gpls[platform_id].table.set_index(id_col).loc[expr.index][id_col]
    return expr

In [11]:
# cel 2 -
gse_ids = ["GSE75010", "GSE234729"]
gse_objects = {}
for gse_id in gse_ids:
    gse_objects[gse_id] = load_gse(gse_id)

🔽 Baixando GSE75010...
✅ GSE75010 carregado com sucesso.
🔽 Baixando GSE234729...
✅ GSE234729 carregado com sucesso.


In [12]:
# cel 3 -
clinical_data = {}
for gse_id, gse in gse_objects.items():
    clinical_data[gse_id] = extract_clinical_data(gse)

In [14]:
# cel 4 -
for gse_id in gse_ids:
    print(f"\n📋 Dados clínicos ({gse_id}) - primeiras 5 linhas:")
    print(clinical_data[gse_id].head())

    print(f"\n🧬 Matriz de expressão ({gse_id}) - dimensões:")
    print(expression_data[gse_id].shape)


📋 Dados clínicos (GSE75010) - primeiras 5 linhas:
    sample_id               title geo_accession                 status  \
0  GSM1940492   PE-Term-SGA, rep1    GSM1940492  Public on May 10 2016   
1  GSM1940493  Cont-Term-CH, rep1    GSM1940493  Public on May 10 2016   
2  GSM1940494  Cont-Term-CH, rep2    GSM1940494  Public on May 10 2016   
3  GSM1940495   PE-Term-AGA, rep1    GSM1940495  Public on May 10 2016   
4  GSM1940496    PE-PreT-CH, rep1    GSM1940496  Public on May 10 2016   

  submission_date last_update_date type channel_count  \
0     Nov 13 2015      Oct 02 2018  RNA             1   
1     Nov 13 2015      Oct 02 2018  RNA             1   
2     Nov 13 2015      May 11 2016  RNA             1   
3     Nov 13 2015      Apr 26 2017  RNA             1   
4     Nov 13 2015      Oct 02 2018  RNA             1   

                              source_name_ch1  organism_ch1  ...  \
0   Human placenta from PE-Term-SGA pregnancy  Homo sapiens  ...   
1  Human placenta from Co

NameError: name 'expression_data' is not defined

In [15]:
# cel 5 -
expr = expression_data["GSE75010"]
expr.iloc[:, 0].head(50).plot(title="Expressão de genes - 1ª amostra GSE75010")
plt.xlabel("Genes")
plt.ylabel("Expressão")
plt.show()

NameError: name 'expression_data' is not defined

In [16]:
# cel 6 -
# Exporta dados clínicos e expressão de ambos os datasets para Excel
for gse_id in gse_ids:
    # Dados clínicos
    clinical_file = f"{gse_id}_clinical.xlsx"
    clinical_data[gse_id].to_excel(clinical_file, index=False)
    print(f"📋 Dados clínicos ({gse_id}) salvos em {clinical_file}")

    # Matriz de expressão
    expression_file = f"{gse_id}_expression.xlsx"
    expression_data[gse_id].to_excel(expression_file)
    print(f"🧬 Matriz de expressão ({gse_id}) salva em {expression_file}")


📋 Dados clínicos (GSE75010) salvos em GSE75010_clinical.xlsx


NameError: name 'expression_data' is not defined