In [19]:
# Basic imports
import os
import pandas as pd
import numpy as np

# Display up to 100 columns when a wide DataFrame is rendered
pd.options.display.max_columns = 100

In [20]:
# Create the raw-data folder one level above the notebook; a no-op if it already exists
os.makedirs(name="../data", exist_ok=True)

In [21]:
!pip install GEOparse



In [22]:
import GEOparse

# Fetch the GSE16561 series into ../data and parse it;
# annotate_gpl=True attaches the platform annotation if available.
gse = GEOparse.get_GEO(geo="GSE16561", destdir="../data", annotate_gpl=True)

02-Feb-2026 19:49:25 DEBUG utils - Directory ../data already exists. Skipping.
02-Feb-2026 19:49:25 INFO GEOparse - Downloading ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE16nnn/GSE16561/soft/GSE16561_family.soft.gz to ../data\GSE16561_family.soft.gz
100%|████████████████████████████████████████████████████████████████████████████████| 17.0M/17.0M [00:01<00:00, 16.5MB/s]
02-Feb-2026 19:49:26 DEBUG downloader - Size validation passed
02-Feb-2026 19:49:26 DEBUG downloader - Moving C:\Users\suzie\AppData\Local\Temp\tmp8dcl1q2c to C:\Users\suzie\Documents\AI_stroke\stroke_predictions\data\GSE16561_family.soft.gz
02-Feb-2026 19:49:26 DEBUG downloader - Successfully downloaded ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE16nnn/GSE16561/soft/GSE16561_family.soft.gz
02-Feb-2026 19:49:26 INFO GEOparse - Parsing ../data\GSE16561_family.soft.gz: 
02-Feb-2026 19:49:26 DEBUG GEOparse - DATABASE: GeoMiame
02-Feb-2026 19:49:26 DEBUG GEOparse - SERIES: GSE16561
02-Feb-2026 19:49:26 DEBUG GEOparse - PLATFORM:

In [23]:
# How many samples were parsed from the series
print(f"Number of samples: {len(gse.gsms)}")

# First five sample IDs. Only the last expression of a cell is rendered
# automatically, so this has to be printed to show up in the output.
print(f"First sample IDs: {list(gse.gsms.keys())[:5]}")

# Take the first sample without materialising the full list of samples;
# `sample` is reused by the following cell.
sample = next(iter(gse.gsms.values()))
sample.table.head()  # first few rows of that sample's data table

Number of samples: 63


Unnamed: 0,ID_REF,VALUE
0,ILMN_1809034,0.022013
1,ILMN_1660305,-0.30821
2,ILMN_1762337,0.231243
3,ILMN_2055271,0.056959
4,ILMN_1814316,0.092737


In [24]:
# Show which columns the first sample's data table carries
first_table = sample.table
print(first_table.columns)

Index(['ID_REF', 'VALUE'], dtype='str')


In [25]:
# Collect one two-column frame (probe ID, expression value) per sample.
expression_data = []

for gsm_name, gsm in gse.gsms.items():
    # The table is only read here, so no defensive copy of the whole table is needed.
    table = gsm.table

    # Gene/probe column = first column
    gene_col = table.columns[0]

    # Expression value column = last numeric column
    value_col = table.select_dtypes(include="number").columns[-1]

    # Copy only the two columns we keep, then label the value column with the sample ID
    data = table[[gene_col, value_col]].copy()
    data.columns = ["gene", gsm_name]

    expression_data.append(data)

# Fail with a clear message instead of an opaque IndexError on an empty series.
if not expression_data:
    raise ValueError("No samples found in gse.gsms; cannot build an expression matrix")

# Merge all samples into one matrix (rows = probes, columns = samples).
# Inner join: only probes present in every sample are kept.
# validate="one_to_one" raises if a probe ID is duplicated within a sample; without it
# duplicated keys would silently multiply rows at every successive merge.
expression_df = expression_data[0]
for df in expression_data[1:]:
    expression_df = expression_df.merge(df, on="gene", how="inner", validate="one_to_one")

expression_df.head()

Unnamed: 0,gene,GSM416528,GSM416529,GSM416530,GSM416531,GSM416532,GSM416533,GSM416534,GSM416535,GSM416536,GSM416537,GSM416538,GSM416539,GSM416540,GSM416541,GSM416542,GSM416543,GSM416544,GSM416545,GSM416546,GSM416547,GSM416548,GSM416549,GSM416550,GSM416551,GSM416552,GSM416553,GSM416554,GSM416555,GSM416556,GSM416557,GSM416558,GSM416559,GSM416560,GSM416561,GSM416562,GSM416563,GSM416564,GSM416565,GSM416566,GSM416567,GSM416568,GSM416569,GSM416570,GSM416571,GSM416572,GSM416573,GSM416574,GSM416575,GSM416576,GSM416577,GSM416578,GSM416579,GSM416580,GSM416581,GSM416582,GSM416583,GSM416584,GSM416585,GSM416586,GSM416587,GSM416588,GSM416589,GSM416590
0,ILMN_1809034,0.022013,0.190456,0.469427,0.116737,-0.699963,-0.478397,-0.318018,-0.208519,-0.345568,-0.056777,0.09107,-0.302657,-0.381425,-0.504762,-0.386471,0.094351,-0.758722,0.112586,0.031651,-0.234292,0.03256,0.050117,0.17248,-0.088443,0.053586,0.074093,-0.413808,0.152572,0.084034,0.194967,,0.205493,-0.119312,-0.113844,-0.1153,0.109764,-0.159538,-0.132803,-0.223391,0.295331,0.298213,-0.091371,0.210176,-0.371085,-0.186481,-0.458075,0.03613,-0.650574,-0.000719,-0.010536,0.140763,-0.046631,0.029425,0.109767,-0.108704,-0.236372,0.161365,0.242682,0.203559,0.084222,0.040328,0.089137,-0.113315
1,ILMN_1660305,-0.30821,-0.092378,-0.238029,-0.200707,-0.0051,0.364626,0.35442,0.182956,0.477714,0.243682,-0.299673,0.008643,-0.114489,0.268062,0.531219,0.361876,0.44317,0.450423,0.037638,-0.177883,0.121291,0.072276,-0.091946,-0.320289,-0.021301,-0.229272,-0.261179,0.162331,0.244913,0.445852,-0.33105,0.118499,0.130139,0.0231,0.202082,0.386924,0.078592,-0.047485,0.157869,0.079512,-0.416909,-0.314314,-0.295032,0.053877,0.101785,-0.056473,-0.136997,0.159331,-0.047834,-0.263496,-0.341471,-0.419814,-0.299314,-0.19741,,0.017656,-0.450677,-0.213411,0.002232,0.067206,-0.184929,-0.092011,-0.369319
2,ILMN_1762337,0.231243,0.716422,0.584303,0.348969,-0.232339,-0.4331,-0.246338,-0.003613,-0.09691,-0.139331,0.295425,-0.182422,-0.261745,-0.309865,-0.52447,0.15669,-0.721849,0.080895,0.172382,0.004978,0.255229,-0.819237,0.170418,0.053386,-0.639751,0.053905,-0.123723,0.464999,0.182916,0.068471,0.100921,-0.313919,0.212468,0.228988,-0.411081,0.35903,0.530479,-0.208286,-0.136183,0.701905,0.137778,-0.076157,0.61051,-0.746741,-0.174491,-0.640257,-0.100314,-0.628088,-0.084126,-0.297192,0.274459,0.250102,0.011623,0.328239,-0.129587,-0.191624,0.216006,0.669996,-0.591509,-0.352594,,0.161088,-0.158862
3,ILMN_2055271,0.056959,0.623819,0.475457,0.578134,-0.161864,-0.350654,-0.246458,0.027901,-0.155984,-0.089207,0.26293,-0.053236,-0.142034,-0.248516,-0.15836,0.04708,-0.782099,0.25595,,0.077271,0.310247,-0.693969,0.559497,0.047076,-0.771517,0.044528,0.31669,0.381425,0.242918,0.00518,0.435189,-0.437686,0.372948,0.153007,-0.240293,0.348096,0.438978,-0.201178,-0.014614,0.745462,0.056809,-0.182072,0.630791,-0.364724,-0.246197,-0.372049,-0.001289,-0.833504,-0.064888,-0.011854,0.228154,0.285425,-0.195305,0.36079,0.083792,-0.173882,-0.055565,0.510503,-0.553546,-0.232203,-0.008569,0.18786,-0.15681
4,ILMN_1814316,0.092737,0.636337,0.602461,0.253304,-0.393164,-0.556802,-0.228912,-0.185812,-0.020556,0.091969,0.238231,-0.200913,-0.274535,-0.31727,-0.554717,0.245111,-0.790458,0.205919,0.012997,0.000408,0.223716,-0.977337,0.149116,0.176498,-0.861912,0.095062,0.002292,0.514992,0.421149,-0.093424,0.171887,-0.430117,0.043027,0.263642,-0.509589,0.185727,0.478854,-0.288208,-0.174017,0.527976,0.154593,-0.266207,0.547911,-0.844136,-0.291175,-0.743741,-0.110044,-0.739939,0.036358,-0.336099,0.186498,0.327218,-0.141585,0.176324,-0.226657,-0.152971,,0.386905,-0.658145,-0.147197,-0.154492,0.046649,-0.17479


In [26]:
# Ensure the folder for processed outputs exists (no error if it is already there)
os.makedirs(name="../data/processed", exist_ok=True)

# Write the merged expression matrix to CSV, leaving out the positional row index
expression_df.to_csv(
    path_or_buf="../data/processed/GSE16561_expression_matrix.csv",
    index=False,
)

In [28]:
# Only the last expression of a cell is rendered automatically, so the summary
# numbers are printed explicitly; otherwise they are computed and thrown away.

# Check shape (rows, columns)
print(f"Shape: {expression_df.shape}")

# Check for missing values (total count over all cells)
print(f"Missing values: {expression_df.isna().sum().sum()}")

# Check number of samples (columns excluding 'gene')
print(f"Number of sample columns: {expression_df.shape[1] - 1}")

# Quick peek at top rows
expression_df.head()


Unnamed: 0,gene,GSM416528,GSM416529,GSM416530,GSM416531,GSM416532,GSM416533,GSM416534,GSM416535,GSM416536,GSM416537,GSM416538,GSM416539,GSM416540,GSM416541,GSM416542,GSM416543,GSM416544,GSM416545,GSM416546,GSM416547,GSM416548,GSM416549,GSM416550,GSM416551,GSM416552,GSM416553,GSM416554,GSM416555,GSM416556,GSM416557,GSM416558,GSM416559,GSM416560,GSM416561,GSM416562,GSM416563,GSM416564,GSM416565,GSM416566,GSM416567,GSM416568,GSM416569,GSM416570,GSM416571,GSM416572,GSM416573,GSM416574,GSM416575,GSM416576,GSM416577,GSM416578,GSM416579,GSM416580,GSM416581,GSM416582,GSM416583,GSM416584,GSM416585,GSM416586,GSM416587,GSM416588,GSM416589,GSM416590
0,ILMN_1809034,0.022013,0.190456,0.469427,0.116737,-0.699963,-0.478397,-0.318018,-0.208519,-0.345568,-0.056777,0.09107,-0.302657,-0.381425,-0.504762,-0.386471,0.094351,-0.758722,0.112586,0.031651,-0.234292,0.03256,0.050117,0.17248,-0.088443,0.053586,0.074093,-0.413808,0.152572,0.084034,0.194967,,0.205493,-0.119312,-0.113844,-0.1153,0.109764,-0.159538,-0.132803,-0.223391,0.295331,0.298213,-0.091371,0.210176,-0.371085,-0.186481,-0.458075,0.03613,-0.650574,-0.000719,-0.010536,0.140763,-0.046631,0.029425,0.109767,-0.108704,-0.236372,0.161365,0.242682,0.203559,0.084222,0.040328,0.089137,-0.113315
1,ILMN_1660305,-0.30821,-0.092378,-0.238029,-0.200707,-0.0051,0.364626,0.35442,0.182956,0.477714,0.243682,-0.299673,0.008643,-0.114489,0.268062,0.531219,0.361876,0.44317,0.450423,0.037638,-0.177883,0.121291,0.072276,-0.091946,-0.320289,-0.021301,-0.229272,-0.261179,0.162331,0.244913,0.445852,-0.33105,0.118499,0.130139,0.0231,0.202082,0.386924,0.078592,-0.047485,0.157869,0.079512,-0.416909,-0.314314,-0.295032,0.053877,0.101785,-0.056473,-0.136997,0.159331,-0.047834,-0.263496,-0.341471,-0.419814,-0.299314,-0.19741,,0.017656,-0.450677,-0.213411,0.002232,0.067206,-0.184929,-0.092011,-0.369319
2,ILMN_1762337,0.231243,0.716422,0.584303,0.348969,-0.232339,-0.4331,-0.246338,-0.003613,-0.09691,-0.139331,0.295425,-0.182422,-0.261745,-0.309865,-0.52447,0.15669,-0.721849,0.080895,0.172382,0.004978,0.255229,-0.819237,0.170418,0.053386,-0.639751,0.053905,-0.123723,0.464999,0.182916,0.068471,0.100921,-0.313919,0.212468,0.228988,-0.411081,0.35903,0.530479,-0.208286,-0.136183,0.701905,0.137778,-0.076157,0.61051,-0.746741,-0.174491,-0.640257,-0.100314,-0.628088,-0.084126,-0.297192,0.274459,0.250102,0.011623,0.328239,-0.129587,-0.191624,0.216006,0.669996,-0.591509,-0.352594,,0.161088,-0.158862
3,ILMN_2055271,0.056959,0.623819,0.475457,0.578134,-0.161864,-0.350654,-0.246458,0.027901,-0.155984,-0.089207,0.26293,-0.053236,-0.142034,-0.248516,-0.15836,0.04708,-0.782099,0.25595,,0.077271,0.310247,-0.693969,0.559497,0.047076,-0.771517,0.044528,0.31669,0.381425,0.242918,0.00518,0.435189,-0.437686,0.372948,0.153007,-0.240293,0.348096,0.438978,-0.201178,-0.014614,0.745462,0.056809,-0.182072,0.630791,-0.364724,-0.246197,-0.372049,-0.001289,-0.833504,-0.064888,-0.011854,0.228154,0.285425,-0.195305,0.36079,0.083792,-0.173882,-0.055565,0.510503,-0.553546,-0.232203,-0.008569,0.18786,-0.15681
4,ILMN_1814316,0.092737,0.636337,0.602461,0.253304,-0.393164,-0.556802,-0.228912,-0.185812,-0.020556,0.091969,0.238231,-0.200913,-0.274535,-0.31727,-0.554717,0.245111,-0.790458,0.205919,0.012997,0.000408,0.223716,-0.977337,0.149116,0.176498,-0.861912,0.095062,0.002292,0.514992,0.421149,-0.093424,0.171887,-0.430117,0.043027,0.263642,-0.509589,0.185727,0.478854,-0.288208,-0.174017,0.527976,0.154593,-0.266207,0.547911,-0.844136,-0.291175,-0.743741,-0.110044,-0.739939,0.036358,-0.336099,0.186498,0.327218,-0.141585,0.176324,-0.226657,-0.152971,,0.386905,-0.658145,-0.147197,-0.154492,0.046649,-0.17479
