In [1]:
import io, re
import numpy as np
import pandas as pd

# --- Input files (relative paths) ---
EXPR_CSV = "expression_matrix.csv"  # probe-level expression matrix (indexed by ID_REF)
GPL_FILE = "GPL570-55999.txt"       # platform annotation table (probe ID -> gene symbol)
META_CSV = "sample_metadata.csv"    # optional merge in cell 5

# --- Genes extracted into the small "five gene" matrix ---
TARGET_GENES = [
    "EPAS1",
    "ERC2",
    "PRC1",
    "CSGALNACT1",
    "CCND1",
]

In [2]:
# Expression matrix: rows = probes (ID_REF), cols = GSM...
# Everything is read as text first so ID_REF stays a string, then the sample
# columns are coerced to numbers; cells that are not numeric become NaN.
expr = pd.read_csv(EXPR_CSV, dtype=str).set_index("ID_REF")
expr = expr.apply(pd.to_numeric, errors="coerce")

# GPL annotation: keep only the first real table and normalize its headers.
# "#" lines are column descriptions; they are dropped up front.
with open(GPL_FILE, "rt", encoding="utf-8", errors="ignore") as f:
    lines = [ln for ln in f if not ln.startswith("#")]

# The header row is the first line that is neither blank nor a "!" metadata line.
start = next(
    (k for k, ln in enumerate(lines) if ln.strip() and not ln.startswith("!")),
    len(lines),
)
# The table ends at the next "!" line (e.g. a trailing "!platform_table_end");
# without this cut-off such a line would be parsed as a bogus probe row.
end = next(
    (k for k in range(start, len(lines)) if lines[k].startswith("!")),
    len(lines),
)
if start == end:
    raise pd.errors.EmptyDataError(f"No annotation table found in {GPL_FILE!r}")

gpl = pd.read_csv(io.StringIO("".join(lines[start:end])), sep="\t", dtype=str)
gpl.columns = [c.strip() for c in gpl.columns]

# Columns holding the probe ID and the gene symbol(s) in the annotation table.
probe_col, symbol_col = ("ID", "Gene Symbol")

# Fail here, with the available headers listed, rather than with a bare
# KeyError later when the mapping cell indexes these columns.
missing_cols = [c for c in (probe_col, symbol_col) if c not in gpl.columns]
if missing_cols:
    raise KeyError(
        f"Column(s) {missing_cols} not found in {GPL_FILE!r}; "
        f"available columns: {list(gpl.columns)}"
    )


In [3]:
# explode multi-symbol cells
MULTI_SEP = re.compile(r"\s*///\s*|\s*//\s*|\s*;\s*|\s*,\s*|\s*\|\s*")

def _norm_symbol(s: str) -> str:
    if not isinstance(s, str): return ""
    return s.strip().replace('"', "").replace("'", "").upper()

# Probe -> raw symbol table, restricted to probes that exist in the expression matrix.
tmp = gpl[[probe_col, symbol_col]].dropna(subset=[probe_col]).copy()
tmp[probe_col] = tmp[probe_col].astype(str)
tmp = tmp[tmp[probe_col].isin(expr.index)]
tmp = tmp.rename(columns={probe_col: "PROBE", symbol_col: "SYMBOL_RAW"})

tmp["SYMBOL_RAW"] = tmp["SYMBOL_RAW"].fillna("").astype(str)
# Remove parenthesised fragments before splitting.
tmp["SYMBOL_RAW"] = tmp["SYMBOL_RAW"].str.replace(r"\s*\(.*?\)\s*", "", regex=True)
# One list of normalized, non-empty symbols per probe ...
tmp["_SYMS"] = tmp["SYMBOL_RAW"].apply(lambda x: [z for z in (_norm_symbol(p) for p in MULTI_SEP.split(x)) if z])
# ... then one row per (probe, symbol) pair; probes with no symbol explode to NaN.
exploded = tmp.explode("_SYMS").rename(columns={"_SYMS": "SYMBOL"}).drop(columns=["SYMBOL_RAW"])

# drop empties and placeholders ("---" is the usual Affymetrix "no annotation" marker)
bad = {"", "NA", "N/A", "—", "-", "---", "NONE"}
exploded = exploded[exploded["SYMBOL"].notna()]
exploded = exploded.loc[~exploded["SYMBOL"].isin(bad)]
exploded = exploded.loc[:, ~exploded.columns.duplicated()]

# subset expression to mapped probes
mapped_expr = expr.loc[expr.index.intersection(exploded["PROBE"].unique())].copy()
if not mapped_expr.index.is_unique:
    # Duplicate probe IDs would otherwise surface further down as an opaque
    # "length mismatch" error when the gene symbols are assigned as the index.
    dup_ids = mapped_expr.index[mapped_expr.index.duplicated()].unique().tolist()
    raise ValueError(f"Duplicate probe IDs in expression matrix, e.g. {dup_ids[:5]}")

# Per-probe variability score: mean absolute deviation from the probe's mean.
# NaN-aware on purpose: `expr` was coerced with errors="coerce", and a plain
# mean would turn a single missing value into a NaN score for the whole probe.
# (A probe with no numeric value at all still scores NaN; NumPy emits a
# RuntimeWarning for it and it sorts last below.)
vals = mapped_expr.values.astype(float)
row_means = np.nanmean(vals, axis=1, keepdims=True)
mad_vals = np.nanmean(np.abs(vals - row_means), axis=1)
mad = pd.DataFrame({"MAD": mad_vals}, index=mapped_expr.index)

# max MAD per gene: after sorting, the first row of each symbol is its best probe.
# drop_duplicates keeps that row as a whole, whereas groupby().first() picks the
# first non-null value of each column independently.
df = exploded.merge(mad, left_on="PROBE", right_index=True, how="left")
df = df.sort_values(["SYMBOL", "MAD"], ascending=[True, False])
best = (
    df.drop_duplicates(subset="SYMBOL", keep="first")[["SYMBOL", "PROBE", "MAD"]]
    .reset_index(drop=True)
)

# ensure each probe used once: symbols that share a best probe necessarily have
# the same MAD, so break the tie by symbol name to make the kept symbol well defined.
# NOTE(review): a symbol that loses its best probe here is dropped entirely,
# even if it has other probes — confirm this is the intended policy.
best = best.sort_values(["MAD", "SYMBOL"], ascending=[False, True]).drop_duplicates(subset="PROBE", keep="first")

# Gene-level matrix: one row per kept symbol, ordered by decreasing MAD.
gene_expr = mapped_expr.loc[best["PROBE"].values]
gene_expr.index = best["SYMBOL"].values
gene_expr = gene_expr[~gene_expr.index.duplicated(keep="first")]

gene_expr.shape


(22855, 559)

In [4]:
# Output paths are defined once so the written files and the printed summary cannot drift apart.
gene_expr_path = "GSE24080_gene_expr.csv"
five_path = "five_gene_matrix.csv"

# Full gene-level matrix.
gene_expr.to_csv(gene_expr_path)

# Target-gene subset. reindex() keeps the TARGET_GENES order but silently creates
# all-NaN rows for symbols that are absent, so collect those and report them below.
missing_targets = [g for g in TARGET_GENES if g not in gene_expr.index]
five = gene_expr.reindex(TARGET_GENES)

five.to_csv(five_path)

print("Saved files:")
print(f" - {gene_expr_path}")
print(f" - {five_path}")
if missing_targets:
    print(
        f"WARNING: {len(missing_targets)} target gene(s) not found in gene_expr "
        f"(their rows are all-NaN): {missing_targets}"
    )
five


Saved files:
 - GSE24080_gene_expr.csv
 - five_gene_matrix.csv


Unnamed: 0,GSM592391,GSM592392,GSM592393,GSM592394,GSM592395,GSM592396,GSM592397,GSM592398,GSM592399,GSM592400,...,GSM592940,GSM592941,GSM592942,GSM592943,GSM592944,GSM592945,GSM592946,GSM592947,GSM592948,GSM592949
EPAS1,6.6409,10.3075,7.6385,8.0128,6.7282,9.3709,8.904,10.4568,8.9082,7.8805,...,7.1034,9.0538,10.867,7.5816,6.7567,7.7027,8.4951,10.1724,8.6349,9.9335
ERC2,6.8256,9.3064,6.6543,7.4137,11.6046,7.5457,5.6107,6.7566,7.7499,7.1326,...,9.4622,7.1353,10.6469,7.385,6.712,9.5875,6.234,8.6274,5.7461,7.4326
PRC1,9.7622,8.5312,9.0621,8.0636,8.5189,7.4958,11.0569,9.642,9.5727,8.6419,...,8.9067,7.8226,10.343,9.4054,7.1986,8.9304,8.0754,9.0066,10.1847,8.451
CSGALNACT1,12.1424,10.7866,13.2617,11.5271,10.9782,13.3013,10.7527,9.774,12.6431,12.2782,...,11.3841,12.8169,11.1146,10.1919,9.8668,11.0351,11.2255,11.0449,10.3652,11.8412
CCND1,13.1576,11.8152,12.7715,3.9054,6.1372,4.8734,10.5403,10.6242,10.3134,11.8749,...,8.4259,12.9395,11.6175,7.2103,5.5067,9.8428,7.2564,8.1704,4.4717,8.1009
