In [20]:
import os
import pandas as pd
import numpy as np
import torch
from collections import defaultdict

In [21]:
# Root directory that is walked recursively for per-sample expression files.
# The path is relative, so it resolves against the kernel's working directory.
tcga_dir = "tcga_data"

In [22]:
# Accumulators filled while scanning the data directory.

# Sample-type codes encountered during the scan.
sample_type_labels = set()

# Gene order shared by all accepted files; stays None until one file parses.
all_genes = None

# patient id -> {sample-type code -> expression values}
sample_dict = defaultdict(dict)

In [23]:
# Walk the data directory and collect one expression vector per
# (patient, sample type) pair into sample_dict.
#
# File names are expected to be dash-separated barcodes such as
# TCGA-05-4244-01A-01R-1107-07: the first three fields form the patient id and
# the first two characters of the fourth field are the sample-type code.
# Each file must be a headerless two-column TSV of (gene_id, expression).
for root, dirs, files in os.walk(tcga_dir):
    # os.walk yields entries in filesystem-dependent order. Sorting (dirs in
    # place, so the recursion honours it) makes the traversal deterministic,
    # which fixes both the file that defines the reference gene order and the
    # winner when several files map to the same (patient, sample type).
    dirs.sort()
    for file in sorted(files):
        if file.startswith('.'):
            continue  # skip hidden/system files

        full_path = os.path.join(root, file).replace("\\", "/")
        parts = file.split("-")

        if len(parts) < 4:
            print(f"Skipping malformed filename: {file}")
            continue

        patient_id = "-".join(parts[:3])   # e.g., TCGA-05-4244
        sample_type = parts[3][:2]         # e.g., 01 = primary tumor, 11 = normal

        try:
            # Read as tab-delimited with no header
            df = pd.read_csv(full_path, sep="\t", header=None)

            if df.shape[1] != 2:
                print(f"Unexpected format in: {file}, skipping.")
                continue

            df.columns = ['gene_id', 'expression']
            df = df.set_index('gene_id')
            df.index = df.index.str.strip()

            # Validate the values before this file can become the gene-order
            # reference; non-numeric content raises here and the file is
            # skipped instead of failing later when the tensor is filled.
            expression = pd.to_numeric(df['expression'], errors='raise').to_numpy(dtype=float)

            # First successful file sets the gene order
            if all_genes is None:
                all_genes = df.index.tolist()
                print(f"Reference gene list initialized from: {file}")
            elif not df.index.equals(pd.Index(all_genes)):
                print(f"Gene list mismatch in {file}, skipping.")
                continue

        except Exception as e:
            # Best-effort scan: report the file and keep going.
            print(f"Failed to parse {file}: {e}")
            continue

        # Several files can share a (patient, sample type) pair, e.g. vials
        # 01A and 01B. The later file still wins, but no longer silently.
        if sample_type in sample_dict[patient_id]:
            print(f"Duplicate sample for {patient_id} type {sample_type}, overwriting with: {file}")

        # Register the sample type only once a file for it has been accepted,
        # so skipped files cannot create an all-NaN slice in the tensor.
        sample_type_labels.add(sample_type)
        sample_dict[patient_id][sample_type] = expression

Reference gene list initialized from: TCGA-05-4244-01A-01R-1107-07


In [24]:
# Without a reference gene list nothing was loaded, so stop here.
if all_genes is None:
    raise ValueError("No valid files found to build tensor. Check file format or path.")

# Fix the label order along each tensor axis.
genes = all_genes
patients = sorted(sample_dict)
sample_types = sorted(sample_type_labels)

# Label -> position lookups for the gene, patient and sample-type axes.
gene_idx = dict(zip(genes, range(len(genes))))
patient_idx = dict(zip(patients, range(len(patients))))
sample_type_idx = dict(zip(sample_types, range(len(sample_types))))

In [25]:
# Dense array indexed as (gene, patient, sample type). Every cell starts as
# NaN, so combinations for which no file was loaded remain NaN.
tensor_data = np.full(
    shape=(len(genes), len(patients), len(sample_types)),
    fill_value=np.nan,
)

for pid in patients:
    p_idx = patient_idx[pid]
    for stype, expr in sample_dict[pid].items():
        s_idx = sample_type_idx[stype]
        # Write the whole gene vector for this patient / sample-type slot.
        tensor_data[:, p_idx, s_idx] = expr


In [26]:
# Convert the NumPy array to a float32 torch tensor. copy=True guarantees the
# tensor owns its memory and never aliases tensor_data.
tensor = torch.from_numpy(tensor_data).to(dtype=torch.float32, copy=True)

In [27]:
# Report the tensor dimensions together with the size / labels of each axis.
print(
    f"Tensor shape: {tensor.shape}",
    f"Genes: {len(genes)}",
    f"Patients: {len(patients)}",
    f"Sample types: {sample_types}",
    sep="\n",
)

Tensor shape: torch.Size([60488, 515, 2])
Genes: 60488
Patients: 515
Sample types: ['01', '11']


In [None]:
# Persist the assembled tensor to disk (path is relative to the working directory).
torch.save(obj=tensor, f="tcga_tensor_by_sample_type.pt")