In [4]:
# preprocessing_minimal_tpm.py
# Minimal preprocessing for TPM RNA-seq data

import os

import numpy as np
import pandas as pd

INPUT_FILE = "/content/human_liver.tsv"   # your downloaded TSV
OUTPUT_DIR = "/content/preproc_outputs"
# Built once so the path written to and the path reported can never diverge.
OUTPUT_FILE = f"{OUTPUT_DIR}/data_preprocessed.tsv"

# Low-expression filter: a gene (row) is kept when its log2(TPM+1) value is
# strictly greater than MIN_LOG2_EXPR in at least MIN_SAMPLES samples (columns).
# Note that log2(TPM+1) > 1 is the same condition as TPM > 1.
MIN_LOG2_EXPR = 1
MIN_SAMPLES = 5


def log2_transform(tpm):
    """Return ``log2(tpm + 1)`` for every entry of an expression table.

    The +1 pseudocount maps a value of 0 to 0 instead of -inf.

    Args:
        tpm: DataFrame of expression values; every column must be numeric.

    Returns:
        DataFrame with the same index and columns holding ``log2(value + 1)``.

    Raises:
        ValueError: if any value is negative. ``np.log2`` would otherwise turn
            such entries into NaN/-inf (or negative numbers) without stopping,
            and they would end up in the saved file unnoticed.
    """
    if (tpm < 0).any().any():
        raise ValueError(
            "Input contains negative values; expected non-negative TPM "
            "(is the table already transformed?)"
        )
    return np.log2(tpm + 1)


def expressed_gene_mask(log2_expr, min_log2=MIN_LOG2_EXPR, min_samples=MIN_SAMPLES):
    """Flag the rows that pass the low-expression filter.

    Args:
        log2_expr: DataFrame of log2-transformed values, one row per gene and
            one column per sample.
        min_log2: a sample counts for a gene only when its value is strictly
            greater than this threshold.
        min_samples: minimum number of counting samples a gene needs.

    Returns:
        Boolean Series indexed like ``log2_expr``; True marks rows to keep.
        NaN entries never count, because ``NaN > min_log2`` is False.
    """
    samples_above = (log2_expr > min_log2).sum(axis=1)
    return samples_above >= min_samples


# `__name__` is "__main__" both when this file is run as a script and when the
# code is executed in a notebook cell, so the pipeline still runs there and
# df / df_log2 / mask / df_filtered remain available as globals afterwards.
if __name__ == "__main__":
    # Load: tab-separated table, first column used as the row index
    df = pd.read_csv(INPUT_FILE, sep="\t", index_col=0)

    # Log2 transform (stabilizes variance)
    df_log2 = log2_transform(df)

    # Filter low-expression genes
    mask = expressed_gene_mask(df_log2)
    df_filtered = df_log2.loc[mask]

    # Save
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df_filtered.to_csv(OUTPUT_FILE, sep="\t")

    print("Original shape:", df.shape)
    print("After filtering:", df_filtered.shape)
    print(f"Preprocessed data saved to {OUTPUT_FILE}")


Original shape: (35238, 903)
After filtering: (30612, 903)
Preprocessed data saved to /content/preproc_outputs/data_preprocessed.tsv
