In [1]:
import pandas as pd

# Load the tab-separated table. The path is hard-coded to a Colab-style
# /content directory, so adjust it when running elsewhere.
df = pd.read_csv("/content/human_liver.tsv", sep="\t")

# Quick inspection: dimensions, first rows, then column dtypes / memory usage.
print(df.shape)
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()


(11840, 904)
   genes  GSM742944  GSM2326089  GSM1807974  GSM1807990  GSM2055788  \
0   A1BG          0          92       11663        2089       18808   
1   A1CF          0          17        5738         490       14035   
2    A2M       6362         304       39269        8299       20260   
3  A2ML1          0          22          11           2          39   
4  A2MP1          0           3         167           7          44   

   GSM2142335  GSM1807979  GSM1695909  GSM1554468  ...  GSM2653842  \
0         183        5123       79640        1595  ...        30.0   
1           5        7127       10168         241  ...         9.0   
2          97       27210       63297        5817  ...         8.0   
3           8           4           9           5  ...         4.0   
4          15          16          61           2  ...         0.0   

   GSM2653843  GSM2653844  GSM2653845  GSM2653846  GSM2653847  GSM2653849  \
0        22.0        17.0        49.0       394.0        12.0 

In [2]:
import numpy as np

# Restrict the transform to numeric columns; non-numeric columns are left out
# of the normalized frame.
numeric_cols = df.select_dtypes(include=np.number).columns
# Element-wise log2(x + 1) over the numeric part of the table.
df_normalized = np.log2(df[numeric_cols] + 1)

In [4]:
# eda_analysis.py
# PCA + clustering + heatmap for preprocessed RNA-seq (TSV)
# Usage: save your preprocessed TSV at the path stored in FILENAME below
# (default "/content/data_preprocessed.tsv"), or change FILENAME to point at your file.

import os
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
import seaborn as sns

# ---------- SETTINGS ----------
# Input table (tab-separated); change if needed.
FILENAME = "/content/data_preprocessed.tsv"
# Position of the column holding the row names (genes).
INDEX_COL = 0
# Number of principal components to request.
N_PCS = 10
# Number of highest-variance genes shown in the heatmap.
N_TOP_VAR_GENES = 500
# Initial number of KMeans clusters to try.
K_CLUSTERS = 4
# Seed passed to the stochastic steps.
RANDOM_STATE = 42
# Directory (relative to the working directory) that collects all outputs;
# created up front so every later save can assume it exists.
OUTPUT_DIR = "eda_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------- LOAD ----------
df = pd.read_csv(FILENAME, sep="\t", index_col=INDEX_COL)

# Orientation heuristic: a table with more rows than columns is taken to be
# genes x samples and is flipped so that rows = samples and columns = genes.
# NOTE: a table that is already samples x genes but has more samples than
# genes would be flipped too, so verify the shape printed below.
n_rows, n_cols = df.shape
if n_rows > n_cols:
    df = df.transpose()

# From here on df is treated as samples x genes.
print("Data shape (samples x genes):", df.shape)

# ---------- OPTIONAL: ensure numeric ----------
# Entries that cannot be parsed as numbers become NaN; every sample (row)
# that contains at least one NaN is then removed.
df = df.apply(pd.to_numeric, errors="coerce").dropna(axis=0, how="any")
# Keep only genes (columns) whose variance across the remaining samples is positive.
has_variance = df.var(axis=0) > 0
df = df.loc[:, has_variance]

# ---------- SCALE ----------
# Standardize every gene (column): centering and unit-variance scaling are
# both switched on explicitly.
scaler = StandardScaler(with_mean=True, with_std=True)
# Result is a plain array of shape (n_samples, n_genes).
X = scaler.fit_transform(df.to_numpy())

# ---------- PCA ----------
# PCA cannot return more components than min(n_samples, n_features). Cap the
# request by BOTH dimensions of X so that a cohort with fewer samples than
# N_PCS does not make PCA raise a ValueError.
n_components = min(N_PCS, *X.shape)
pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_pca = pca.fit_transform(X)          # sample coordinates in PC space
explained = pca.explained_variance_ratio_

# Save the per-component explained variance ratio as a two-column table.
pd.DataFrame({
    "PC": [f"PC{i+1}" for i in range(len(explained))],
    "ExplainedVariance": explained
}).to_csv(os.path.join(OUTPUT_DIR, "pca_explained_variance.csv"), index=False)

# PCA scatter (PC1 vs PC2); every point is annotated with its sample name.
fig, ax = plt.subplots(figsize=(7,6))
ax.scatter(X_pca[:,0], X_pca[:,1], s=40)
for (pc1, pc2), sample in zip(X_pca[:, :2], df.index):
    ax.text(pc1, pc2, sample, fontsize=6, alpha=0.75)
# Axis labels carry the share of variance explained by each component.
ax.set_xlabel(f"PC1 ({explained[0]*100:.1f}%)")
ax.set_ylabel(f"PC2 ({explained[1]*100:.1f}%)")
ax.set_title("PCA: PC1 vs PC2")
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, "pca_pc1_pc2.png"), dpi=150)
# Written to disk only; close the figure so it is not kept open.
plt.close(fig)

# Cumulative explained variance against the number of components kept.
fig, ax = plt.subplots(figsize=(6,4))
n_pcs_axis = np.arange(1, len(explained)+1)
ax.plot(n_pcs_axis, np.cumsum(explained), marker='o')
ax.set_xlabel("Number of PCs")
ax.set_ylabel("Cumulative explained variance")
ax.set_title("PCA: Cumulative explained variance")
ax.grid(True, ls='--', alpha=0.4)
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, "pca_cumulative_variance.png"), dpi=150)
plt.close(fig)

# ---------- KMeans clustering (on PCA-reduced data) ----------
k = K_CLUSTERS
# Cluster on the leading N_PCS components, or on all of them if fewer were computed.
pc_for_clustering = min(N_PCS, X_pca.shape[1])
X_cluster = X_pca[:, :pc_for_clustering]

kmeans = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=50)
labels = kmeans.fit_predict(X_cluster)

# Silhouette score is computed on the same reduced matrix the clustering used.
sil = silhouette_score(X_cluster, labels)
print(f"KMeans (k={k}) silhouette score: {sil:.4f}")

# One row per sample with its assigned cluster label.
kmeans_table = pd.DataFrame({"sample": df.index, "kmeans_label": labels})
kmeans_table.to_csv(os.path.join(OUTPUT_DIR, "kmeans_labels.csv"), index=False)

# PC1 vs PC2 again, this time with points colored by their KMeans label.
fig, ax = plt.subplots(figsize=(8,6))
palette = sns.color_palette("tab10", n_colors=k)   # one color per cluster
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=labels, palette=palette,
                s=50, legend='full', ax=ax)
ax.set_xlabel(f"PC1 ({explained[0]*100:.1f}%)")
ax.set_ylabel(f"PC2 ({explained[1]*100:.1f}%)")
ax.set_title(f"PCA (PC1 vs PC2) colored by KMeans (k={k})")
# Legend is anchored outside the axes, to the upper right of the plot area.
ax.legend(title="cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, f"pca_kmeans_k{k}.png"), dpi=150)
plt.close(fig)

# ---------- Hierarchical clustering of samples (dendrogram) ----------
# Ward linkage on the full scaled matrix X (not on the PCA-reduced data).
linked = linkage(X, method='ward')
fig, ax = plt.subplots(figsize=(10, 5))
# Leaves are labelled with the sample names taken from df.index.
dendrogram(linked, labels=df.index, leaf_rotation=90, leaf_font_size=6,
           color_threshold=None, ax=ax)
ax.set_title("Hierarchical clustering dendrogram (samples)")
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, "dendrogram_samples.png"), dpi=150)
plt.close(fig)

# ---------- Heatmap of top variable genes ----------
# Rank genes (columns) by their variance across samples, largest first,
# and keep at most N_TOP_VAR_GENES of them.
gene_variances = df.var(axis=0).sort_values(ascending=False)
top_genes = gene_variances.head(N_TOP_VAR_GENES).index
df_top = df.loc[:, top_genes]


def _zscore(gene_values):
    """Center one gene's values and scale by its population std (ddof=0)."""
    return (gene_values - gene_values.mean()) / gene_values.std(ddof=0)


# Z-score each gene so the heatmap shows expression relative to that gene's mean.
df_top_z = df_top.apply(_zscore, axis=0)

# Create clustermap (rows = samples, columns = genes).
# seaborn's clustermap clusters both rows and columns by default; the input is
# already z-scored per gene, so no additional standard_scale is applied.
cg = sns.clustermap(df_top_z,
                    method='ward',
                    metric='euclidean',
                    figsize=(12, 10),
                    yticklabels=True,
                    xticklabels=False,
                    cmap="vlag",
                    standard_scale=None)

# clustermap builds a multi-axes figure, so plt.title() would attach the title
# to whichever axes happens to be current (typically not the heatmap) and the
# text ends up misplaced. Work on the grid's own figure instead: set a
# figure-level title, then save and close that exact figure.
clustermap_fig = cg.ax_heatmap.figure
clustermap_fig.suptitle("Clustermap (top variable genes)", y=1.02)
# bbox_inches="tight" keeps the title, which sits just above the canvas, in the image.
clustermap_fig.savefig(os.path.join(OUTPUT_DIR, "clustermap_top_genes.png"),
                       dpi=150, bbox_inches="tight")
plt.close(clustermap_fig)

# Save the (un-scaled) subset used for the heatmap as a tab-separated file.
top_genes_path = os.path.join(OUTPUT_DIR, "top_variable_genes.tsv")
df_top.to_csv(top_genes_path, sep="\t")

# ---------- Optional: Save PCA components and transformed matrix ----------
# Sample coordinates: one row per sample, one column per principal component.
score_columns = [f"PC{i+1}" for i in range(X_pca.shape[1])]
pca_scores = pd.DataFrame(X_pca, index=df.index, columns=score_columns)
pca_scores.to_csv(os.path.join(OUTPUT_DIR, "samples_pca_coordinates.csv"))

# Component matrix: one row per principal component, one column per gene.
component_rows = [f"PC{i+1}" for i in range(pca.components_.shape[0])]
pca_loadings = pd.DataFrame(pca.components_, index=component_rows, columns=df.columns)
pca_loadings.to_csv(os.path.join(OUTPUT_DIR, "pca_components.csv"))

# ---------- PRINT SUMMARY ----------
print("EDA outputs saved to:", OUTPUT_DIR)
print("Files created:")
# List everything currently in the output directory, in alphabetical order.
created_files = sorted(os.listdir(OUTPUT_DIR))
for filename in created_files:
    print(f" - {filename}")


Data shape (samples x genes): (903, 7337)
KMeans (k=4) silhouette score: 0.5323




EDA outputs saved to: eda_outputs
Files created:
 - clustermap_top_genes.png
 - dendrogram_samples.png
 - kmeans_labels.csv
 - pca_components.csv
 - pca_cumulative_variance.png
 - pca_explained_variance.csv
 - pca_kmeans_k4.png
 - pca_pc1_pc2.png
 - samples_pca_coordinates.csv
 - top_variable_genes.tsv
