In [1]:
import time
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse.linalg import svds
import pyarrow as pa

print(f" [i] Start analyzing, total 8 steps...")
print(f" [i] Setting up...")

# 強制 Qt 在 Linux 環境下使用 X11
os.environ["QT_QPA_PLATFORM"] = "xcb"

# 防止 NumPy 佔用過多記憶體
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"

# 固定隨機種子確保可重現性
np.random.seed(42)
os.environ["PYTHONHASHSEED"] = "42"

 [i] Start analyzing, total 8 steps...
 [i] Setting up...


In [None]:
def load_hgnc_protein_coding(filepath):
    df = pd.read_csv(filepath, sep="\t")
    df = df[df["Approved symbol"].str.startswith(("RPL", "RPS"), na = False)]
    df = df.dropna(subset=["Ensembl gene ID"]) # 確保 Ensembl gene ID 不是 NaN
    # print(df)
    return dict(zip(df["Ensembl gene ID"], df["Approved symbol"]))

# 設定本地 HGNC 資料檔案
print(f" [i] Reading HGNC protein coding gene...")
hgnc_file = r"~/gct_data/hgnc_protein_coding.tsv"
ensembl_to_symbol = load_hgnc_protein_coding(hgnc_file)
print(f" [i] Ribosomal Protein gene count: {len(ensembl_to_symbol)}")

def load_gct(filepath):
    """
    逐行解析 GCT 檔案，確保數據格式一致，避免記憶體使用過高。
    """
    print(" [1] Detecting column names...")
    with open(filepath, "r") as f:
        lines = f.readlines()
    header_line = lines[2]  # `.gct` 的標題通常在第 3 行（index 2）
    column_names = header_line.strip().split("\t")
    gene_id_column = column_names[0]
    print(f" [2] Detected Gene_ID column: {gene_id_column}")

    print(" [3] Streaming GCT data...", end = "\n")
    print(" [i] This may take a few minutes, please wait...", end = "\n")
    def process_chunk(chunk):
        chunk = chunk.rename(columns={gene_id_column: "Gene_ID"})
        chunk["Gene_ID"] = chunk["Gene_ID"].astype(str)
        chunk.set_index("Gene_ID", inplace=True)

        # 🔹 確保基因 ID 的格式與 ensembl_to_symbol 一致（去掉 .X 後綴）
        chunk.index = chunk.index.str.split('.').str[0]

        # 🔹 確保 index 不是 None
        if chunk.index is None or chunk.empty:
            return pd.DataFrame()

        return chunk[chunk.index.isin(ensembl_to_symbol)]

    chunk_size = 10000
    filtered_data = []

    with pd.read_csv(filepath, sep="\t", skiprows=2, chunksize=chunk_size) as reader:
        for chunk in reader:
            processed_chunk = process_chunk(chunk)
            filtered_data.append(pa.Table.from_pandas(processed_chunk))

    print(" [4] Merging processed data...")
    final_table = pa.concat_tables(filtered_data)
    df = final_table.to_pandas()
    if "Description" in df.columns:
        df.drop(columns=["Description"], inplace=True)
    df = df.set_index("Gene_ID", drop=True)
    df = df.T
    
    print(df.shape)
    
    return df

 [i] Reading HGNC protein coding gene...
 [i] Ribosomal Protein gene count: 1765


In [None]:
# 預處理數據
def preprocess_data(df):
    print(" [5] Pre-processing data...")
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    df = df.select_dtypes(include=[np.number])  # 只保留數值欄位
    data_scaled = scaler.fit_transform(df)
    return data_scaled, data_scaled.shape

In [4]:
# PCA 降維
def perform_pca_np(data, n_components=50):
    print(" [6] Performing PCA_NP...")
    data = data.astype(np.float32)
    mean = np.mean(data, axis=0)
    data_centered = data - mean
    U, S, Vt = svds(data_centered, k=n_components)
    return np.dot(data_centered, Vt.T)

In [5]:
# t-SNE 降維
def perform_tsne_np(data, n_components=2, perplexity=30, learning_rate=500, max_iter=2000):
    print(" [7] Performing t-SNE_NP...")
    from sklearn.manifold import TSNE
    tsne = TSNE(n_components=n_components, perplexity=perplexity, learning_rate=learning_rate, max_iter=max_iter)
    return tsne.fit_transform(data)

In [6]:
# 繪製 t-SNE 圖形
def plot_tsne(tsne_results, output_path):
    print(" [8] Drawing t-SNE plot...")
    plt.figure(figsize=(19.2, 10.8))
    sns.scatterplot(x=tsne_results[:, 0], y=tsne_results[:, 1], palette='Set2', s=50, edgecolor="black", alpha=0.8)
    plt.xlabel("t-SNE dimension 1")
    plt.ylabel("t-SNE dimension 2")
    plt.title("t-SNE visualization of GTEx Data")
    plt.legend(title="Tissues", bbox_to_anchor=(1, 1), loc='upper left')
    plt.tight_layout()  # 確保圖例不會超出畫布
    plt.savefig(output_path, dpi=300)
    print(f" [i] t-SNE figure is saved as: {output_path}")

In [7]:
# 主執行流程
if __name__ == "__main__":
    start = time.time()
    gct_file = r"/home/terry_0714/gct_data/GTEx_Analysis_2022-06-06_v10_RNASeQCv2.4.2_gene_tpm_non_lcm.gct"
    output_image = r"/home/terry_0714/tsne_plot/tsne_plot_new_5.png"
    # 讀取 GCT 檔案
    df = load_gct(gct_file)
    
    # 資料預處理
    processed_data, shape = preprocess_data(df)
    
    # 資料降維
    pca_data = perform_pca_np(processed_data)
    
    # t-SNE 降維
    tsne_results = perform_tsne_np(pca_data)
    
    # 作圖
    plot_tsne(tsne_results, output_image)
    print(" [i] t-SNE analysis is completed.")

    end = time.time()
    elapsed_time = int(end - start)
    hours = elapsed_time // 3600
    minutes = elapsed_time % 3600 // 60
    seconds = elapsed_time % 60

    print(f" [i] Time spent: {hours} hours, {minutes} minutes and {seconds} seconds.")

 [1] Detecting column names...
 [2] Detected Gene_ID column: Name
 [3] Streaming GCT data...
 [i] This may take a few minutes, please wait...
 [4] Merging processed data...
                     Description  GTEX-1117F-0005-SM-HL9SH  \
Gene_ID                                                      
ENSG00000236679        RPL23AP24                  0.000000   
ENSG00000116251            RPL22                 21.767600   
ENSG00000232848  ENSG00000232848                  0.040991   
ENSG00000234619          RPL7P11                  0.000000   
ENSG00000224315           RPL7P7                  0.000000   

                 GTEX-1117F-0011-R10b-SM-GI4VE  GTEX-1117F-0011-R11b-SM-GIN8R  \
Gene_ID                                                                         
ENSG00000236679                       0.105523                       0.153814   
ENSG00000116251                     100.741000                     101.544000   
ENSG00000232848                       0.152349                      

ValueError: could not convert string to float: 'RPL23AP24'