In [1]:
import pandas as pd
import pathlib
import sys
from sklearn.preprocessing import StandardScaler

script_directory = pathlib.Path("../utils/").resolve()
sys.path.insert(0, str(script_directory))
from data_loader import load_train_test_data

In [2]:
def split_rna_seq(rna_data, split_sample_ids, split_name, output_dir, top_genes):
    """
    Filters RNASeq data for a specific split (train, test, val) using precomputed top genes.

    Rows are returned in the order they appear in ``rna_data`` (not in the order of
    ``split_sample_ids``), and IDs that do not occur in ``rna_data`` are simply absent
    from the result.

    Args:
        rna_data (pd.DataFrame): Full RNASeq data with 'SampleID' and gene columns.
        split_sample_ids (list): List of SampleIDs for the current split.
        split_name (str): Name of the split (e.g., 'train', 'test', 'val').
        output_dir (str or None): Directory to save the filtered split. It is created
            (including parents) when missing. A falsy value skips saving.
        top_genes (list-like): Gene column names to keep (a list or a pd.Index).
    Returns:
        pd.DataFrame: Filtered RNASeq data for the given split, with 'SampleID' first
        followed by the genes in the order given by ``top_genes``.
    Raises:
        KeyError: If any requested gene is not a column of ``rna_data``.
    """
    print(f"\nProcessing {split_name} split...")

    gene_columns = list(top_genes)

    # Fail early with a readable message instead of pandas' generic "not in index".
    missing_genes = [gene for gene in gene_columns if gene not in rna_data.columns]
    if missing_genes:
        raise KeyError(
            f"{len(missing_genes)} requested gene(s) missing from rna_data, "
            f"e.g. {missing_genes[:5]}"
        )

    # Select rows of this split and the wanted columns in one step; boolean-mask
    # indexing already returns a new frame, so no intermediate copy is needed.
    in_split = rna_data['SampleID'].isin(split_sample_ids)
    filtered_rna_split = rna_data.loc[in_split, ['SampleID'] + gene_columns]

    # Save the filtered split
    if output_dir:
        output_path = pathlib.Path(output_dir) / f"RNASeq_{split_name}_filtered.parquet"
        # to_parquet does not create missing directories.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        filtered_rna_split.to_parquet(output_path, index=False)
        print(f"Saved {split_name} split to {output_path}")

    # Report the shape whether or not the split was written to disk.
    print(filtered_rna_split.shape)

    return filtered_rna_split

In [3]:
def zscore_normalize_split(rna_split_data, split_name, output_dir=None, scaler=None, fit=True):
    """
    Z-score normalizes RNASeq data for a given split.

    By default a new StandardScaler is fitted on the split that is passed in, so
    every split is standardized with its own mean and standard deviation. To
    standardize held-out splits with the training statistics instead, create one
    scaler, pass it with ``fit=True`` for the training split, then pass the same
    (now fitted) scaler with ``fit=False`` for the other splits.

    Args:
        rna_split_data (pd.DataFrame): RNASeq data for the split with 'SampleID' and gene columns.
        split_name (str): Name of the split (e.g., 'train', 'test', 'val').
        output_dir (str, optional): Directory to save the z-scored data. Created
            (including parents) when missing.
        scaler (optional): Object exposing ``fit_transform`` / ``transform``
            (e.g. a StandardScaler). When None, a new StandardScaler is created.
        fit (bool, optional): If True (default) the scaler is fitted on this split
            before transforming; if False the scaler is only applied.

    Returns:
        pd.DataFrame: Z-score normalized RNASeq data for the split, with 'SampleID'
        as the first column and the original row index preserved.

    Raises:
        ValueError: If ``fit`` is False but no scaler was supplied.
    """
    if scaler is None and not fit:
        raise ValueError("fit=False requires an already fitted scaler to be passed in")

    print(f"\nZ-Score Normalizing {split_name} split...")

    # Separate SampleID and gene data
    sample_ids = rna_split_data['SampleID']
    gene_data = rna_split_data.drop(columns=['SampleID'])

    # Z-score normalization: either fit on this split or reuse existing statistics
    if scaler is None:
        scaler = StandardScaler()
    scaled_values = scaler.fit_transform(gene_data) if fit else scaler.transform(gene_data)

    zscored_data = pd.DataFrame(
        scaled_values,
        columns=gene_data.columns,
        index=gene_data.index
    )

    # Add SampleID back to the z-scored data (positional, so the index need not align)
    zscored_data.insert(0, 'SampleID', sample_ids.values)

    # Save z-scored data if output directory is specified
    if output_dir:
        output_path = pathlib.Path(output_dir) / f"RNASeq_{split_name}_zscored.parquet"
        # to_parquet does not create missing directories.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        zscored_data.to_parquet(output_path, index=False)
        print(f"Saved {split_name} split to {output_path}")

    print(zscored_data.head())
    print(zscored_data.shape)
    return zscored_data

In [4]:
# Read the RNASeq expression table and give the sample-identifier column its working name
rna_seq_data = pd.read_parquet("../6.RNAseq/data/RNASeq.parquet").rename(
    columns={'Unnamed: 0': 'SampleID'}
)

# Gene dependency data: the three splits plus the per-gene statistics table
data_directory = pathlib.Path("../0.data-download/data").resolve()

train_df, test_df, val_df, load_gene_stats = load_train_test_data(
    data_directory,
    train_or_test="all",
    load_gene_stats=True,
    zero_one_normalize=False,
    drop_columns=False,
)

# Wrap each split in a DataFrame
train_data, test_data, val_data = (
    pd.DataFrame(split_frame) for split_frame in (train_df, test_df, val_df)
)

FileNotFoundError: [Errno 2] No such file or directory: '../6.RNAseq/data/RNASeq.parquet'

In [None]:
# Preview the first rows of the loaded RNASeq table (one row per SampleID, one column per gene)
rna_seq_data.head()

Unnamed: 0,SampleID,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),FUCA2 (2519),GCLC (2729),...,CDR1 (1038),ENSG00000288643,ENSG00000288645,ENSG00000288656,ENSG00000288658,ENSG00000288661,ENSG00000288669,ENSG00000288671,ENSG00000288674,ENSG00000288675
0,ACH-001113,4.361066,0.0,7.39309,2.873813,4.497612,0.028569,1.244887,3.06695,6.529665,...,0.0,0.0,0.0,1.244887,0.584963,0.0,0.0,0.0,0.555816,0.189034
1,ACH-001289,4.578939,0.584963,7.11676,2.580145,3.517276,0.0,0.189034,3.823749,4.232661,...,0.0,0.0,0.0,0.0,0.659925,0.0,0.0,0.0,0.275007,0.516015
2,ACH-001339,3.160275,0.0,7.388103,2.397803,4.246408,0.056584,1.316146,6.697246,3.69265,...,0.0,0.0,0.0,0.713696,0.189034,0.0,0.084064,0.0,0.263034,0.176323
3,ACH-001538,5.094236,0.0,7.160174,2.606442,3.082362,0.0,5.876517,6.173927,4.497612,...,0.0,0.0,0.0,0.62293,0.0,0.0,0.0,0.0,0.333424,0.111031
4,ACH-001794,3.889474,0.056584,6.777946,1.978196,3.166715,0.632268,6.962549,7.104442,4.260026,...,0.0,0.0,0.0,0.056584,0.545968,0.0,0.0,0.0,0.214125,1.049631


In [None]:
# Set the number of top genes by variance to select
top_n_genes = 2000

# Identify common genes between RNASeq and dependency data
dependency_genes = set(load_gene_stats['gene_ID'].unique())  # Extract gene names from dependency data
rna_genes = set(rna_seq_data.columns) - {'SampleID'}         # Extract gene columns (excluding SampleID)
common_genes = dependency_genes.intersection(rna_genes)

# An empty overlap almost certainly means the gene identifiers are formatted
# differently in the two tables; stop here rather than produce gene-less splits.
if not common_genes:
    raise ValueError("No genes are shared between the RNASeq columns and load_gene_stats['gene_ID']")

# Filter RNASeq data to include only common genes.
# The set is sorted first: iteration order of a set of strings can differ between
# interpreter runs, which would make the column order (and the tie-breaking inside
# nlargest below) non-reproducible.
rna_common_data = rna_seq_data[['SampleID'] + sorted(common_genes, key=str)]

# Calculate gene variances globally
# NOTE(review): the variance is computed over every sample in rna_seq_data, i.e. before
# the train/test/val split is applied -- confirm that selecting genes on all samples is intended.
gene_variances = rna_common_data.drop(columns=['SampleID']).var(axis=0)

# Select top N genes by variance
top_genes = gene_variances.nlargest(top_n_genes).index

# Report the number actually selected; it is smaller than top_n_genes when fewer
# common genes are available.
print(f"Top {len(top_genes)} genes selected")


Top 2000 genes selected


In [None]:
# Describe each split by its name and the ModelIDs that belong to it
splits = [
    {"name": split_label, "sample_ids": split_frame['ModelID'].tolist()}
    for split_label, split_frame in (("train", train_df), ("test", test_df), ("val", val_df))
]

# Where the filtered per-split parquet files are written
output_directory = "../6.RNAseq/data"

# Filter the RNASeq table down to each split's samples and the selected genes,
# keeping the resulting frames keyed by split name
split_rna_results = {}
for split in splits:
    split_rna_results[split["name"]] = split_rna_seq(
        rna_common_data, split["sample_ids"], split["name"], output_directory, top_genes
    )


Processing train split...
Saved train split to ../6.RNAseq/data/RNASeq_train_filtered.parquet
(211, 2001)

Processing test split...
Saved test split to ../6.RNAseq/data/RNASeq_test_filtered.parquet
(44, 2001)

Processing val split...
Saved val split to ../6.RNAseq/data/RNASeq_val_filtered.parquet
(47, 2001)


In [None]:
# Z-score normalize the already split RNASeq data and save the results
output_directory = "../6.RNAseq/data"

# NOTE(review): zscore_normalize_split fits a new StandardScaler on whatever frame it
# receives, so test and val are standardized with their own mean/std rather than the
# training split's -- confirm this is the intended normalization.
zscored_train, zscored_test, zscored_val = (
    zscore_normalize_split(split_rna_results[split_label], split_label, output_dir=output_directory)
    for split_label in ("train", "test", "val")
)


Z-Score Normalizing train split...
Saved train split to ../6.RNAseq/data/RNASeq_train_zscored.parquet
     SampleID  KRT19 (3880)  SPARC (6678)  C19orf33 (64073)  FN1 (2335)  \
5  ACH-002023      1.626926     -0.407555          1.233254   -1.137089   
6  ACH-001655     -0.870350      1.367515         -0.942039    1.788395   
7  ACH-001098      1.515919     -1.177483          1.673581   -0.530605   
8  ACH-002040     -0.880072      0.852443         -1.017840    0.631701   
9  ACH-001375      1.638071     -1.137870          1.262895   -0.148546   

   S100A6 (6277)  SLPI (6590)  VIM (7431)  FXYD3 (5349)  TGFBI (7045)  ...  \
5       0.825111     0.513542   -0.788164      0.794122      0.747557  ...   
6       0.640187    -0.584342    1.324625     -0.868910      1.103258  ...   
7       1.034229     1.219532   -1.075808      1.023206      0.280108  ...   
8       0.994831    -0.773139    0.701512      1.499840     -0.568665  ...   
9       0.829127     1.366579   -0.996571      1.439240 