In [1]:
import pandas as pd
import os

In [2]:
# --- CONFIGURATION ---
# 1. Path to your VALID PATIENTS list (created in the previous step)
valid_ids_path = r"F:\FYP_Preparation\FYP_Data\FYP_Final_Dataset\valid_patient_ids.csv"

# 2. Path to the RNA-Seq file (Genomics)
# Note: Based on your screenshot, this is the file name
genomics_path = r"F:\FYP_Preparation\FYP_Data\lgg_tcga\data_mrna_seq_v2_rsem.txt"

# 3. Output path
output_path = r"F:\FYP_Preparation\FYP_Data\FYP_Final_Dataset\clean_genomics_data.csv"

In [3]:
print("Step 1: Loading Patient IDs...")
valid_patients_df = pd.read_csv(valid_ids_path)
valid_ids_list = valid_patients_df['PATIENT_ID'].tolist()
print(f"Targeting {len(valid_ids_list)} patients.")

Step 1: Loading Patient IDs...
Targeting 110 patients.


In [4]:
print("Step 2: Loading Genomics Data (This might take 10-20 seconds)...")
# handling potential error lines by skipping them
try:
    genomics_df = pd.read_csv(genomics_path, sep='\t')
except Exception as e:
    print(f"Error reading file: {e}")
    exit()

Step 2: Loading Genomics Data (This might take 10-20 seconds)...


The genomics file usually looks like:

Hugo_Symbol  | Entrez_Gene_Id | TCGA-DU-6407-01 | TCGA-DU-6408-01 ...

We need to Transpose it so Rows = Patients, Columns = Genes

In [5]:
# 1. Check if the columns match our Patient IDs
# TCGA columns often have extra suffixes like '-01' (Sample Type). We need to trim them.
print("Step 3: Matching columns to patients...")

# Create a dictionary to map "TCGA-CS-4941-01" -> "TCGA-CS-4941"
found_columns = []
rename_map = {}

for col in genomics_df.columns:
    # Check if the first 12 characters (e.g., TCGA-CS-4941) match our list
    short_id = col[:12] 
    if short_id in valid_ids_list:
        found_columns.append(col)
        rename_map[col] = short_id

# Filter the dataframe to keep only found patients + the Gene Name column
final_cols = ['Hugo_Symbol'] + found_columns
genomics_df = genomics_df[final_cols]

# Rename columns to match our clean IDs
genomics_df = genomics_df.rename(columns=rename_map)

print(f"Found genomics data for {len(found_columns)} of our patients.")

Step 3: Matching columns to patients...
Found genomics data for 115 of our patients.


In [6]:
# Step 4: Transpose (Flip) the data
# Now: Rows = Genes. We want Rows = Patients.
genomics_df = genomics_df.set_index('Hugo_Symbol').T

# Step 5: Select ONLY Important Brain Cancer Genes (Feature Selection)
# This saves your model from overfitting.
target_genes = ['IDH1', 'TP53', 'ATRX', 'PTEN', 'EGFR', 'CIC', 'FUBP1', 'NOTCH1']
available_genes = [g for g in target_genes if g in genomics_df.columns]

print(f"Extracting key genes: {available_genes}")
final_genomics_df = genomics_df[available_genes]

# Reset index so 'PATIENT_ID' becomes a column
final_genomics_df.reset_index(inplace=True)
final_genomics_df = final_genomics_df.rename(columns={'index': 'PATIENT_ID'})

Extracting key genes: ['IDH1', 'TP53', 'EGFR', 'CIC', 'FUBP1', 'NOTCH1']


In [7]:
# Step 6: Save
final_genomics_df.to_csv(output_path, index=False)
print(f"DONE! Saved clean genomics data to: {output_path}")
print(final_genomics_df.head())

DONE! Saved clean genomics data to: F:\FYP_Preparation\FYP_Data\FYP_Final_Dataset\clean_genomics_data.csv
Hugo_Symbol    PATIENT_ID       IDH1       TP53        EGFR        CIC  \
0            TCGA-CS-4941  1961.0076  1310.5590  48018.0538  3306.0732   
1            TCGA-CS-4942  1523.5294  1290.3114   3437.6920  1328.0277   
2            TCGA-CS-4943  2645.7675  2756.3067   6477.4479  2105.4622   
3            TCGA-CS-4944  1293.6578   494.7674   1741.5156  2003.7826   
4            TCGA-CS-5393  1765.1806  1233.0038  11177.2927  2648.7795   

Hugo_Symbol      FUBP1      NOTCH1  
0             971.0145   4669.4272  
1             761.2457   4287.5433  
2            1509.0722  12281.9174  
3             567.8981   3630.3114  
4            1275.3682   6208.9974  
