# Explainable AI for B-ALL Subtype Classification Using RNA-Seq Data

In [58]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Set base directory
base_dir = Path.cwd() / './gdc_data/gdc_unhealthy_aml/'

## Get all data

### Generate the first rows

In [59]:
tsv_file_path = base_dir / '0ab17801-20a1-4052-9ae6-585100bc410b.rna_seq.augmented_star_gene_counts.tsv'

df = pd.read_csv(tsv_file_path, sep='\t', header=1, )

# Select the needed rows
df_firstrows = df.loc[4:, ['gene_id', 'gene_name', 'gene_type', 'unstranded']]

# Flip the table
df_firstrows = df_firstrows.T

df_firstrows.columns = df_firstrows.iloc[0]  # Set the first row as the header
df_firstrows = df_firstrows[1:]  # Remove the first row now that it's the header
df_firstrows

gene_id,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000288661.1,ENSG00000288662.1,ENSG00000288663.1,ENSG00000288665.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288671.1,ENSG00000288674.1,ENSG00000288675.1
gene_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,AL451106.1,AC092910.4,AC073611.1,AC136977.1,AC078856.1,AC008763.4,AL592295.6,AC006486.3,AL391628.1,AP006621.6
gene_type,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,...,protein_coding,lncRNA,lncRNA,lncRNA,lncRNA,protein_coding,lncRNA,protein_coding,protein_coding,protein_coding
unstranded,15,0,705,571,646,12084,48,539,8622,1795,...,0,35,35,0,14,0,67,0,12,15


### Populate all the rows

In [60]:
# Get all .tsv files recursively
tsv_files = list(base_dir.rglob('*.tsv'))

# List to hold individual dataframes
df_list = []

for file in tsv_files:
    # Read the TSV file, skipping the first line (comment) and using second line as header
    df = pd.read_csv(file, sep='\t', header=1)

    # Extract UUID from filename
    # Assuming filename format: uuid.rna_seq.augmented_star_gene_counts.tsv
    uuid = file.stem.split('.')[0]

    # Filter out rows that start with 'N_' (these are summary statistics, not genes)
    df = df[~df['gene_id'].str.startswith('N_')]

    # df['gene_id'] = df['gene_id'].str.split('.').str[0]

    # Select only the columns we need: gene_id, gene_name, gene_type, and tpm_unstranded
    df_subset = df[['gene_id', 'gene_name', 'gene_type', 'tpm_unstranded']].copy()

    # Rename tpm_unstranded column to include the UUID
    df_subset = df_subset.rename(columns={'tpm_unstranded': uuid})

    df_list.append(df_subset)

# Merge all dataframes on gene_id, gene_name, and gene_type
if df_list:
    # Start with the first dataframe
    unhealthy_df = df_list[0]

    # Merge with subsequent dataframes
    for df in df_list[1:]:
        unhealthy_df = pd.merge(
            unhealthy_df,
            df,
            on=['gene_id', 'gene_name', 'gene_type'],
            how='outer'
        )

    # Reorder columns to have gene info first, then sample columns
    gene_cols = ['gene_id', 'gene_name', 'gene_type']
    sample_cols = [col for col in unhealthy_df.columns if col not in gene_cols]
    unhealthy_df = unhealthy_df[gene_cols + sample_cols]

    # Sort by gene_id for consistency
    unhealthy_df = unhealthy_df.sort_values('gene_id').reset_index(drop=True)

    print(f"Successfully processed {len(df_list)} files")
    print(f"Final dataframe shape: {unhealthy_df.shape}")
    print("\nFirst few rows:")
    print(unhealthy_df.head())

else:
    print("No TSV files found!")
    unhealthy_df = None

Successfully processed 354 files
Final dataframe shape: (60660, 357)

First few rows:
              gene_id gene_name       gene_type  \
0  ENSG00000000003.15    TSPAN6  protein_coding   
1   ENSG00000000005.6      TNMD  protein_coding   
2  ENSG00000000419.13      DPM1  protein_coding   
3  ENSG00000000457.14     SCYL3  protein_coding   
4  ENSG00000000460.17  C1orf112  protein_coding   

   a9e276bb-593a-4723-b9b9-629618a0c657  15c888bd-e07c-4505-a65e-5619702f36f5  \
0                                0.5112                                0.1393   
1                                0.0000                                0.0389   
2                               40.7959                               36.9689   
3                                4.8853                                3.3040   
4                                2.7968                                3.0878   

   6f7ee2c2-8ee0-40f9-8ef3-0d389aa22a54  7c1f22be-3f25-4ac5-afe8-309341e3eeef  \
0                                0.3135

In [56]:
# drop the first row now that it's the header

In [61]:
unhealthy_df = unhealthy_df.T
unhealthy_df.columns = unhealthy_df.iloc[0]  # Set the first row as the header
unhealthy_df = unhealthy_df[1:]  # Remove the first row now that it's the header

unhealthy_df

gene_id,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000288661.1,ENSG00000288662.1,ENSG00000288663.1,ENSG00000288665.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288671.1,ENSG00000288674.1,ENSG00000288675.1
gene_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,AL451106.1,AC092910.4,AC073611.1,AC136977.1,AC078856.1,AC008763.4,AL592295.6,AC006486.3,AL391628.1,AP006621.6
gene_type,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,...,protein_coding,lncRNA,lncRNA,lncRNA,lncRNA,protein_coding,lncRNA,protein_coding,protein_coding,protein_coding
a9e276bb-593a-4723-b9b9-629618a0c657,0.5112,0.0,40.7959,4.8853,2.7968,48.4787,0.5134,25.0639,25.1959,31.825,...,0.0,18.0369,1.6005,0.0,11.405,0.0582,4.449,0.0,0.675,2.4847
15c888bd-e07c-4505-a65e-5619702f36f5,0.1393,0.0389,36.9689,3.304,3.0878,525.7142,0.6189,31.5222,25.7086,32.2174,...,0.0,25.8425,0.393,0.0,11.2973,0.0,1.6208,0.0,0.1254,0.2735
6f7ee2c2-8ee0-40f9-8ef3-0d389aa22a54,0.3135,0.0,33.8329,2.2476,5.3636,328.0995,0.6985,19.7511,39.6436,39.5932,...,0.0,25.5938,0.499,0.0,13.9857,0.0,2.7068,0.0,0.1774,0.6432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4b7e2f66-53ee-4578-8b42-5c89c456dfb1,0.1683,0.0,47.3646,4.1737,3.9167,252.2888,0.586,19.8863,32.7143,42.4237,...,0.0,58.8551,1.0048,0.0,28.1629,0.0239,1.2148,0.0,0.1786,0.4545
905b5922-dfd0-4e49-babb-a7150b2b2d15,0.6575,0.1225,61.9196,4.3328,4.072,129.7315,0.5436,31.3193,53.3544,31.2778,...,0.0,22.5928,0.7929,0.0,22.2224,0.0,3.1507,0.0,0.2255,0.8069
54b3ce4b-fd3d-48ec-a745-c74b660daf4b,0.2247,0.0,48.8084,3.2281,2.2194,362.6486,0.2299,20.611,10.5259,16.3855,...,0.0,29.4445,1.8838,0.0,15.5948,0.017,3.1587,0.0,0.113,0.8898
8e5a3caf-28eb-4ea7-ba86-1927734b4e24,0.1304,0.089,31.3054,2.4823,4.2268,561.873,0.6258,21.6796,13.5348,44.5911,...,0.0,56.9522,0.3575,0.0,30.1638,0.0165,1.091,0.0,0.3347,0.7041


In [62]:
import os

# Create the data directory if it doesn't exist
os.makedirs('./data', exist_ok=True)
unhealthy_df.columns = unhealthy_df.columns.str.split('.').str[0]  # Remove everything after the first dot in column names
# Save to parquet without index
unhealthy_df.to_csv('./data/merged_unstranded_unhealthy_data.csv')

print("Data saved to ./data/merged_unstranded_data.csv")
unhealthy_df

Data saved to ./data/merged_unstranded_data.csv


gene_id,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSG00000288661,ENSG00000288662,ENSG00000288663,ENSG00000288665,ENSG00000288667,ENSG00000288669,ENSG00000288670,ENSG00000288671,ENSG00000288674,ENSG00000288675
gene_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,AL451106.1,AC092910.4,AC073611.1,AC136977.1,AC078856.1,AC008763.4,AL592295.6,AC006486.3,AL391628.1,AP006621.6
gene_type,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,...,protein_coding,lncRNA,lncRNA,lncRNA,lncRNA,protein_coding,lncRNA,protein_coding,protein_coding,protein_coding
a9e276bb-593a-4723-b9b9-629618a0c657,0.5112,0.0,40.7959,4.8853,2.7968,48.4787,0.5134,25.0639,25.1959,31.825,...,0.0,18.0369,1.6005,0.0,11.405,0.0582,4.449,0.0,0.675,2.4847
15c888bd-e07c-4505-a65e-5619702f36f5,0.1393,0.0389,36.9689,3.304,3.0878,525.7142,0.6189,31.5222,25.7086,32.2174,...,0.0,25.8425,0.393,0.0,11.2973,0.0,1.6208,0.0,0.1254,0.2735
6f7ee2c2-8ee0-40f9-8ef3-0d389aa22a54,0.3135,0.0,33.8329,2.2476,5.3636,328.0995,0.6985,19.7511,39.6436,39.5932,...,0.0,25.5938,0.499,0.0,13.9857,0.0,2.7068,0.0,0.1774,0.6432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4b7e2f66-53ee-4578-8b42-5c89c456dfb1,0.1683,0.0,47.3646,4.1737,3.9167,252.2888,0.586,19.8863,32.7143,42.4237,...,0.0,58.8551,1.0048,0.0,28.1629,0.0239,1.2148,0.0,0.1786,0.4545
905b5922-dfd0-4e49-babb-a7150b2b2d15,0.6575,0.1225,61.9196,4.3328,4.072,129.7315,0.5436,31.3193,53.3544,31.2778,...,0.0,22.5928,0.7929,0.0,22.2224,0.0,3.1507,0.0,0.2255,0.8069
54b3ce4b-fd3d-48ec-a745-c74b660daf4b,0.2247,0.0,48.8084,3.2281,2.2194,362.6486,0.2299,20.611,10.5259,16.3855,...,0.0,29.4445,1.8838,0.0,15.5948,0.017,3.1587,0.0,0.113,0.8898
8e5a3caf-28eb-4ea7-ba86-1927734b4e24,0.1304,0.089,31.3054,2.4823,4.2268,561.873,0.6258,21.6796,13.5348,44.5911,...,0.0,56.9522,0.3575,0.0,30.1638,0.0165,1.091,0.0,0.3347,0.7041


In [74]:
unhealthy_df = pd.read_csv('./data/merged_unstranded_unhealthy_data.csv', index_col=0, low_memory=False)
# Display the first few rows of the merged dataframe
healthy_df = pd.read_csv('./data/merged_unstranded_healthy_data.csv', index_col=0, low_memory=False)

# Match the df on the gene_id
unhealthy_df

Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSG00000288661,ENSG00000288662,ENSG00000288663,ENSG00000288665,ENSG00000288667,ENSG00000288669,ENSG00000288670,ENSG00000288671,ENSG00000288674,ENSG00000288675
gene_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,AL451106.1,AC092910.4,AC073611.1,AC136977.1,AC078856.1,AC008763.4,AL592295.6,AC006486.3,AL391628.1,AP006621.6
gene_type,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,...,protein_coding,lncRNA,lncRNA,lncRNA,lncRNA,protein_coding,lncRNA,protein_coding,protein_coding,protein_coding
a9e276bb-593a-4723-b9b9-629618a0c657,0.5112,0.0,40.7959,4.8853,2.7968,48.4787,0.5134,25.0639,25.1959,31.825,...,0.0,18.0369,1.6005,0.0,11.405,0.0582,4.449,0.0,0.675,2.4847
15c888bd-e07c-4505-a65e-5619702f36f5,0.1393,0.0389,36.9689,3.304,3.0878,525.7142,0.6189,31.5222,25.7086,32.2174,...,0.0,25.8425,0.393,0.0,11.2973,0.0,1.6208,0.0,0.1254,0.2735
6f7ee2c2-8ee0-40f9-8ef3-0d389aa22a54,0.3135,0.0,33.8329,2.2476,5.3636,328.0995,0.6985,19.7511,39.6436,39.5932,...,0.0,25.5938,0.499,0.0,13.9857,0.0,2.7068,0.0,0.1774,0.6432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4b7e2f66-53ee-4578-8b42-5c89c456dfb1,0.1683,0.0,47.3646,4.1737,3.9167,252.2888,0.586,19.8863,32.7143,42.4237,...,0.0,58.8551,1.0048,0.0,28.1629,0.0239,1.2148,0.0,0.1786,0.4545
905b5922-dfd0-4e49-babb-a7150b2b2d15,0.6575,0.1225,61.9196,4.3328,4.072,129.7315,0.5436,31.3193,53.3544,31.2778,...,0.0,22.5928,0.7929,0.0,22.2224,0.0,3.1507,0.0,0.2255,0.8069
54b3ce4b-fd3d-48ec-a745-c74b660daf4b,0.2247,0.0,48.8084,3.2281,2.2194,362.6486,0.2299,20.611,10.5259,16.3855,...,0.0,29.4445,1.8838,0.0,15.5948,0.017,3.1587,0.0,0.113,0.8898
8e5a3caf-28eb-4ea7-ba86-1927734b4e24,0.1304,0.089,31.3054,2.4823,4.2268,561.873,0.6258,21.6796,13.5348,44.5911,...,0.0,56.9522,0.3575,0.0,30.1638,0.0165,1.091,0.0,0.3347,0.7041


In [66]:
healthy_df

Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSG00000288661,ENSG00000288662,ENSG00000288663,ENSG00000288665,ENSG00000288667,ENSG00000288669,ENSG00000288670,ENSG00000288671,ENSG00000288674,ENSG00000288675
gene_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,AL451106.1,AC092910.4,AC073611.1,AC136977.1,AC078856.1,AC008763.4,AL592295.6,AC006486.3,AL391628.1,AP006621.6
gene_type,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,...,protein_coding,lncRNA,lncRNA,lncRNA,lncRNA,protein_coding,lncRNA,protein_coding,protein_coding,protein_coding
c69538be-27a1-4949-b04b-5386e4102386,0.1786,0.0,40.1829,7.5173,9.5783,365.8802,1.0444,12.1824,23.7471,38.8174,...,0.0,48.2307,0.7312,0.0,58.8256,0.029,6.7902,0.0,0.1083,1.4469
8a594674-acc2-40b1-bb50-80d103a9e50a,0.1379,0.0,39.3911,9.4773,5.1061,390.9828,1.3042,17.9262,14.5216,29.2344,...,0.0,6.6351,0.8233,0.0,9.3233,0.0,4.957,0.0,0.068,0.8802
db3fc680-b411-42af-8845-8dd1732815ea,1.2609,0.0615,48.8917,6.5555,7.2996,322.1344,2.1502,24.6434,22.8814,43.7146,...,0.0,27.2365,0.924,0.0,20.8366,0.0228,3.7682,0.0,0.151,0.7025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2cec6f6b-1f3f-4205-9a85-cff81b8f19b2,1.1156,0.0,32.0934,4.0815,6.1232,180.0875,2.9191,15.9526,31.8689,47.5256,...,0.0,29.0816,0.6124,0.0,24.3141,0.0875,2.1727,0.0,0.2086,0.9867
6e1ba022-7faf-491a-bc80-b237d4a26342,1.4128,0.0,59.7687,6.0521,10.8654,263.1912,1.8558,23.3235,39.9052,55.5391,...,0.0,25.9811,0.9878,0.0,41.1722,0.0,6.0388,0.0,0.162,0.9279
5250e024-f516-4b0b-98da-62f89e49c831,1.5918,0.0,55.9303,7.2801,12.2842,180.4202,2.3656,22.1957,52.4108,52.868,...,0.0,26.0984,0.9312,0.0,31.3751,0.0873,6.3549,0.0,0.1356,0.7767
bc124f6c-417c-44cf-bcdf-e318fb09bf36,0.5271,0.0736,40.4698,15.0224,9.7293,295.5646,2.1108,13.6896,24.2105,57.0296,...,0.0,63.3907,1.5922,0.0,97.0903,0.0545,12.3283,0.0,0.2316,1.8112


In [75]:
# drop first two rows of both dataframes
healthy_df = healthy_df.iloc[2:, :]
unhealthy_df = unhealthy_df.iloc[2:, :]

# add the rows from the healthy_df to the unhealthy_df
combined_df = pd.concat([healthy_df, unhealthy_df])

combined_df
# Write the combined dataframe to a CSV file

Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSG00000288661,ENSG00000288662,ENSG00000288663,ENSG00000288665,ENSG00000288667,ENSG00000288669,ENSG00000288670,ENSG00000288671,ENSG00000288674,ENSG00000288675
c69538be-27a1-4949-b04b-5386e4102386,0.1786,0.0,40.1829,7.5173,9.5783,365.8802,1.0444,12.1824,23.7471,38.8174,...,0.0,48.2307,0.7312,0.0,58.8256,0.029,6.7902,0.0,0.1083,1.4469
8a594674-acc2-40b1-bb50-80d103a9e50a,0.1379,0.0,39.3911,9.4773,5.1061,390.9828,1.3042,17.9262,14.5216,29.2344,...,0.0,6.6351,0.8233,0.0,9.3233,0.0,4.957,0.0,0.068,0.8802
db3fc680-b411-42af-8845-8dd1732815ea,1.2609,0.0615,48.8917,6.5555,7.2996,322.1344,2.1502,24.6434,22.8814,43.7146,...,0.0,27.2365,0.924,0.0,20.8366,0.0228,3.7682,0.0,0.151,0.7025
ee083116-281e-406c-bdd5-b425f5d2eab0,1.0037,0.0717,58.7735,7.9222,12.06,177.6982,2.3617,22.6993,39.5852,56.4823,...,0.0,35.2934,0.9847,0.0,24.3003,0.0,7.0313,0.0,0.088,0.5042
61547668-0038-4e0e-978d-80bfc6307249,1.0594,0.0,52.4929,6.9943,8.377,448.4092,2.6875,19.7084,28.0034,56.7174,...,0.0,37.078,1.3272,0.0,21.8821,0.0893,5.9587,0.0,0.111,0.6886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4b7e2f66-53ee-4578-8b42-5c89c456dfb1,0.1683,0.0,47.3646,4.1737,3.9167,252.2888,0.586,19.8863,32.7143,42.4237,...,0.0,58.8551,1.0048,0.0,28.1629,0.0239,1.2148,0.0,0.1786,0.4545
905b5922-dfd0-4e49-babb-a7150b2b2d15,0.6575,0.1225,61.9196,4.3328,4.072,129.7315,0.5436,31.3193,53.3544,31.2778,...,0.0,22.5928,0.7929,0.0,22.2224,0.0,3.1507,0.0,0.2255,0.8069
54b3ce4b-fd3d-48ec-a745-c74b660daf4b,0.2247,0.0,48.8084,3.2281,2.2194,362.6486,0.2299,20.611,10.5259,16.3855,...,0.0,29.4445,1.8838,0.0,15.5948,0.017,3.1587,0.0,0.113,0.8898
8e5a3caf-28eb-4ea7-ba86-1927734b4e24,0.1304,0.089,31.3054,2.4823,4.2268,561.873,0.6258,21.6796,13.5348,44.5911,...,0.0,56.9522,0.3575,0.0,30.1638,0.0165,1.091,0.0,0.3347,0.7041


In [76]:
combined_df.to_csv('./data/merged_unstranded_combined_data.csv')

In [77]:
combined_df = pd.read_csv('./data/merged_unstranded_combined_data.csv', index_col=0, low_memory=False)
# Display the first few rows of the combined dataframe
combined_df


Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSG00000288661,ENSG00000288662,ENSG00000288663,ENSG00000288665,ENSG00000288667,ENSG00000288669,ENSG00000288670,ENSG00000288671,ENSG00000288674,ENSG00000288675
c69538be-27a1-4949-b04b-5386e4102386,0.1786,0.0000,40.1829,7.5173,9.5783,365.8802,1.0444,12.1824,23.7471,38.8174,...,0.0,48.2307,0.7312,0.0,58.8256,0.0290,6.7902,0.0,0.1083,1.4469
8a594674-acc2-40b1-bb50-80d103a9e50a,0.1379,0.0000,39.3911,9.4773,5.1061,390.9828,1.3042,17.9262,14.5216,29.2344,...,0.0,6.6351,0.8233,0.0,9.3233,0.0000,4.9570,0.0,0.0680,0.8802
db3fc680-b411-42af-8845-8dd1732815ea,1.2609,0.0615,48.8917,6.5555,7.2996,322.1344,2.1502,24.6434,22.8814,43.7146,...,0.0,27.2365,0.9240,0.0,20.8366,0.0228,3.7682,0.0,0.1510,0.7025
ee083116-281e-406c-bdd5-b425f5d2eab0,1.0037,0.0717,58.7735,7.9222,12.0600,177.6982,2.3617,22.6993,39.5852,56.4823,...,0.0,35.2934,0.9847,0.0,24.3003,0.0000,7.0313,0.0,0.0880,0.5042
61547668-0038-4e0e-978d-80bfc6307249,1.0594,0.0000,52.4929,6.9943,8.3770,448.4092,2.6875,19.7084,28.0034,56.7174,...,0.0,37.0780,1.3272,0.0,21.8821,0.0893,5.9587,0.0,0.1110,0.6886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4b7e2f66-53ee-4578-8b42-5c89c456dfb1,0.1683,0.0000,47.3646,4.1737,3.9167,252.2888,0.5860,19.8863,32.7143,42.4237,...,0.0,58.8551,1.0048,0.0,28.1629,0.0239,1.2148,0.0,0.1786,0.4545
905b5922-dfd0-4e49-babb-a7150b2b2d15,0.6575,0.1225,61.9196,4.3328,4.0720,129.7315,0.5436,31.3193,53.3544,31.2778,...,0.0,22.5928,0.7929,0.0,22.2224,0.0000,3.1507,0.0,0.2255,0.8069
54b3ce4b-fd3d-48ec-a745-c74b660daf4b,0.2247,0.0000,48.8084,3.2281,2.2194,362.6486,0.2299,20.6110,10.5259,16.3855,...,0.0,29.4445,1.8838,0.0,15.5948,0.0170,3.1587,0.0,0.1130,0.8898
8e5a3caf-28eb-4ea7-ba86-1927734b4e24,0.1304,0.0890,31.3054,2.4823,4.2268,561.8730,0.6258,21.6796,13.5348,44.5911,...,0.0,56.9522,0.3575,0.0,30.1638,0.0165,1.0910,0.0,0.3347,0.7041


In [83]:
# d0b7f0c0-4155-4069-986e-595d66299b8b

# Find the first index that matches the UUID format
def find_first_uuid_index(df):
    for i, row in df.iterrows():
        if isinstance(row.index, str) and len(row.index) == 36 and '-' in row.index:
            return i
    return None

first_uuid_index = find_first_uuid_index(combined_df)
if first_uuid_index is not None:
    print(f"First UUID found at index: {first_uuid_index}")
else:
    print("No UUID found in the 'gene_id' column.")

No UUID found in the 'gene_id' column.


In [46]:
# Load in the metadata
metadata_path = 'metadata/unhealthy_aml.json'
metadata_df = pd.read_json(metadata_path)

#  "file_name": "1e7f268d-b650-4124-affc-57f769ef4f51.rna_seq.augmented_star_gene_counts.tsv",
# Extract the UUID from the file_name
metadata_df['uuid'] = metadata_df['file_name'].str.split('.').str[0]

# add the condition column healthy
metadata_df['condition'] = 'unhealthy'

# #   ],
#   "file_name": "1e7f268d-b650-4124-affc-57f769ef4f51.rna_seq.augmented_star_gene_counts.tsv",
#   "submitter_id": "8a07f5b2-946a-446b-8930-79ef04135f73",
#   "data_category": "Transcriptome Profiling",
#   "analysis": {
#     "workflow_version": "122a0dd1445b2664b1b40b7df7b0e2240183d712",

# Add the workflow_version column
metadata_df['analysis'] = metadata_df['analysis'].dropna()
metadata_df['workflow_version'] = metadata_df['analysis'].apply(lambda x: x.get('workflow_version', 'unknown') if isinstance(x, dict) else '')

# Select relevant columns
metadata_df = metadata_df[['uuid', 'condition', 'workflow_version']]

# drop any rows with NaN values in 'uuid' or 'workflow_version'
metadata_df = metadata_df.dropna(subset=['uuid', 'workflow_version'])

metadata_df.head()

Unnamed: 0,uuid,condition,workflow_version
0,7dd02562-87b8-4cf9-80fc-9c477c1a690a,unhealthy,61fd5ef8ab410a784da2e89eca063ca3c66998ec
3,d0b7f0c0-4155-4069-986e-595d66299b8b,unhealthy,61fd5ef8ab410a784da2e89eca063ca3c66998ec
4,ce8c7b30-5766-4f20-ac55-95794089c0bc,unhealthy,61fd5ef8ab410a784da2e89eca063ca3c66998ec
5,bf8a5612-2f71-4044-9e65-6bc74b98383c,unhealthy,61fd5ef8ab410a784da2e89eca063ca3c66998ec
6,bb558fef-2292-43a8-8840-6b1a79a4ff99,unhealthy,61fd5ef8ab410a784da2e89eca063ca3c66998ec


In [47]:
metadata_df

Unnamed: 0,uuid,condition,workflow_version
0,7dd02562-87b8-4cf9-80fc-9c477c1a690a,unhealthy,61fd5ef8ab410a784da2e89eca063ca3c66998ec
3,d0b7f0c0-4155-4069-986e-595d66299b8b,unhealthy,61fd5ef8ab410a784da2e89eca063ca3c66998ec
4,ce8c7b30-5766-4f20-ac55-95794089c0bc,unhealthy,61fd5ef8ab410a784da2e89eca063ca3c66998ec
5,bf8a5612-2f71-4044-9e65-6bc74b98383c,unhealthy,61fd5ef8ab410a784da2e89eca063ca3c66998ec
6,bb558fef-2292-43a8-8840-6b1a79a4ff99,unhealthy,61fd5ef8ab410a784da2e89eca063ca3c66998ec
...,...,...,...
351,22b865e9-87ba-4dd6-bb36-0561a26f89d5,unhealthy,61fd5ef8ab410a784da2e89eca063ca3c66998ec
352,329d5549-e3c2-4929-a64f-3a02872c3ecf,unhealthy,61fd5ef8ab410a784da2e89eca063ca3c66998ec
353,c22a8a0b-21fe-47f0-bf79-371edc0d22ab,unhealthy,61fd5ef8ab410a784da2e89eca063ca3c66998ec
354,cc3f8c96-36c7-464f-8a1b-a789a463be6d,unhealthy,61fd5ef8ab410a784da2e89eca063ca3c66998ec


In [48]:
# Write the metadata to a CSV file
metadata_df.to_csv('./data/unhealthy_aml_metadata.csv', index=False)

### Create full table

In [None]:
final_df = pd.concat([df_firstrows, combined_df], ignore_index=False)
final_df.head(5)

In [None]:
final_df.to_csv('rna_seq_stranded_second.csv')