# Explainable AI for B-ALL Subtype Classification Using RNA-Seq Data

In [7]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## Get all data

### Generate the first rows (gene name and gene type for each gene ID)

In [27]:
# Read one sample file to obtain the per-gene annotation columns.
# header=1 makes the file's second line the column header.
df = pd.read_csv(
    '7e8776a3-a56c-4ebc-a32b-d8cb7b7cea14.rna_seq.augmented_star_gene_counts.tsv',
    sep='\t',
    header=1,
)

# Skip rows 0-3 (presumably STAR's N_* summary rows -- TODO confirm), keep the
# annotation columns, then pivot so every gene_id becomes a column and the
# remaining rows are `gene_name` and `gene_type`.
df_firstrows = (
    df.loc[4:, ['gene_id', 'gene_name', 'gene_type']]
    .set_index('gene_id')
    .T
)
df_firstrows

gene_id,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000288661.1,ENSG00000288662.1,ENSG00000288663.1,ENSG00000288665.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288671.1,ENSG00000288674.1,ENSG00000288675.1
gene_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,AL451106.1,AC092910.4,AC073611.1,AC136977.1,AC078856.1,AC008763.4,AL592295.6,AC006486.3,AL391628.1,AP006621.6
gene_type,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,...,protein_coding,lncRNA,lncRNA,lncRNA,lncRNA,protein_coding,lncRNA,protein_coding,protein_coding,protein_coding


### Populate all the rows

In [45]:
# Directory that holds the downloaded GDC files; it is searched recursively.
base_dir = Path.home() / 'Downloads' / 'GDC'

# Sort the paths so the row order of the combined table does not depend on
# filesystem traversal order.
tsv_files = sorted(base_dir.rglob('*.tsv'))
if not tsv_files:
    # Fail with an actionable message instead of pandas'
    # "No objects to concatenate".
    raise FileNotFoundError(f'No .tsv files found under {base_dir}')

# Number of leading rows skipped in every file before the per-gene rows start
# (presumably STAR's N_* summary rows -- TODO confirm against the files).
N_SUMMARY_ROWS = 4


def read_stranded_second(path):
    """Load the `stranded_second` counts of one sample file.

    Parameters
    ----------
    path : pathlib.Path
        Tab-separated counts file whose second line is the column header and
        which contains `gene_id` and `stranded_second` columns.

    Returns
    -------
    pandas.Series
        Counts indexed by `gene_id`, named after the part of the file name
        that precedes the first dot (used as the sample ID).
    """
    counts = pd.read_csv(
        path,
        sep='\t',
        header=1,
        usecols=['gene_id', 'stranded_second'],
    )
    per_gene = counts.iloc[N_SUMMARY_ROWS:].set_index('gene_id')['stranded_second']
    return per_gene.rename(path.stem.split('.')[0])


# One Series per sample. Local names are distinct from `df`, so the reference
# frame loaded earlier in the notebook is not overwritten.
sample_counts = [read_stranded_second(tsv_path) for tsv_path in tsv_files]

# Align the samples on gene_id as columns, then flip so each row is a sample.
# Building from Series keeps the counts numeric rather than object dtype.
combined_df = pd.concat(sample_counts, axis=1).T
combined_df.index.name = 'sample_id'

combined_df


gene_id,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000288661.1,ENSG00000288662.1,ENSG00000288663.1,ENSG00000288665.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288671.1,ENSG00000288674.1,ENSG00000288675.1
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ee972b33-01e8-476a-ad1d-d75353333c43,6,0,1702,266,139,125,16,14,478,524,...,0,11,25,0,7,0,48,0,0,14
a063fd75-4ca6-468d-9134-86ff468fb1a8,18,0,896,574,572,501,37,109,1811,3548,...,0,5,23,0,11,0,39,0,1,26
4718568e-7816-4adf-96bd-7f5ad7944295,5,0,1690,329,245,102,6,13,992,3706,...,0,13,50,0,11,0,71,0,0,26
4082cc4c-5c76-4f59-b7cf-e20e844485fa,68,0,782,485,342,150,12,48,1502,2314,...,0,6,15,0,1,0,37,0,4,16
7e8776a3-a56c-4ebc-a32b-d8cb7b7cea14,39,0,1372,477,633,4958,393,1381,3125,1794,...,0,5,14,0,0,0,50,0,5,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fe9ca1c7-58be-4123-8742-68843ff9881b,25,0,1075,278,89,288,20,51,929,680,...,0,11,26,0,4,0,31,0,5,23
fff4dfba-2d68-4c05-9d1d-70f410afe6d4,23,1,1307,284,255,327,31,53,1206,937,...,0,6,33,0,12,0,31,0,1,49
c2f59459-f7e1-48ae-b1e0-7720085ca685,1,0,1475,208,276,47,3,16,723,1381,...,0,7,16,0,17,1,34,0,0,30
571b4d3b-fce0-403b-a99c-2e58c338f70d,0,0,976,214,411,534,0,0,362,5076,...,0,0,96,0,109,0,0,0,0,8


### Create full table

In [46]:
# Stack the two annotation rows (gene_name, gene_type) on top of the
# per-sample count rows; the existing row labels are kept as the index.
annotation_and_counts = [df_firstrows, combined_df]
final_df = pd.concat(annotation_and_counts)
final_df.head()

gene_id,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000288661.1,ENSG00000288662.1,ENSG00000288663.1,ENSG00000288665.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288671.1,ENSG00000288674.1,ENSG00000288675.1
gene_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,AL451106.1,AC092910.4,AC073611.1,AC136977.1,AC078856.1,AC008763.4,AL592295.6,AC006486.3,AL391628.1,AP006621.6
gene_type,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,protein_coding,...,protein_coding,lncRNA,lncRNA,lncRNA,lncRNA,protein_coding,lncRNA,protein_coding,protein_coding,protein_coding
ee972b33-01e8-476a-ad1d-d75353333c43,6,0,1702,266,139,125,16,14,478,524,...,0,11,25,0,7,0,48,0,0,14
a063fd75-4ca6-468d-9134-86ff468fb1a8,18,0,896,574,572,501,37,109,1811,3548,...,0,5,23,0,11,0,39,0,1,26
4718568e-7816-4adf-96bd-7f5ad7944295,5,0,1690,329,245,102,6,13,992,3706,...,0,13,50,0,11,0,71,0,0,26


In [47]:
# Write the combined table (annotation rows + per-sample counts) to a CSV
# file in the current working directory; the index is written as the first column.
output_csv = 'rna_seq_stranded_second.csv'
final_df.to_csv(output_csv)