# Explainable AI for B-ALL Subtype Classification Using RNA-Seq Data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Directory searched for the input .tsv files: the 'GDC' folder located in
# the current working directory
base_dir = Path.cwd().joinpath('GDC')

## Get all data

### Generate the first rows

In [None]:
# Single counts file used as the source of the gene annotation columns
tsv_file_path = base_dir.joinpath('7e8776a3-a56c-4ebc-a32b-d8cb7b7cea14.rna_seq.augmented_star_gene_counts.tsv')

# header=1: the column names are taken from the second line of the file
df = pd.read_csv(tsv_file_path, sep='\t', header=1)

# From row label 4 onward, keep the annotation columns, index them by gene_id
# and transpose. The result has one column per gene_id and two rows
# (gene_name, gene_type).
annotation_columns = ['gene_id', 'gene_name', 'gene_type']
df_firstrows = df.loc[4:, annotation_columns].set_index('gene_id').T
df_firstrows

### Populate all the rows

In [None]:
# Get all .tsv files recursively. Sorting makes the row order of the combined
# table reproducible instead of depending on filesystem traversal order.
tsv_files = sorted(base_dir.rglob('*.tsv'))
if not tsv_files:
    # Fail early with a clear message rather than inside pd.concat
    raise FileNotFoundError(f'No .tsv files found under {base_dir}')

# One Series of stranded_second counts per file, named after its sample id
count_series = []

for file in tsv_files:
    # Parse only the two columns that are used; header=1 takes the column
    # names from the second line of the file.
    counts = pd.read_csv(
        file,
        sep='\t',
        header=1,
        usecols=['gene_id', 'stranded_second'],
        index_col='gene_id',
    )['stranded_second']

    # Skip the first four data rows, as done for df_firstrows.
    # NOTE(review): presumably the STAR summary rows (N_unmapped, ...) - confirm
    counts = counts.iloc[4:]

    # Identify the sample by the file name up to its first dot
    count_series.append(counts.rename(file.stem.split('.')[0]))

# Combine all into one DataFrame: one column per sample, then transpose so
# that each sample is a row and each gene_id a column. Building the table
# from numeric Series keeps the counts numeric; transposing a frame that
# also holds the gene_id strings would turn every count into dtype object.
combined_df = pd.concat(count_series, axis=1).T

# Label the row index, which holds the sample identifiers
combined_df.index.name = 'sample_id'

combined_df


### Create full table

In [None]:
# Stack the gene annotation rows on top of the per-sample count rows; the
# existing row labels of both tables are kept (ignore_index defaults to False)
final_df = pd.concat([df_firstrows, combined_df])
# Preview the first five rows (the default for head)
final_df.head()

In [None]:
# Write the full table, including its row labels, to a CSV file; the relative
# path is resolved against the current working directory
output_csv = 'rna_seq_stranded_second.csv'
final_df.to_csv(output_csv)