# Library Import and Functions

In [1]:
## Import Libraries
import pandas as pd
import numpy as np
import re

## Display all rows of pandas dataframes
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [2]:
'''
name: relative_transcript_abundance

purpose: calculate relative transcript abundance

input: a dataframe with a ref_gene_id column identifying the transcript gene of origin and a cov columns with 
the coverage for the transcripts.

output: the same dataframe with median relative abundance and mean gene CPM columns added
'''

def relative_transcript_abundance(df):
    
    first=True
    
    dff = df.copy()

    # for each sample's CPM column, calculate the relative abundance of the transcript in the gene and create a new column with that value
    for col in dff.filter(regex='bam_CPM').columns:
        
        col_gene_name = col.split("_CP")[0] + "_total_gene_CPM"
        col_relative_abundance = col.split("_CP")[0] + "_relative_abundance"
    
        dff_sums = dff[["gene_id", col]].groupby("gene_id").sum()

        dff_sums[col_gene_name] = dff_sums[col].copy()

        dff_sums.drop(columns=col, inplace=True)

        if first:
            merged_dff = pd.merge(dff, dff_sums, how='inner', on="gene_id")
            merged_dff[col_relative_abundance] = ((merged_dff[col]/merged_dff[col_gene_name]) * 100)
            
        else:
            merged_dff = pd.merge(merged_dff, dff_sums, how='inner', on="gene_id")
            merged_dff[col_relative_abundance] = ((merged_dff[col]/merged_dff[col_gene_name]) * 100)
        
        first=False
        
    for col_count in merged_dff.filter(regex='bam_count').columns:
        
        col_gene_name = col_count.split("_count")[0] + "_total_gene_counts"
        
        dff_sums = merged_dff[["gene_id", col_count]].groupby("gene_id").sum()
        dff_sums[col_gene_name] = dff_sums[col_count].copy()
    
        
        dff_sums.drop(columns=col_count, inplace=True)
        
        merged_dff = pd.merge(merged_dff, dff_sums, how='inner', on="gene_id")
        
    merged_dff.fillna(value=0, inplace=True)
    
    # calculate the median relative abundance for all the samples and drop the sample specific relative abundance columns
    rel_ab_col = merged_dff.filter(regex='bam_relative_abundance').columns
    merged_dff['median_relative_abundance'] = merged_dff[rel_ab_col].copy().median(axis=1)
    merged_dff.drop(columns = rel_ab_col, inplace = True)
    
    # calculate the median gene CPM and drop sample specific gene CPM
    gene_cpm_col = merged_dff.filter(regex='bam_total_gene_CPM').columns
    merged_dff['median_gene_cpm'] = merged_dff[gene_cpm_col].copy().median(axis=1)
    merged_dff.drop(columns = gene_cpm_col, inplace = True)
        
    return merged_dff

In [3]:
'''
function name: fix_column_names

purpose: Fixing the column names, making them smaller, informative, and consistent

input: The raw counts dataframe for either genes or transcripts 

output: Same dataframe with improved column names
'''

def fix_column_names(df, is_gene=False):
    
    ## Check if this is a gene counts object
    if is_gene:
        
        ## Get count column names and create list of new column names
        count_columns = df.columns.tolist()
        list_new_names = ["gene_id"]
        
        ## gene_id comes in as index for gene counts data, make it into the first column instead
        df["gene_id"] = df.index
        cols = list(df.columns)
        cols = [cols[-1]] + cols[:-1]
        df = df[cols]
        df.reset_index(inplace=True, drop=True)
    
    ## If it is a transcript dataset
    else:
        ## Set count columns and create list of new names
        count_columns = df.columns[2:].tolist()
        list_new_names = [ "transcript_id", "gene_id"]
    
    ## Fix names one by one and add to list of new names
    for col in count_columns:
        col = col.split("_mapped")[0] + "_counts"
        list_new_names.append(col)
    
    ## Rename columns
    df.columns = list_new_names
    
    return df 

In [4]:
'''
function name: parse_df_columns

purpose: parsing the last aggregate column of the gtf/gff3 into useful columns and cleaning non-relevant columns

input: dataframe containining "raw" gtf/gff

output: dataframe containing gtf with useful columns ["gene_id", "transcript_id", etc...]
'''

def parse_df_columns(df, is_ref=True, is_transcript=False, is_prot=False):

    if is_ref:

        ## Get gene ids
        df["gene_id"] = df["other"].str.split("ne_id \"", expand=True)[1].str.split('\";', expand=True)[0]
        
        ## Get gene names
        df["gene_name"] = df["other"].str.split("gene_name \"", expand=True)[1].str.split('\";', expand=True)[0]
        
        ## Get get transcript biotype
        df["gene_biotype"] = df["other"].str.split('gene_biotype "', expand=True)[1].str.split('"', expand=True)[0]
        
        ## If is transcript get transcript id and transcript biotype
        if is_transcript:
            df["transcript_id"] = df["other"].str.split('transcript_id "', expand=True)[1].str.split('"', expand=True)[0]
            df["transcript_biotype"] = df["other"].str.split('transcript_biotype "', expand=True)[1].str.split('"', expand=True)[0]
            
            ## If is prot get protein_id
            if is_prot:
                df["protein_id"] = df["other"].str.split('protein_id "', expand=True)[1].str.split('"', expand=True)[0]
                df["ccds_id"] = df["other"].str.split('ccds_id "', expand=True)[1].str.split('"', expand=True)[0]
                df["exon_number"] = df["other"].str.split('exon_number "', expand=True)[1].str.split('"', expand=True)[0]

        ## Drop "other" column
        df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)
        

    else:

        ## Get gene ids
        df["gene_id"] = df["other"].str.split('";', expand=True)[0].str.extract("([^ \"]*$)", expand=True)

        ## Get transcript ids
        df["transcript_id"] = df["other"].str.split('transcript_id "', expand=True)[1].str.split('"', expand=True)[0]

        ## Get exon number
        df["exon_number"] = df["other"].str.split('exon_number "', expand=True)[1].str.split('"', expand=True)[0]

        ## Drop "other" column
        df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    for col in df.columns:
        df.loc[df[col].isnull(), col] = np.NaN
        

    return df

In [5]:
'''
function name: calculate_cpm

purpose: Calculate CPM for the each sample given

input: Counts dataset

output: Counts dataset with CPM columns as well
'''

def calculate_cpm(df, is_gene=False):

    ## Set count columns if dataframe is gene counts
    if is_gene:
        count_columns = df.columns[1:].tolist()
    
    ## Set count columns if dataframe is transcript counts
    else:
        count_columns = df.columns[2:].tolist()

    ## Loop through counts columns to calculate CPM and add to the dataframe
    for col in count_columns:
        
        df[col] = round(df[col], 2)
        cpm_name = col.replace("_counts", "_CPM")
        df[cpm_name] = round(((df[col]/(df[col].sum())) * 1000000), 2)
    
    return df  

In [6]:
'''
function name: get_samp_id

purpose: 

input: sample string name

output: sample string name with bam appended
'''


def get_samp_id(s):
    
    name_array = s.split('.')
    last_element = name_array[-1].split('_')[0]
    string_1 = '-'.join(name_array[:-1])
    string_2 = f"{last_element}.bam"
    return f"{string_1}.{string_2}"

In [7]:
tissues_to_use = ["Brain - Cerebellar Hemisphere",
                   "Brain - Frontal Cortex (BA9)",
                   "Brain - Putamen (basal ganglia)",
                   "Cells - Cultured fibroblasts",
                   "Heart - Atrial Appendage",
                   "Heart - Left Ventricle",
                   "Liver",
                   "Lung",
                   "Muscle - Skeletal"]

# Expression of transcripts in GTEx Long-read dataset

### Import 2023 reference and disease gene lists

In [8]:
## Open original reference
ref_2023 = pd.read_csv("../../references/Homo_sapiens.GRCh38.109.gtf", header=None, delimiter="\t", low_memory=False, 
                       names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"], comment="#")

# filter out anything with ERCC (shouldn't be there, but just in case)
ref_2023 = ref_2023.loc[~ref_2023["chr"].str.startswith("ERCC-")]

## Parse through reference to get gene names and ids
gene_ref_2023 = ref_2023.loc[ref_2023["type"]=="gene"].copy()
gene_ref_2023 = parse_df_columns(gene_ref_2023, is_ref=True)
gene_info = gene_ref_2023[['gene_id', 'gene_biotype', 'gene_name']].copy()

## Create 2023 transcript ref
transcript_ref_2023 = ref_2023.loc[ref_2023["type"]=="transcript"].copy()
transcript_ref_2023 = parse_df_columns(transcript_ref_2023, is_ref=True, is_transcript=True)
# get the transcript biotypes
transcript_biotype = transcript_ref_2023[['transcript_id', 'transcript_biotype']].copy()

## Import disease relevant genes
disease_relevant_genes = pd.read_csv("../../references/medically_relevant_genes.tsv", sep="\t")

## Brain disease genes
brain_disease_gene_ids = pd.read_csv("../../references/brain_disease_genes_only_IDs.tsv", sep="\t")
brain_disease_annotations = pd.read_csv("../../references/brain_disease_genes_with_disease.tsv", sep="\t")


## Create disease relevant list including chromosome
disease_relevant_genes_annotated = disease_relevant_genes.merge(gene_ref_2023[["gene_id", "gene_name", "chr"]], 
                                                               how="inner", on=["gene_id", "gene_name"])

## Create list of protein coding genes
protein_coding_ref_2023 = gene_ref_2023.loc[gene_ref_2023["gene_biotype"] == "protein_coding"].copy()

In [9]:
## get number of annotated isoforms per gene
annotated_n_iso_per_gene = transcript_ref_2023.copy()[['transcript_id', 'gene_id', 'gene_biotype', 'gene_name']].groupby(["gene_id", 'gene_biotype', "gene_name"]).size().reset_index(name='n_anno_tx')
annotated_n_iso_per_gene.head()

Unnamed: 0,gene_id,gene_biotype,gene_name,n_anno_tx
0,ENSG00000000003,protein_coding,TSPAN6,5
1,ENSG00000000005,protein_coding,TNMD,2
2,ENSG00000000419,protein_coding,DPM1,16
3,ENSG00000000457,protein_coding,SCYL3,5
4,ENSG00000000460,protein_coding,C1orf112,9


## Create reference for protein coding sequences

In [10]:
## Make reference only including protein coding transcripts with a defined CDS
ref_2023_cds = ref_2023.loc[ref_2023["type"]=="CDS"].copy()
ref_2023_cds = parse_df_columns(ref_2023_cds.copy(), is_ref=True, is_transcript=True, is_prot=True)

ref_2023_cds = ref_2023_cds.loc[ref_2023_cds["transcript_biotype"] == "protein_coding"].copy()

#### Create unique identifier for protein id based on individual CDS entries for the protein

In [11]:
## Create datagrame for unique protein id 
unique_protein_id = pd.DataFrame()

## Create a reference with columns as strings instead of integers so they can be concatenated
string_ref_2023_cds = ref_2023_cds[['protein_id', 'start', 'end', "chr", "strand"]].copy().astype(str)

## Join the coordinates for all CDS based on protein id to create a unique identifier
unique_protein_id = string_ref_2023_cds.groupby(['protein_id'])[["protein_id", "chr", "strand",
                                                                 "start", "end"]].transform(lambda x: '-'.join(x))

## Add protein id column
unique_protein_id["protein_id"] = unique_protein_id["protein_id"].str.split("-", expand=True)[0]

## Add unique identifier column
unique_protein_id["unique_identifier"] = "Chromosome: " + unique_protein_id["chr"].str.split("-", expand=True)[0] + "\n" \
                                            + "Strand: " + unique_protein_id["strand"].str.split("-", expand=True)[0] + "\n" \
                                            + "Start coordinates: " + unique_protein_id["start"].copy() \
                                            + "End coordinates:" + unique_protein_id["end"].copy()

## Drop all columns besided protein id and unique identifier
unique_protein_id = unique_protein_id[["protein_id", "unique_identifier"]].copy()

## Drop any duplicates
unique_protein_id.drop_duplicates(inplace=True)

## Get transcript_id column back by merging on index with ref_2023_cds
unique_protein_id = unique_protein_id.merge(ref_2023_cds["transcript_id"], left_index=True, right_index=True)

## Merge back to ref_2023_cds on protein_id and transcript id to so we can add unique identifier
ref_2023_cds = ref_2023_cds.merge(unique_protein_id, on=["protein_id", "transcript_id"], how="inner")

## Make protein coding ID into unique identifier
ref_2023_cds["protein_id"] = ref_2023_cds["unique_identifier"].copy()
ref_2023_cds.drop(columns="unique_identifier", inplace=True)

## Create final CDS reference only containing non NA valued for protein id
ref_cds = ref_2023_cds.loc[~ref_2023_cds["protein_id"].isna()].copy()

## Only keep transcript id and unique protein id in the ref_cds final reference
ref_cds = ref_cds[["transcript_id", "protein_id"]].copy().drop_duplicates()

In [12]:
## Output references with transcripts_id and unique protein_id as a tsv to the processed data folder

ref_cds.to_csv("../../tables/transcript_id_and_unique_protein_id_reference.tsv", sep="\t", index=False)

## Load transcript counts matrix and CPM normalize it

In [13]:
# samples we are not using due to replicates, low read depth, and poor PCA clustering
filter_out_samples = [ 
    "GTEX-Q2AG-0011-R11A-SM-2EBL2_rep2.FAK44637.bam", 
    "GTEX-Q2AG-0011-R11A-SM-2EBL2_rep.FAK49243.bam", 
    "GTEX-T5JC-0011-R10A-SM-2TT23.FAK91589.bam", 
    "GTEX-QEG5-0008-SM-3QHW2_exp.FAK30166.bam", 
    "GTEX-QV44-0008-SM-3QNG7_ctrl1.FAK55556.bam",
    "GTEX-QV44-0008-SM-3QNG7_exp.FAK52124.bam",
    "GTEX-RWS6-0008-SM-3QHWG_rep.FAK49207.bam", 
    "GTEX-S4Z8-0008-SM-2Y983_exp1.FAK55723.bam",
    "GTEX-S4Z8-0008-SM-2Y983_exp2.FAK47416.bam",
    "GTEX-S95S-0008-SM-3RQ8B_exp1.FAK55217.bam",
    "GTEX-S95S-0008-SM-3RQ8B_exp2.FAK47088.bam",
    "GTEX-WY7C-0008-SM-3NZB5_ctrl.FAK55679.bam",
    "GTEX-1GN1W-0226-SM-7AGLJ_rep.FAK91654.bam",
    "GTEX-WY7C-1126-SM-3GS2X_rep2.FAK49168.bam",
    "GTEX-WY7C-1126-SM-3GS2X.FAK39149.bam",
    "GTEX-Y5LM-0426-SM-3YX99.FAK52212.bam",
    "GTEX-Y5LM-0426-SM-3YX99_rep2.FAK41279.bam",
    "GTEX-14BMU-0526-SM-5CA2F.FAK44778.bam",
    "GTEX-14BMU-0526-SM-5CA2F_rep.FAK93376.bam",
    "GTEX-13QJ3-0726-SM-7LDHS.FAK49189.bam",
    "GTEX-ZT9X-1826-SM-4V2KV_rep.FAK39773.bam",
    "GTEX-ZT9X-1826-SM-4V2KV.FAK49260.bam", 
    "GTEX-WY7C-0726-SM-3GLGQ.FAK46872.bam" ]


In [None]:
## Load matrix
df = pd.read_csv("../../data/raw/GRCh38_quant_mapq10_gtf_109_and_high-confidence_GTEx_DATA/bambu_quant/counts_transcript.txt", sep="\t")
# load unique counts matrix
df_unique = pd.read_csv("../../data/raw/GRCh38_quant_mapq10_gtf_109_and_high-confidence_GTEx_DATA/bambu_quant/uniqueCounts_transcript.txt", sep="\t")
# load gtex metadata
gtex_metadata = pd.read_csv("/pscratch/mteb223_uksr/new_RNA_isoform_expression_across_tissues/data/GTEx_v9_ONT_metadata.txt", sep = "\t")

## Convert sample names to bam files
columns_to_rename = df.columns[2:]
new_col_names = {col: get_samp_id(col) for col in columns_to_rename}
df.rename(columns=new_col_names, inplace=True)
df = df[df.columns.difference(filter_out_samples)]
df_unique.rename(columns=new_col_names, inplace=True)
df_unique = df_unique[df_unique.columns.difference(filter_out_samples)]

# get the number of bambu isoforms and genes
bambu = df.copy().loc[df['TXNAME'].str.startswith('Bambu')]
print('n_isoforms')
print(bambu.shape[0])
print('n_genes')
print(len(bambu['GENEID'].unique()))
print('new_genes')
print(len(bambu.loc[bambu['GENEID'].str.startswith('Bambu')]['GENEID'].unique()))

# get the list of samples so that we can separate out the columns below by tissue
df_cols_dict = {}
for tissue in gtex_metadata["tissue_site_detail"].copy().unique():
    if tissue in tissues_to_use:
        df_cols_dict[tissue] = [el for el in gtex_metadata.loc[gtex_metadata['tissue_site_detail'] == tissue, 'bam_file'].values.tolist() if not 'direct' in el]
        df_cols_dict[tissue] = [x for x in df_cols_dict[tissue] if x not in filter_out_samples]

tx_rel_ab = pd.DataFrame(columns = ['transcript_id', 'gene_id', 'gene_name'])
df_by_tissue = {}

# for each tissue
for key in df_cols_dict:
    # grab the columns we will need for this tissue
    columns = ["TXNAME", "GENEID"] + df_cols_dict[key]
    tmp = df[columns].copy()
    
    ## Fix columns names
    tmp = fix_column_names(tmp, is_gene=False)
    
    ## Add total counts column
    tmp["total_counts"] = tmp[tmp.filter(regex='bam_count').columns].copy().sum(axis=1)
    
    ## CPM normalize counts matrix
    tmp = calculate_cpm(tmp, is_gene=False)
    tmp["median_CPM"] = round(tmp[tmp.filter(regex='bam_CPM').columns].copy().median(axis=1), 2)
    tmp.drop(columns=tmp.filter(regex='count').columns, inplace=True)
    
    tmp_u = df_unique[columns].copy()
    
    ## Fix columns names for unique counts
    tmp_u = fix_column_names(tmp_u, is_gene=False)
    ## Add median unique counts column
    tmp_u["median_unique_counts"] = tmp_u[tmp_u.filter(regex='bam_count').columns].copy().median(axis=1)
    # for the unique dataframe, keep only these columns for merging
    tmp_u = tmp_u[['transcript_id', 'gene_id', 'median_unique_counts']].drop_duplicates()

    # merge the all counts and unique counts together
    tmp = pd.merge(tmp, tmp_u, how='inner')
    # calculate the median relative transcript abundance
    tmp = relative_transcript_abundance(tmp.copy())
    # add the tissue name
    tmp['tissue_site_detail'] = key
    # add on the biotype of the transcript and gene
    tmp = tmp.merge(transcript_biotype, how='left')
    tmp = tmp.merge(gene_info, how='left')
    tmp = tmp.merge(annotated_n_iso_per_gene, how='left')
    tmp_rel_ab = tmp[['transcript_id', 'gene_id', 'gene_name', 'median_relative_abundance']].copy()
    tmp_rel_ab = tmp_rel_ab.rename(columns={"median_relative_abundance": key})
    tx_rel_ab = tx_rel_ab.merge(tmp_rel_ab, how='outer')
    # write table to file
    tmp.to_csv("../../data/processed/GTEx/GTEX_" + key + "_cpm_transcript.tsv", sep="\t", index=False)
    # filter -> transcripts must be present with at least one unique count in at least half the samples
    tmp = tmp.loc[tmp["median_unique_counts"] >= 1].copy()
    df_by_tissue[key] = tmp
    


n_isoforms
700
n_genes
630
245


In [16]:
df_by_tissue['Lung'].head()

Unnamed: 0,transcript_id,gene_id,GTEX-1211K-0826-SM-7LDFQ.FAK46515.bam_CPM,GTEX-14BMU-0526-SM-5CA2F_rep2.FAK49039.bam_CPM,GTEX-1I6K7-1226-SM-AAEQX.FAK44642.bam_CPM,GTEX-1KXAM-0426-SM-CYKMP.FAK44752.bam_CPM,GTEX-WYVS-0526-SM-3H5V7.FAK54827.bam_CPM,GTEX-ZT9X-0326-SM-4U9QG.FAK44894.bam_CPM,total_CPM,median_CPM,median_unique_counts,median_relative_abundance,median_gene_cpm,tissue_site_detail,transcript_biotype,gene_biotype,gene_name,n_anno_tx
0,BambuTx1,ENSG00000227232,0.45,1.67,3.33,2.07,0.56,1.31,1.56,1.49,6.5,73.231896,2.6,Lung,,unprocessed_pseudogene,WASH7P,1.0
1,ENST00000488147,ENSG00000227232,0.0,0.42,0.56,1.04,0.37,1.83,0.72,0.49,2.0,26.768104,2.6,Lung,unprocessed_pseudogene,unprocessed_pseudogene,WASH7P,1.0
2,BambuTx100,ENSG00000215861,3.3,0.0,1.96,3.19,2.72,3.22,2.27,2.96,9.5,15.137348,19.685,Lung,,transcribed_unprocessed_pseudogene,,
4,BambuTx97,ENSG00000215861,11.94,21.86,22.99,22.5,9.86,12.88,17.01,17.37,41.5,79.02445,19.685,Lung,,transcribed_unprocessed_pseudogene,,
8,BambuTx1006,ENSG00000125611,0.0,0.84,0.28,0.26,0.19,0.78,0.42,0.27,1.0,0.324856,86.4,Lung,,protein_coding,CHCHD5,5.0


In [17]:
# verify the number of samples being used from each tissue
samps_we_use = []
total_samples = 0
for tiss in df_cols_dict:
    # print the tissue, the sample ids, and the number of samples
    print(tiss)
    print(df_cols_dict[tiss])
    samps_we_use = samps_we_use + df_cols_dict[tiss]
    print(len(df_cols_dict[tiss]))
    total_samples = total_samples + len(df_cols_dict[tiss])
print(total_samples)

Brain - Frontal Cortex (BA9)
['GTEX-1192X-0011-R10a-SM-4RXXZ.FAK49046.bam', 'GTEX-13X6J-0011-R10b-SM-5CEKT.FAK44896.bam', 'GTEX-14BIL-0011-R10a-SM-5EQV4.FAK49209.bam', 'GTEX-15DCD-0011-R10b-SM-5S51M.FAK42101.bam', 'GTEX-QDT8-0011-R10A-SM-2FKJB.FAK49182.bam']
5
Brain - Cerebellar Hemisphere
['GTEX-11H98-0011-R11b-SM-4SFLZ.FAK46829.bam', 'GTEX-13VXU-0011-R11b-SM-5BFQZ.FAK44611.bam', 'GTEX-17F97-0011-R11b-SM-63KY2.FAK41775.bam', 'GTEX-1H3NZ-0011-R11b-SM-AUNOV.FAK49024.bam', 'GTEX-Q2AG-0011-R11A-SM-2EBL2.FAK42265.bam', 'GTEX-T5JC-0011-R11A-SM-2TT24.FAK54838.bam']
6
Brain - Putamen (basal ganglia)
['GTEX-11TTK-0011-R7b-SM-4TVFS.FAK39197.bam', 'GTEX-1313W-0011-R7b-SM-4ZL3U.FAK44754.bam', 'GTEX-13RTJ-0011-R7b-SM-5CTCB.FAK49257.bam', 'GTEX-14C5O-0011-R7b-SM-5GUPO.FAK54887.bam', 'GTEX-15ER7-0011-R7a-SM-5QYP2.FAK44704.bam', 'GTEX-T5JC-0011-R7A-SM-2TT1Z.FAK42170.bam']
6
Lung
['GTEX-1211K-0826-SM-7LDFQ.FAK46515.bam', 'GTEX-14BMU-0526-SM-5CA2F_rep2.FAK49039.bam', 'GTEX-1I6K7-1226-SM-AAEQX.FAK44642.

In [18]:
# print the relative abundance to file
tx_rel_ab.to_csv('../../tables/GTEx_expression/GTEx_tx_relative_abundance.tsv', sep='\t', index=False)
tx_rel_ab.head()

Unnamed: 0,transcript_id,gene_id,gene_name,Brain - Frontal Cortex (BA9),Brain - Cerebellar Hemisphere,Brain - Putamen (basal ganglia),Lung,Heart - Left Ventricle,Muscle - Skeletal,Heart - Atrial Appendage,Cells - Cultured fibroblasts,Liver
0,BambuTx1,ENSG00000227232,WASH7P,77.900552,86.199497,47.717894,73.231896,0.0,50.0,66.927083,37.864078,50.0
1,ENST00000488147,ENSG00000227232,WASH7P,22.099448,13.800503,0.0,26.768104,0.0,50.0,33.072917,0.0,33.333333
2,BambuTx100,ENSG00000215861,,13.155571,10.71178,8.208511,15.137348,3.768644,6.144411,3.889834,14.605671,20.725389
3,BambuTx95,ENSG00000215861,,0.0,0.0,3.385151,2.890098,0.0,0.0,0.0,0.0,7.179487
4,BambuTx97,ENSG00000215861,,54.286093,76.086699,62.771872,79.02445,91.420043,87.449384,86.443275,39.805825,51.495017


In [19]:
# calculate the total unique counts across all used samples for each transcript
filter_col = ["GENEID", "TXNAME"] + samps_we_use
unique_counts = df_unique[filter_col].copy()
unique_counts['total_unique_counts'] = unique_counts.select_dtypes(include=[np.number]).sum(axis=1)
unique_counts = pd.merge(unique_counts, gene_info, left_on='GENEID', right_on='gene_id', how='left')
unique_counts.to_csv('../../tables/GTEx_expression/GTEx_total_unique_counts_all_samples_by_tx.tsv', sep='\t', index=False)
unique_counts.head()

Unnamed: 0,GENEID,TXNAME,GTEX-1192X-0011-R10a-SM-4RXXZ.FAK49046.bam,GTEX-13X6J-0011-R10b-SM-5CEKT.FAK44896.bam,GTEX-14BIL-0011-R10a-SM-5EQV4.FAK49209.bam,GTEX-15DCD-0011-R10b-SM-5S51M.FAK42101.bam,GTEX-QDT8-0011-R10A-SM-2FKJB.FAK49182.bam,GTEX-11H98-0011-R11b-SM-4SFLZ.FAK46829.bam,GTEX-13VXU-0011-R11b-SM-5BFQZ.FAK44611.bam,GTEX-17F97-0011-R11b-SM-63KY2.FAK41775.bam,...,GTEX-WY7C-0008-SM-3NZB5_ctrl.FAK55628.bam,GTEX-R53T-0326-SM-2K8S4.FAK44579.bam,GTEX-UTHO-2426-SM-38ZXF.FAK46748.bam,GTEX-Y5LM-0426-SM-3YX99_rep.FAK49269.bam,GTEX-ZF29-2026-SM-4TVXH.FAK44621.bam,GTEX-ZPU1-0826-SM-4UJSC.FAK41797.bam,total_unique_counts,gene_id,gene_biotype,gene_name
0,ENSG00000227232,BambuTx1,4,1,10,7,21,5,1,3,...,0,2,0,1,2,0,193,ENSG00000227232,unprocessed_pseudogene,WASH7P
1,ENSG00000215861,BambuTx100,52,8,27,20,35,36,5,13,...,33,2,0,0,15,1,2901,ENSG00000215861,transcribed_unprocessed_pseudogene,
2,ENSG00000125611,BambuTx1005,31,15,22,21,29,36,13,3,...,1,0,0,0,1,0,866,ENSG00000125611,protein_coding,CHCHD5
3,ENSG00000125611,BambuTx1006,13,5,9,5,4,2,20,8,...,1,3,0,1,4,0,189,ENSG00000125611,protein_coding,CHCHD5
4,ENSG00000240356,BambuTx1009,9,7,12,5,29,5,9,4,...,64,13,1,9,1,3,734,ENSG00000240356,transcribed_processed_pseudogene,RPL23AP7


## Look at the number of isoforms expressed in tissue vs gene CPM

In [20]:
# look at the number of isoforms expressed in each tissue
cpm_vs_n_iso_diff_annotated = pd.DataFrame(columns=['gene_id', 'gene_biotype', 'median_gene_cpm', 'n_tx', 'n_anno_tx', 'n_tx_diff', 'tissue'])
# for each tissue
for tissue in df_by_tissue:
    print(tissue)
    tmp = df_by_tissue[tissue].copy()
    tmp = tmp.copy()[['transcript_id', 'gene_id', 'median_gene_cpm', 'gene_biotype', 'n_anno_tx']]
    # get the number of expressed isoforms
    n_iso_tmp = tmp.copy().groupby(["gene_id", 'gene_biotype', "median_gene_cpm", 'n_anno_tx']).size().reset_index(name='n_tx')
    n_iso_tmp['tissue'] = tissue
    n_iso_tmp['n_tx_diff'] = n_iso_tmp['n_anno_tx'] - n_iso_tmp['n_tx']
    cpm_vs_n_iso_diff_annotated = pd.concat([cpm_vs_n_iso_diff_annotated, n_iso_tmp])

cpm_vs_n_iso_diff_annotated.to_csv('../../tables/GTEx_expression/GTEx_n_iso_diff_vs_gene_CPM.tsv', sep='\t', index=False)

Brain - Frontal Cortex (BA9)
Brain - Cerebellar Hemisphere
Brain - Putamen (basal ganglia)
Lung
Heart - Left Ventricle
Muscle - Skeletal
Heart - Atrial Appendage
Cells - Cultured fibroblasts
Liver


## Make tables looking at tissue expression of isoforms

In [21]:
# initialize variables
# list of isoforms and protein_coding isoforms
df_transcripts = {}
df_ccds = {}

# create list of thresholds
list_cpm_thresh = [x/100 for x in range(0,1002)]

print(list_cpm_thresh[1])
print(list_cpm_thresh[101])
print(list_cpm_thresh[501])
print(list_cpm_thresh[1001])

## Create lists with numbers of transcripts expressed across CPM thresholds for different categories
list_2023_all_transcript_median = {}
list_2023_all_gene_median = {}
list_2023_cds_transcript_median = {}
list_2023_med_relevant_transcript_median = {}
list_2023_med_relevant_cds_transcript_median = {}
list_2023_brain_relevant_transcript_median = {}
list_2023_brain_relevant_cds_transcript_median = {}

# look at the number of isoforms expressed in each tissue
isoforms_in_tissue_0 = pd.DataFrame(columns=['transcript_id', 'gene_id'])
isoforms_in_tissue_1 = pd.DataFrame(columns=['transcript_id', 'gene_id'])
isoforms_in_tissue_5 = pd.DataFrame(columns=['transcript_id', 'gene_id'])
isoforms_in_tissue_10 = pd.DataFrame(columns=['transcript_id', 'gene_id'])
isoforms_in_tissue = pd.DataFrame(columns=['transcript_id', 'gene_id'])


values_at_cpm_threshold = pd.DataFrame(columns=['cpm_threshold', 'tissue', 
                                                'all_transcript_median', 
                                                'all_gene_median',
                                                'cds_transcript_median', 
                                                'med_relevant_transcript_median', 
                                               'med_relevant_cds_transcript_median',
                                               'brain_relevant_transcript_median',
                                               'brain_relevant_cds_transcript_median'])

# create dataframes for number of isoforms per gene (or protein-coding gene) by tissue
n_tx_per_gene_by_tissue = pd.DataFrame(columns=['gene_id', 'gene_biotype'])
n_tx_per_pc_gene_by_tissue = pd.DataFrame(columns=['gene_id', 'gene_biotype'])

# create sets of isoforms at CPM gt 1
sets_of_isoforms_at_cpm_gt_1 = {}
sets_of_pc_isoforms_at_cpm_gt_1 = {}

for tissue in df_by_tissue:
    print(tissue)
    
    tmp = df_by_tissue[tissue].copy()
    df_transcripts[tissue] = tmp.copy()
    df_ccds[tissue] = tmp.loc[tmp["transcript_id"].isin(ref_cds["transcript_id"])].copy()
    
    # create empty list for the tissue
    # all isoforms
    list_2023_all_transcript_median[tissue] = []
    # all genes
    list_2023_all_gene_median[tissue] = []
    # all protein-coding isoforms
    list_2023_cds_transcript_median[tissue] = []
    # all isoforms from medically relevant genes
    list_2023_med_relevant_transcript_median[tissue] = []
    # all isoforms from medically relevant protein-coding genes
    list_2023_med_relevant_cds_transcript_median[tissue] = []
    # all isoforms from brain disease relevant genes
    list_2023_brain_relevant_transcript_median[tissue] = []
    # all isoforms from brain disease relevant protein-coding genes
    list_2023_brain_relevant_cds_transcript_median[tissue] = []
    
    # for a range of CPM thresholds
    for i in range(0, 1002):

        cpm_thresh = i/100
        
        # get the isoforms that fit each category
        median_2023 = tmp.loc[tmp["median_CPM"] >= cpm_thresh].copy()
        cds_median_2023 = df_ccds[tissue].loc[df_ccds[tissue]["median_CPM"] >= cpm_thresh].copy()
        med_relevant_median_2023 = median_2023.loc[median_2023["gene_id"].isin(disease_relevant_genes["gene_id"])].copy()
        med_relevant_cds_median_2023 = med_relevant_median_2023.loc[med_relevant_median_2023["transcript_id"].isin(cds_median_2023["transcript_id"])].copy()
        brain_relevant_median_2023 = median_2023.loc[median_2023["gene_id"].isin(brain_disease_gene_ids["gene_id"])].copy()
        brain_relevant_cds_median_2023 = brain_relevant_median_2023.loc[brain_relevant_median_2023["transcript_id"].isin(cds_median_2023["transcript_id"])].copy()
        
        # at CPM > 0
        if cpm_thresh == 0.01:
            n_tx_per_gene_tmp = median_2023.copy()
            tmp_isoforms_in_tissue = n_tx_per_gene_tmp.copy()[['transcript_id', 'gene_id', 'gene_name', 'gene_biotype', 'median_CPM']]
            tmp_isoforms_in_tissue['threshold'] = cpm_thresh
            tmp_isoforms_in_tissue = tmp_isoforms_in_tissue.rename(columns={'median_CPM': tissue})
            isoforms_in_tissue_0 = isoforms_in_tissue_0.merge(tmp_isoforms_in_tissue, how='outer')
             
        # at CPM > 1 (in this case, because we round values to 2 decimals, CPM == 1.01 is equivalent to CPM > 1)
        if cpm_thresh == 1.01:
            n_tx_per_gene_tmp = median_2023.copy()
            # get the set of all isoforms expressed in this tissue above our thresholds
            sets_of_isoforms_at_cpm_gt_1[tissue] = set(n_tx_per_gene_tmp.copy()['transcript_id'])
            # get the median CPM for each isoform and then merge into larger dataframe
            tmp_isoforms_in_tissue = n_tx_per_gene_tmp.copy()[['transcript_id', 'gene_id', 'gene_name', 'gene_biotype', 'median_CPM']]
            tmp_isoforms_in_tissue = tmp_isoforms_in_tissue.rename(columns={'median_CPM': tissue})
            tmp_isoforms_in_tissue['threshold'] = cpm_thresh
            isoforms_in_tissue_1 = isoforms_in_tissue_1.merge(tmp_isoforms_in_tissue, how='outer')
            # calculate the number of isoforms per gene in the tissue
            n_tx_per_gene_tmp = n_tx_per_gene_tmp.groupby(['gene_id', 'gene_biotype', 'gene_name']).size().reset_index(name="n_tx")
            n_tx_per_gene_tmp = n_tx_per_gene_tmp.rename(columns={"n_tx": tissue})
            n_tx_per_gene_by_tissue = n_tx_per_gene_by_tissue.merge(n_tx_per_gene_tmp, how='outer')
            print(n_tx_per_gene_tmp.head())

            # looking at protein-coding isoforms
            n_tx_per_pc_gene_tmp = cds_median_2023.copy()
            # get the set of all protein-coding isoforms expressed in this tissue above our thresholds
            sets_of_pc_isoforms_at_cpm_gt_1[tissue] = set(n_tx_per_pc_gene_tmp.copy()['transcript_id'])
            # get the number of protein-coding isoforms per gene in the tissue
            n_tx_per_pc_gene_tmp = n_tx_per_pc_gene_tmp.groupby(['gene_id', 'gene_biotype', 'gene_name']).size().reset_index(name="n_tx")
            n_tx_per_pc_gene_tmp = n_tx_per_pc_gene_tmp.rename(columns={"n_tx": tissue})
            n_tx_per_pc_gene_by_tissue = n_tx_per_pc_gene_by_tissue.merge(n_tx_per_pc_gene_tmp, how='outer')
            
        # at CPM > 5
        if cpm_thresh == 5.01:
            n_tx_per_gene_tmp = median_2023.copy()
            tmp_isoforms_in_tissue = n_tx_per_gene_tmp.copy()[['transcript_id', 'gene_id', 'gene_name', 'gene_biotype', 'median_CPM']]
            tmp_isoforms_in_tissue['threshold'] = cpm_thresh
            tmp_isoforms_in_tissue = tmp_isoforms_in_tissue.rename(columns={'median_CPM': tissue})
            isoforms_in_tissue_5 = isoforms_in_tissue_5.merge(tmp_isoforms_in_tissue, how='outer')
        
        # at CPM > 10
        if cpm_thresh == 10.01:
            n_tx_per_gene_tmp = median_2023.copy()
            tmp_isoforms_in_tissue = n_tx_per_gene_tmp.copy()[['transcript_id', 'gene_id', 'gene_name', 'gene_biotype', 'median_CPM']]
            tmp_isoforms_in_tissue['threshold'] = cpm_thresh
            tmp_isoforms_in_tissue = tmp_isoforms_in_tissue.rename(columns={'median_CPM': tissue})
            isoforms_in_tissue_10 = isoforms_in_tissue_10.merge(tmp_isoforms_in_tissue, how='outer')
            

        # add just the total number in each category to the list
        list_2023_all_transcript_median[tissue].append(median_2023.shape[0])
        list_2023_all_gene_median[tissue].append(median_2023['gene_id'].nunique())
        list_2023_cds_transcript_median[tissue].append(cds_median_2023.shape[0])
        list_2023_med_relevant_transcript_median[tissue].append(med_relevant_median_2023.shape[0])
        list_2023_med_relevant_cds_transcript_median[tissue].append(med_relevant_cds_median_2023.shape[0])
        list_2023_brain_relevant_transcript_median[tissue].append(brain_relevant_median_2023.shape[0])
        list_2023_brain_relevant_cds_transcript_median[tissue].append(brain_relevant_cds_median_2023.shape[0])
    
    # concat values to dataframe holding all the numbers for the thresholds for all tissues
    values_at_cpm_threshold = pd.concat([values_at_cpm_threshold, pd.DataFrame({
        'cpm_threshold': list_cpm_thresh,
        'tissue': tissue,
        'all_transcript_median': list_2023_all_transcript_median[tissue],
        'all_gene_median': list_2023_all_gene_median[tissue],
        'cds_transcript_median': list_2023_cds_transcript_median[tissue],
        'med_relevant_transcript_median': list_2023_med_relevant_transcript_median[tissue],
        'med_relevant_cds_transcript_median': list_2023_med_relevant_cds_transcript_median[tissue],
        'brain_relevant_transcript_median': list_2023_brain_relevant_transcript_median[tissue],
        'brain_relevant_cds_transcript_median': list_2023_brain_relevant_cds_transcript_median[tissue]
    })])

# write dataframe to file
values_at_cpm_threshold.to_csv("../../tables/GTEx_expression/gtex_values_at_cpm_thresholds.tsv", sep = '\t', index=False)

0.01
1.01
5.01
10.01
Brain - Frontal Cortex (BA9)
           gene_id    gene_biotype gene_name  Brain - Frontal Cortex (BA9)
0  ENSG00000000003  protein_coding    TSPAN6                             1
1  ENSG00000000005  protein_coding      TNMD                             1
2  ENSG00000000419  protein_coding      DPM1                             5
3  ENSG00000000457  protein_coding     SCYL3                             1
4  ENSG00000001036  protein_coding     FUCA2                             1
Brain - Cerebellar Hemisphere
           gene_id    gene_biotype gene_name  Brain - Cerebellar Hemisphere
0  ENSG00000000003  protein_coding    TSPAN6                              1
1  ENSG00000000419  protein_coding      DPM1                              4
2  ENSG00000000457  protein_coding     SCYL3                              1
3  ENSG00000001036  protein_coding     FUCA2                              1
4  ENSG00000001084  protein_coding      GCLC                              2
Brain - Putame

In [22]:
# table of the median CPM of an isoform in each tissue at 4 thresholds
isoforms_in_tissue = pd.concat([isoforms_in_tissue_0, isoforms_in_tissue_1, isoforms_in_tissue_5, isoforms_in_tissue_10])
column_to_move = isoforms_in_tissue.pop("threshold")
isoforms_in_tissue.insert(4, "threshold", column_to_move)
isoforms_in_tissue.to_csv('../../tables/GTEx_expression/GTEx_isoforms_in_tissues_passing_med_CPM_gt_0_1_5_10.tsv', sep='\t', index=False)
# table of the median CPM of an isoform in each tissue at CPM > 1
column_to_move = isoforms_in_tissue_1.pop("threshold")
isoforms_in_tissue_1.insert(4, "threshold", column_to_move)
isoforms_in_tissue_1
isoforms_in_tissue_1.to_csv('../../tables/GTEx_expression/GTEx_isoforms_in_tissues_passing_med_CPM_gt_1.tsv', sep='\t', index=False)
isoforms_in_tissue.head()

Unnamed: 0,transcript_id,gene_id,gene_name,gene_biotype,threshold,Brain - Frontal Cortex (BA9),Brain - Cerebellar Hemisphere,Brain - Putamen (basal ganglia),Lung,Heart - Left Ventricle,Muscle - Skeletal,Heart - Atrial Appendage,Cells - Cultured fibroblasts,Liver
0,BambuTx1,ENSG00000227232,WASH7P,unprocessed_pseudogene,0.01,2.71,1.76,0.64,1.49,,0.3,0.42,0.33,0.18
1,ENST00000488147,ENSG00000227232,WASH7P,unprocessed_pseudogene,0.01,0.8,0.24,,0.49,,0.32,0.22,,0.18
2,BambuTx100,ENSG00000215861,,transcribed_unprocessed_pseudogene,0.01,7.88,7.47,3.88,2.96,20.29,41.38,26.0,6.13,0.29
3,BambuTx97,ENSG00000215861,,transcribed_unprocessed_pseudogene,0.01,31.86,58.28,30.9,17.37,518.1,502.58,484.26,13.9,1.13
4,BambuTx98,ENSG00000215861,,transcribed_unprocessed_pseudogene,0.01,15.49,10.12,13.65,,,,,10.15,


In [23]:
# table of the number of isoforms passing our thresholds (median-unique-counts >= 1 and CPM > 1) per gene
n_tx_per_gene_by_tissue.to_csv("../../tables/GTEx_expression/GTEx_number_of_tx_per_gene_passing_thresholds_2023.tsv", sep='\t', index=False)
# tables of the number of protein-coding isoforms passing our thresholds (median-unique-counts >= 1 and CPM > 1) per gene
n_tx_per_pc_gene_by_tissue.to_csv("../../tables/GTEx_expression/GTEx_number_of_protein_coding_tx_per_gene_passing_thresholds_2023.tsv", sep='\t', index=False)
n_tx_per_gene_by_tissue.head()

Unnamed: 0,gene_id,gene_biotype,gene_name,Brain - Frontal Cortex (BA9),Brain - Cerebellar Hemisphere,Brain - Putamen (basal ganglia),Lung,Heart - Left Ventricle,Muscle - Skeletal,Heart - Atrial Appendage,Cells - Cultured fibroblasts,Liver
0,ENSG00000000003,protein_coding,TSPAN6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,ENSG00000000005,protein_coding,TNMD,1.0,,,,,1.0,,,
2,ENSG00000000419,protein_coding,DPM1,5.0,4.0,4.0,5.0,4.0,5.0,4.0,5.0,3.0
3,ENSG00000000457,protein_coding,SCYL3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,ENSG00000001036,protein_coding,FUCA2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Identify the isoforms unique to each tissue

In [24]:
# 'tissue specific' isoforms
unique_tiss_isoforms = pd.DataFrame(columns = ['n_tx', 'tissue', 'type'])

tiss_spec_iso = {}
for tiss in tissues_to_use:
    tiss_iso_df = pd.DataFrame(list(sets_of_isoforms_at_cpm_gt_1[tiss]), columns=['isoform'])
    tiss_iso_df['tissue'] = tiss
    tmp_iso_set = set()
    for tiss2 in tissues_to_use:
        if tiss != tiss2:
            tmp_iso_set.update(sets_of_isoforms_at_cpm_gt_1[tiss2])
            
    tiss_spec_iso[tiss] = sets_of_isoforms_at_cpm_gt_1[tiss] - tmp_iso_set
    unique_tiss_isoforms = pd.concat([unique_tiss_isoforms, pd.DataFrame([{'n_tx': len(tiss_spec_iso[tiss]), 'tissue': tiss, 'type': 'all'}])])
    unique_tiss_isoforms = pd.concat([unique_tiss_isoforms, pd.DataFrame([{'n_tx': len(tiss_spec_iso[tiss] & sets_of_pc_isoforms_at_cpm_gt_1[tiss]), 'tissue': tiss, 'type': 'pc'}])])

unique_tiss_isoforms.to_csv('../../tables/GTEx_expression/GTEx_tissue_unique_isoforms_by_type_CPM_gt_1.tsv', sep='\t', index=False)
unique_tiss_isoforms.head()

Unnamed: 0,n_tx,tissue,type
0,3726,Brain - Cerebellar Hemisphere,all
0,1152,Brain - Cerebellar Hemisphere,pc
0,1075,Brain - Frontal Cortex (BA9),all
0,554,Brain - Frontal Cortex (BA9),pc
0,615,Brain - Putamen (basal ganglia),all


## Transcript biotype by tissue

In [25]:
biotype_by_tissue = pd.DataFrame()
# showing the top 5 isoform biotypes by tissue
for tissue in df_by_tissue:
    if tissue in tissues_to_use:
        print(tissue)
        tmp = df_by_tissue[tissue].copy()
        tmp = tmp.loc[tmp["median_CPM"] > 1]
        o_t = tmp['transcript_biotype'].value_counts().to_frame().head()
        o_t['tissue'] = tissue
        biotype_by_tissue = pd.concat([biotype_by_tissue, o_t])

biotype_by_tissue.to_csv('../../tables/GTEx_expression/GTEx_transcript_biotype_expr_by_tissue_2023.tsv', sep='\t')
biotype_by_tissue.head()

Brain - Frontal Cortex (BA9)
Brain - Cerebellar Hemisphere
Brain - Putamen (basal ganglia)
Lung
Heart - Left Ventricle
Muscle - Skeletal
Heart - Atrial Appendage
Cells - Cultured fibroblasts
Liver


Unnamed: 0_level_0,count,tissue
transcript_biotype,Unnamed: 1_level_1,Unnamed: 2_level_1
protein_coding,13820,Brain - Frontal Cortex (BA9)
lncRNA,1988,Brain - Frontal Cortex (BA9)
retained_intron,1096,Brain - Frontal Cortex (BA9)
nonsense_mediated_decay,765,Brain - Frontal Cortex (BA9)
protein_coding_CDS_not_defined,614,Brain - Frontal Cortex (BA9)


## Look at the number of isoforms expressed in tissue according to relative abundance

In [25]:
# Filter isoforms by median relative abundance instead of CPM and determine the number of isoforms expressed per gene
median_rel_ab_iso_in_tis = pd.DataFrame(columns=['gene_id', 'transcript_id'])
median_rel_ab_iso_in_tis_n_iso = pd.DataFrame(columns=['gene_id'])

# for each tissue
for tissue in df_by_tissue:
    print(tissue)
    tmp = df_by_tissue[tissue].copy()
    # isoforms need to account for at least 10 percent of the relative abundance of a gene
    tmp = tmp.loc[tmp['median_relative_abundance'] > 10]
    tmp_formatted = tmp.copy()[['transcript_id', 'gene_id', 'gene_name', 'median_relative_abundance']].rename(columns={"median_relative_abundance": tissue})
    median_rel_ab_iso_in_tis = median_rel_ab_iso_in_tis.merge(tmp_formatted, how='outer')
    n_iso_tmp = tmp.copy()[["gene_id", "transcript_id"]].groupby("gene_id").size().reset_index(name=tissue)
    median_rel_ab_iso_in_tis_n_iso = median_rel_ab_iso_in_tis_n_iso.merge(n_iso_tmp, how='outer')

# write the isoforms and their median relative abundance to file
# write the number of isoforms per gene to file
median_rel_ab_iso_in_tis.to_csv('../../tables/GTEx_expression/GTEx_isoforms_in_tissues_passing_median_rel_abund_gt_10.tsv', sep='\t', index=False)
median_rel_ab_iso_in_tis_n_iso.to_csv('../../tables/GTEx_expression/GTEx_number_of_tx_per_gene_passing_median_rel_abund_gt_10_threshold_2023.tsv', sep='\t', index=False)
median_rel_ab_iso_in_tis.head()

Brain - Frontal Cortex (BA9)
Brain - Cerebellar Hemisphere
Brain - Putamen (basal ganglia)
Lung
Heart - Left Ventricle
Muscle - Skeletal
Heart - Atrial Appendage
Cells - Cultured fibroblasts
Liver


Unnamed: 0,transcript_id,gene_id,gene_name,Brain - Frontal Cortex (BA9),Brain - Cerebellar Hemisphere,Brain - Putamen (basal ganglia),Lung,Heart - Left Ventricle,Muscle - Skeletal,Heart - Atrial Appendage,Cells - Cultured fibroblasts,Liver
0,BambuTx1,ENSG00000227232,WASH7P,77.900552,86.199497,47.717894,73.231896,,50.0,66.927083,37.864078,50.0
1,ENST00000488147,ENSG00000227232,WASH7P,22.099448,13.800503,,26.768104,,50.0,33.072917,,33.333333
2,BambuTx100,ENSG00000215861,,13.155571,10.71178,,15.137348,,,,14.605671,20.725389
3,BambuTx97,ENSG00000215861,,54.286093,76.086699,62.771872,79.02445,91.420043,87.449384,86.443275,39.805825,51.495017
4,BambuTx98,ENSG00000215861,,27.352993,13.840701,24.130873,,,,,35.549202,
