# Library Import and Functions

This notebook conducts analyses and creates tables from the yearly Ensembl annotations (2014–2023).

In [1]:
## Import Libraries
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles, venn3_unweighted
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud
import re


## Display all rows of pandas dataframes
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [2]:
def extract_gene_id(attributes):
    """Return the ``gene_id`` value from a GTF attributes string.

    Parameters
    ----------
    attributes : str
        The attributes (ninth) column of one GTF record, e.g.
        ``gene_id "ENSG00000223972"; gene_version "5";``.

    Returns
    -------
    str or None
        The text between the double quotes that follow ``gene_id``, or
        ``None`` when the key is absent or ``attributes`` is not a string
        (for example NaN from a row with a missing attributes field).
    """
    # Missing values reach .apply() as float NaN / None; treat them as "no id"
    if not isinstance(attributes, str):
        return None
    # Unanchored search: the first occurrence of `gene_id "` in the string wins
    match = re.search(r'gene_id "([^"]+)', attributes)
    return match.group(1) if match else None

In [3]:
def extract_gene_biotype(attributes):
    """Return the ``gene_biotype`` value from a GTF attributes string.

    Parameters
    ----------
    attributes : str
        The attributes (ninth) column of one GTF record, e.g.
        ``gene_id "ENSG00000223972"; gene_biotype "lncRNA";``.

    Returns
    -------
    str or None
        The text between the double quotes that follow ``gene_biotype``, or
        ``None`` when the key is absent or ``attributes`` is not a string
        (for example NaN from a row with a missing attributes field).
    """
    # Missing values reach .apply() as float NaN / None; treat them as "no biotype"
    if not isinstance(attributes, str):
        return None
    # Unanchored search: the first occurrence of `gene_biotype "` in the string wins
    match = re.search(r'gene_biotype "([^"]+)', attributes)
    return match.group(1) if match else None

In [4]:
def extract_transcript_id(attributes):
    """Return the ``transcript_id`` value from a GTF attributes string.

    Parameters
    ----------
    attributes : str
        The attributes (ninth) column of one GTF record, e.g.
        ``gene_id "ENSG00000223972"; transcript_id "ENST00000456328";``.

    Returns
    -------
    str or None
        The text between the double quotes that follow ``transcript_id``, or
        ``None`` when the key is absent or ``attributes`` is not a string
        (for example NaN from a row with a missing attributes field).
    """
    # Missing values reach .apply() as float NaN / None; treat them as "no id"
    if not isinstance(attributes, str):
        return None
    # Unanchored search: the first occurrence of `transcript_id "` in the string wins
    match = re.search(r'transcript_id "([^"]+)', attributes)
    return match.group(1) if match else None

In [5]:
def extract_transcript_biotype(attributes):
    """Return the ``transcript_biotype`` value from a GTF attributes string.

    Parameters
    ----------
    attributes : str
        The attributes (ninth) column of one GTF record, e.g.
        ``transcript_id "ENST00000456328"; transcript_biotype "lncRNA";``.

    Returns
    -------
    str or None
        The text between the double quotes that follow ``transcript_biotype``,
        or ``None`` when the key is absent or ``attributes`` is not a string
        (for example NaN from a row with a missing attributes field).
    """
    # Missing values reach .apply() as float NaN / None; treat them as "no biotype"
    if not isinstance(attributes, str):
        return None
    # Unanchored search: the first occurrence of `transcript_biotype "` in the string wins
    match = re.search(r'transcript_biotype "([^"]+)', attributes)
    return match.group(1) if match else None

# Quantify number of RNA isoforms annotated per year

### 2014 Dataset

In [6]:
# Names for the nine tab-separated GTF fields (the file itself has no header row)
column_names = [
    "seqname", "source", "feature",
    "start", "end", "score",
    "strand", "frame", "attributes",
]

# Read the 2014 annotation and keep only the "transcript" records
df2014 = (
    pd.read_csv('../../references/Homo_sapiens.GRCh38.76.gtf',
                names=column_names, sep='\t', comment='#', low_memory=False)
    .loc[lambda gtf: gtf["feature"] == "transcript"]
    .copy()
)

# Pull the ids/biotype out of the free-text "attributes" field.
# This file carries no `transcript_biotype` attribute; the transcript biotype
# is taken from the 2nd GTF column (named "source" here) instead.
df2014 = df2014.assign(
    gene_id=df2014['attributes'].map(extract_gene_id),
    gene_biotype=df2014['attributes'].map(extract_gene_biotype),
    transcript_id=df2014['attributes'].map(extract_transcript_id),
    transcript_biotype=df2014['source'],
)

### 2015 Dataset

In [7]:
# Names for the nine tab-separated GTF fields (the file itself has no header row)
column_names = [
    "seqname", "source", "feature",
    "start", "end", "score",
    "strand", "frame", "attributes",
]

# Read the 2015 annotation and keep only the "transcript" records
df2015 = (
    pd.read_csv('../../references/Homo_sapiens.GRCh38.79.gtf',
                names=column_names, sep='\t', comment='#', low_memory=False)
    .loc[lambda gtf: gtf["feature"] == "transcript"]
    .copy()
)

# Pull the ids and biotypes out of the free-text "attributes" field
df2015 = df2015.assign(
    gene_id=df2015['attributes'].map(extract_gene_id),
    gene_biotype=df2015['attributes'].map(extract_gene_biotype),
    transcript_id=df2015['attributes'].map(extract_transcript_id),
    transcript_biotype=df2015['attributes'].map(extract_transcript_biotype),
)

### 2016 Dataset

In [8]:
# Names for the nine tab-separated GTF fields (the file itself has no header row)
column_names = [
    "seqname", "source", "feature",
    "start", "end", "score",
    "strand", "frame", "attributes",
]

# Read the 2016 annotation and keep only the "transcript" records
df2016 = (
    pd.read_csv('../../references/Homo_sapiens.GRCh38.85.gtf',
                names=column_names, sep='\t', comment='#', low_memory=False)
    .loc[lambda gtf: gtf["feature"] == "transcript"]
    .copy()
)

# Pull the ids and biotypes out of the free-text "attributes" field
df2016 = df2016.assign(
    gene_id=df2016['attributes'].map(extract_gene_id),
    gene_biotype=df2016['attributes'].map(extract_gene_biotype),
    transcript_id=df2016['attributes'].map(extract_transcript_id),
    transcript_biotype=df2016['attributes'].map(extract_transcript_biotype),
)

### 2017 Dataset

In [9]:
# Names for the nine tab-separated GTF fields (the file itself has no header row)
column_names = [
    "seqname", "source", "feature",
    "start", "end", "score",
    "strand", "frame", "attributes",
]

# Read the 2017 annotation and keep only the "transcript" records
df2017 = (
    pd.read_csv('../../references/Homo_sapiens.GRCh38.88.gtf',
                names=column_names, sep='\t', comment='#', low_memory=False)
    .loc[lambda gtf: gtf["feature"] == "transcript"]
    .copy()
)

# Pull the ids and biotypes out of the free-text "attributes" field
df2017 = df2017.assign(
    gene_id=df2017['attributes'].map(extract_gene_id),
    gene_biotype=df2017['attributes'].map(extract_gene_biotype),
    transcript_id=df2017['attributes'].map(extract_transcript_id),
    transcript_biotype=df2017['attributes'].map(extract_transcript_biotype),
)

### 2018 Dataset

In [10]:
# Names for the nine tab-separated GTF fields (the file itself has no header row)
column_names = [
    "seqname", "source", "feature",
    "start", "end", "score",
    "strand", "frame", "attributes",
]

# Read the 2018 annotation and keep only the "transcript" records
df2018 = (
    pd.read_csv('../../references/Homo_sapiens.GRCh38.92.gtf',
                names=column_names, sep='\t', comment='#', low_memory=False)
    .loc[lambda gtf: gtf["feature"] == "transcript"]
    .copy()
)

# Pull the ids and biotypes out of the free-text "attributes" field
df2018 = df2018.assign(
    gene_id=df2018['attributes'].map(extract_gene_id),
    gene_biotype=df2018['attributes'].map(extract_gene_biotype),
    transcript_id=df2018['attributes'].map(extract_transcript_id),
    transcript_biotype=df2018['attributes'].map(extract_transcript_biotype),
)

### 2019 Dataset

In [11]:
# Names for the nine tab-separated GTF fields (the file itself has no header row)
column_names = [
    "seqname", "source", "feature",
    "start", "end", "score",
    "strand", "frame", "attributes",
]

# Read the 2019 annotation and keep only the "transcript" records
df2019 = (
    pd.read_csv('../../references/Homo_sapiens.GRCh38.96.gtf',
                names=column_names, sep='\t', comment='#', low_memory=False)
    .loc[lambda gtf: gtf["feature"] == "transcript"]
    .copy()
)

# Pull the ids and biotypes out of the free-text "attributes" field
df2019 = df2019.assign(
    gene_id=df2019['attributes'].map(extract_gene_id),
    gene_biotype=df2019['attributes'].map(extract_gene_biotype),
    transcript_id=df2019['attributes'].map(extract_transcript_id),
    transcript_biotype=df2019['attributes'].map(extract_transcript_biotype),
)

### 2020 Dataset

In [12]:
# Names for the nine tab-separated GTF fields (the file itself has no header row)
column_names = [
    "seqname", "source", "feature",
    "start", "end", "score",
    "strand", "frame", "attributes",
]

# Read the 2020 annotation and keep only the "transcript" records
df2020 = (
    pd.read_csv('../../references/Homo_sapiens.GRCh38.99.gtf',
                names=column_names, sep='\t', comment='#', low_memory=False)
    .loc[lambda gtf: gtf["feature"] == "transcript"]
    .copy()
)

# Pull the ids and biotypes out of the free-text "attributes" field
df2020 = df2020.assign(
    gene_id=df2020['attributes'].map(extract_gene_id),
    gene_biotype=df2020['attributes'].map(extract_gene_biotype),
    transcript_id=df2020['attributes'].map(extract_transcript_id),
    transcript_biotype=df2020['attributes'].map(extract_transcript_biotype),
)

### 2021 Dataset

In [13]:
# Names for the nine tab-separated GTF fields (the file itself has no header row)
column_names = [
    "seqname", "source", "feature",
    "start", "end", "score",
    "strand", "frame", "attributes",
]

# Read the 2021 annotation and keep only the "transcript" records
df2021 = (
    pd.read_csv('../../references/Homo_sapiens.GRCh38.103.gtf',
                names=column_names, sep='\t', comment='#', low_memory=False)
    .loc[lambda gtf: gtf["feature"] == "transcript"]
    .copy()
)

# Pull the ids and biotypes out of the free-text "attributes" field
df2021 = df2021.assign(
    gene_id=df2021['attributes'].map(extract_gene_id),
    gene_biotype=df2021['attributes'].map(extract_gene_biotype),
    transcript_id=df2021['attributes'].map(extract_transcript_id),
    transcript_biotype=df2021['attributes'].map(extract_transcript_biotype),
)

### 2022 Dataset

In [14]:
# Names for the nine tab-separated GTF fields (the file itself has no header row)
column_names = [
    "seqname", "source", "feature",
    "start", "end", "score",
    "strand", "frame", "attributes",
]

# Read the 2022 annotation and keep only the "transcript" records
df2022 = (
    pd.read_csv('../../references/Homo_sapiens.GRCh38.106.gtf',
                names=column_names, sep='\t', comment='#', low_memory=False)
    .loc[lambda gtf: gtf["feature"] == "transcript"]
    .copy()
)

# Pull the ids and biotypes out of the free-text "attributes" field
df2022 = df2022.assign(
    gene_id=df2022['attributes'].map(extract_gene_id),
    gene_biotype=df2022['attributes'].map(extract_gene_biotype),
    transcript_id=df2022['attributes'].map(extract_transcript_id),
    transcript_biotype=df2022['attributes'].map(extract_transcript_biotype),
)

### 2023 Dataset

In [15]:
# Names for the nine tab-separated GTF fields (the file itself has no header row)
column_names = [
    "seqname", "source", "feature",
    "start", "end", "score",
    "strand", "frame", "attributes",
]

# Read the 2023 annotation and keep only the "transcript" records
df2023 = (
    pd.read_csv('../../references/Homo_sapiens.GRCh38.109.gtf',
                names=column_names, sep='\t', comment='#', low_memory=False)
    .loc[lambda gtf: gtf["feature"] == "transcript"]
    .copy()
)

# Pull the ids and biotypes out of the free-text "attributes" field
df2023 = df2023.assign(
    gene_id=df2023['attributes'].map(extract_gene_id),
    gene_biotype=df2023['attributes'].map(extract_gene_biotype),
    transcript_id=df2023['attributes'].map(extract_transcript_id),
    transcript_biotype=df2023['attributes'].map(extract_transcript_biotype),
)

#### Filter out alternate contigs and add a Year column to the dataframes

In [16]:
# Drop alternate contigs from the 2014 and 2015 annotations: only rows whose
# seqname also appears among the 2016 transcript records are kept.
df2014, df2015 = (
    older.loc[older["seqname"].isin(df2016["seqname"])].copy()
    for older in (df2014, df2015)
)

In [17]:
# Tag every transcript record with the year of the annotation it came from
for _year, _annotation in (
    (2014, df2014), (2015, df2015), (2016, df2016), (2017, df2017), (2018, df2018),
    (2019, df2019), (2020, df2020), (2021, df2021), (2022, df2022), (2023, df2023),
):
    _annotation['Year'] = _year

#### Dataframes of gene IDs and annotation year (one per year)

In [18]:
# One (gene_id, Year) frame per annotation.
# The two columns are selected directly: both live in the same frame and share
# its index, so this yields the same rows and columns as merging the `gene_id`
# and `Year` Series on that index, without paying for a join.
genes_df2014 = df2014[['gene_id', 'Year']].copy()
genes_df2015 = df2015[['gene_id', 'Year']].copy()
genes_df2016 = df2016[['gene_id', 'Year']].copy()
genes_df2017 = df2017[['gene_id', 'Year']].copy()
genes_df2018 = df2018[['gene_id', 'Year']].copy()
genes_df2019 = df2019[['gene_id', 'Year']].copy()
genes_df2020 = df2020[['gene_id', 'Year']].copy()
genes_df2021 = df2021[['gene_id', 'Year']].copy()
genes_df2022 = df2022[['gene_id', 'Year']].copy()
genes_df2023 = df2023[['gene_id', 'Year']].copy()

#### Dataframes of transcript IDs and annotation year (one per year)

In [19]:
# One (transcript_id, Year) frame per annotation.
# The two columns are selected directly: both live in the same frame and share
# its index, so this yields the same rows and columns as merging the
# `transcript_id` and `Year` Series on that index, without paying for a join.
trans_df2014 = df2014[['transcript_id', 'Year']].copy()
trans_df2015 = df2015[['transcript_id', 'Year']].copy()
trans_df2016 = df2016[['transcript_id', 'Year']].copy()
trans_df2017 = df2017[['transcript_id', 'Year']].copy()
trans_df2018 = df2018[['transcript_id', 'Year']].copy()
trans_df2019 = df2019[['transcript_id', 'Year']].copy()
trans_df2020 = df2020[['transcript_id', 'Year']].copy()
trans_df2021 = df2021[['transcript_id', 'Year']].copy()
trans_df2022 = df2022[['transcript_id', 'Year']].copy()
trans_df2023 = df2023[['transcript_id', 'Year']].copy()

### Number of unique ids

In [20]:
# Count the distinct gene ids in each yearly annotation
(
    gunique_count2014, gunique_count2015, gunique_count2016, gunique_count2017, gunique_count2018,
    gunique_count2019, gunique_count2020, gunique_count2021, gunique_count2022, gunique_count2023,
) = (
    yearly_genes['gene_id'].nunique()
    for yearly_genes in (
        genes_df2014, genes_df2015, genes_df2016, genes_df2017, genes_df2018,
        genes_df2019, genes_df2020, genes_df2021, genes_df2022, genes_df2023,
    )
)

In [21]:
# Count the distinct gene ids whose gene biotype is protein_coding, per year
(
    gunique_count2014_pc, gunique_count2015_pc, gunique_count2016_pc, gunique_count2017_pc, gunique_count2018_pc,
    gunique_count2019_pc, gunique_count2020_pc, gunique_count2021_pc, gunique_count2022_pc, gunique_count2023_pc,
) = (
    annotation.loc[annotation['gene_biotype'] == 'protein_coding', 'gene_id'].nunique()
    for annotation in (
        df2014, df2015, df2016, df2017, df2018,
        df2019, df2020, df2021, df2022, df2023,
    )
)

In [22]:
# Count the distinct transcript ids in each yearly annotation
(
    unique_count2014, unique_count2015, unique_count2016, unique_count2017, unique_count2018,
    unique_count2019, unique_count2020, unique_count2021, unique_count2022, unique_count2023,
) = (
    yearly_transcripts['transcript_id'].nunique()
    for yearly_transcripts in (
        trans_df2014, trans_df2015, trans_df2016, trans_df2017, trans_df2018,
        trans_df2019, trans_df2020, trans_df2021, trans_df2022, trans_df2023,
    )
)

In [23]:
# Count the distinct transcript ids whose transcript biotype is protein_coding, per year
(
    unique_count2014_pc, unique_count2015_pc, unique_count2016_pc, unique_count2017_pc, unique_count2018_pc,
    unique_count2019_pc, unique_count2020_pc, unique_count2021_pc, unique_count2022_pc, unique_count2023_pc,
) = (
    annotation.loc[annotation['transcript_biotype'] == 'protein_coding', 'transcript_id'].nunique()
    for annotation in (
        df2014, df2015, df2016, df2017, df2018,
        df2019, df2020, df2021, df2022, df2023,
    )
)

In [24]:
# Per-year counts gathered into lists, ordered 2014 -> 2023

# all genes
genes = [
    gunique_count2014, gunique_count2015, gunique_count2016, gunique_count2017,
    gunique_count2018, gunique_count2019, gunique_count2020, gunique_count2021,
    gunique_count2022, gunique_count2023,
]
# protein-coding genes
genes_pc = [
    gunique_count2014_pc, gunique_count2015_pc, gunique_count2016_pc, gunique_count2017_pc,
    gunique_count2018_pc, gunique_count2019_pc, gunique_count2020_pc, gunique_count2021_pc,
    gunique_count2022_pc, gunique_count2023_pc,
]

# all transcripts
transcripts = [
    unique_count2014, unique_count2015, unique_count2016, unique_count2017,
    unique_count2018, unique_count2019, unique_count2020, unique_count2021,
    unique_count2022, unique_count2023,
]
# protein-coding transcripts
transcripts_pc = [
    unique_count2014_pc, unique_count2015_pc, unique_count2016_pc, unique_count2017_pc,
    unique_count2018_pc, unique_count2019_pc, unique_count2020_pc, unique_count2021_pc,
    unique_count2022_pc, unique_count2023_pc,
]

# Matching list of annotation years (a plain list, same order as the counts)
years = list(range(2014, 2024))


### Create tables of number of genes/transcripts per year

In [25]:
## Write the per-year gene counts to the tables directory

number_of_genes_per_year = pd.DataFrame({"Year": years, "n_genes": genes})

number_of_genes_per_year.to_csv(
    "../../tables/annotation_comparison/number_of_genes_per_year.tsv",
    sep='\t',
    index=False,
)

In [26]:
## Write the per-year protein-coding gene counts to the tables directory

number_of_pc_genes_per_year = pd.DataFrame({"Year": years, "n_pc_genes": genes_pc})

number_of_pc_genes_per_year.to_csv(
    "../../tables/annotation_comparison/number_of_protein_coding_genes_per_year.tsv",
    sep='\t',
    index=False,
)

In [27]:
## Write the per-year transcript counts to the tables directory

number_of_transcripts_per_year = pd.DataFrame(
    {"Year": years, "Number of transcripts": transcripts}
)

number_of_transcripts_per_year.to_csv(
    "../../tables/annotation_comparison/number_of_transcripts_per_year.tsv",
    sep='\t',
    index=False,
)

In [28]:
## Output number of protein coding transcripts to tables directory

# This table must be built from `transcripts_pc` (protein-coding counts);
# using `transcripts` would just duplicate the all-transcript table.
# A dedicated variable is used so `number_of_transcripts_per_year` keeps
# holding the all-transcript table. The column header is left as
# "Number of transcripts" so existing readers of the TSV keep working.
number_of_pc_transcripts_per_year = pd.DataFrame()

number_of_pc_transcripts_per_year["Year"] = years
number_of_pc_transcripts_per_year["Number of transcripts"] = transcripts_pc

number_of_pc_transcripts_per_year.to_csv("../../tables/annotation_comparison/number_of_protein_coding_transcripts_per_year.tsv", sep='\t', index=False)

### Breakdown of Transcript Biotype of all RNA Isoforms in Ensembl v109

In [29]:
# Gene/transcript ids with their biotypes for the 2023 annotation (v109)
df2023_tx_biotype = df2023.loc[
    :, ['gene_id', 'gene_biotype', 'transcript_id', 'transcript_biotype']
].copy()
df2023_tx_biotype.to_csv(
    "../../tables/annotation_comparison/tx_and_gene_biotype_for_2023.tsv",
    sep='\t',
    index=False,
)

## Venn Comparisons of gene/isoform ids across 2019, 2021, and 2023

In [30]:
# Three-way comparison of gene ids across the 2019, 2021 and 2023 annotations

gdf2019_set = set(df2019['gene_id'])
gdf2021_set = set(df2021['gene_id'])
gdf2023_set = set(df2023['gene_id'])

# Gene ids shared by each pair of annotations
g_2019_and_2021 = gdf2019_set.intersection(gdf2021_set)
g_2021_and_2023 = gdf2021_set.intersection(gdf2023_set)
g_2023_and_2019 = gdf2023_set.intersection(gdf2019_set)

# Gene ids present in all three annotations
full_intersect = gdf2019_set.intersection(gdf2021_set, gdf2023_set)

# Size of each exclusive Venn region (pairwise regions exclude the triple overlap)
overlap_lengths = {
    '2019,2021,2023': len(full_intersect),
    '2019,2021': len(g_2019_and_2021.difference(full_intersect)),
    '2021,2023': len(g_2021_and_2023.difference(full_intersect)),
    '2023,2019': len(g_2023_and_2019.difference(full_intersect)),
    '2019': len(gdf2019_set.difference(g_2019_and_2021, g_2023_and_2019)),
    '2021': len(gdf2021_set.difference(g_2019_and_2021, g_2021_and_2023)),
    '2023': len(gdf2023_set.difference(g_2021_and_2023, g_2023_and_2019)),
}

# Two-column table: region label ("type") and its size ("value")
gene_overlap_venn_values = pd.DataFrame(
    list(overlap_lengths.items()), columns=['type', 'value']
)

gene_overlap_venn_values.to_csv(
    "../../tables/annotation_comparison/gene_overlap_venn_values_2019_2021_2023.tsv",
    sep='\t',
    index=False,
)

In [31]:
# Three-way comparison of protein-coding gene ids across 2019, 2021 and 2023
pcgdf2019_set = set(df2019.loc[df2019['gene_biotype'] == 'protein_coding', 'gene_id'])
pcgdf2021_set = set(df2021.loc[df2021['gene_biotype'] == 'protein_coding', 'gene_id'])
pcgdf2023_set = set(df2023.loc[df2023['gene_biotype'] == 'protein_coding', 'gene_id'])

# Protein-coding gene ids shared by each pair of annotations
pcg_2019_and_2021 = pcgdf2019_set.intersection(pcgdf2021_set)
pcg_2021_and_2023 = pcgdf2021_set.intersection(pcgdf2023_set)
pcg_2023_and_2019 = pcgdf2023_set.intersection(pcgdf2019_set)

# Protein-coding gene ids present in all three annotations
full_intersect_gpc = pcgdf2019_set.intersection(pcgdf2021_set, pcgdf2023_set)

# Size of each exclusive Venn region (pairwise regions exclude the triple overlap)
overlap_lengths = {
    '2019,2021,2023': len(full_intersect_gpc),
    '2019,2021': len(pcg_2019_and_2021.difference(full_intersect_gpc)),
    '2021,2023': len(pcg_2021_and_2023.difference(full_intersect_gpc)),
    '2023,2019': len(pcg_2023_and_2019.difference(full_intersect_gpc)),
    '2019': len(pcgdf2019_set.difference(pcg_2019_and_2021, pcg_2023_and_2019)),
    '2021': len(pcgdf2021_set.difference(pcg_2019_and_2021, pcg_2021_and_2023)),
    '2023': len(pcgdf2023_set.difference(pcg_2021_and_2023, pcg_2023_and_2019)),
}

# Two-column table: region label ("type") and its size ("value")
pc_gene_overlap_venn_values = pd.DataFrame(
    list(overlap_lengths.items()), columns=['type', 'value']
)

pc_gene_overlap_venn_values.to_csv(
    "../../tables/annotation_comparison/pc_gene_overlap_venn_values_2019_2021_2023.tsv",
    sep='\t',
    index=False,
)

In [32]:
# Show the gene ids labeled protein coding in both 2019 and 2023 but NOT in 2021
print(pcg_2023_and_2019.difference(full_intersect_gpc))

{'ENSG00000204397'}


In [33]:
# Three-way comparison of transcript ids across the 2019, 2021 and 2023 annotations

df2019_set = set(df2019['transcript_id'])
df2021_set = set(df2021['transcript_id'])
df2023_set = set(df2023['transcript_id'])

# Transcript ids shared by each pair of annotations
i_2019_and_2021 = df2019_set.intersection(df2021_set)
i_2021_and_2023 = df2021_set.intersection(df2023_set)
i_2023_and_2019 = df2023_set.intersection(df2019_set)

# Transcript ids present in all three annotations
full_intersect = df2019_set.intersection(df2021_set, df2023_set)

# Size of each exclusive Venn region (pairwise regions exclude the triple overlap)
overlap_lengths = {
    '2019,2021,2023': len(full_intersect),
    '2019,2021': len(i_2019_and_2021.difference(full_intersect)),
    '2021,2023': len(i_2021_and_2023.difference(full_intersect)),
    '2023,2019': len(i_2023_and_2019.difference(full_intersect)),
    '2019': len(df2019_set.difference(i_2019_and_2021, i_2023_and_2019)),
    '2021': len(df2021_set.difference(i_2019_and_2021, i_2021_and_2023)),
    '2023': len(df2023_set.difference(i_2021_and_2023, i_2023_and_2019)),
}

# Two-column table: region label ("type") and its size ("value")
transcript_overlap_venn_values = pd.DataFrame(
    list(overlap_lengths.items()), columns=['type', 'value']
)

transcript_overlap_venn_values.to_csv(
    "../../tables/annotation_comparison/venn_overlap_transcript_2019_2021_2023.tsv",
    sep='\t',
    index=False,
)

In [34]:
# Three-way comparison of protein-coding transcript ids across 2019, 2021 and 2023
df2019_set_pc = set(df2019.loc[df2019['transcript_biotype'] == 'protein_coding', 'transcript_id'])
df2021_set_pc = set(df2021.loc[df2021['transcript_biotype'] == 'protein_coding', 'transcript_id'])
df2023_set_pc = set(df2023.loc[df2023['transcript_biotype'] == 'protein_coding', 'transcript_id'])

# Protein-coding transcript ids shared by each pair of annotations
i_2019_and_2021_pc = df2019_set_pc.intersection(df2021_set_pc)
i_2021_and_2023_pc = df2021_set_pc.intersection(df2023_set_pc)
i_2023_and_2019_pc = df2023_set_pc.intersection(df2019_set_pc)

# Protein-coding transcript ids present in all three annotations
full_intersect_pc = df2019_set_pc.intersection(df2021_set_pc, df2023_set_pc)

# Size of each exclusive Venn region (pairwise regions exclude the triple overlap)
overlap_lengths_pc = {
    '2019,2021,2023': len(full_intersect_pc),
    '2019,2021': len(i_2019_and_2021_pc.difference(full_intersect_pc)),
    '2021,2023': len(i_2021_and_2023_pc.difference(full_intersect_pc)),
    '2023,2019': len(i_2023_and_2019_pc.difference(full_intersect_pc)),
    '2019': len(df2019_set_pc.difference(i_2019_and_2021_pc, i_2023_and_2019_pc)),
    '2021': len(df2021_set_pc.difference(i_2019_and_2021_pc, i_2021_and_2023_pc)),
    '2023': len(df2023_set_pc.difference(i_2021_and_2023_pc, i_2023_and_2019_pc)),
}

# Two-column table: region label ("type") and its size ("value")
transcript_overlap_venn_values_pc = pd.DataFrame(
    list(overlap_lengths_pc.items()), columns=['type', 'value']
)

transcript_overlap_venn_values_pc.to_csv(
    "../../tables/annotation_comparison/venn_overlap_transcript_2019_2021_2023_protein_coding.tsv",
    sep='\t',
    index=False,
)

In [35]:
# Show the transcript ids labeled protein coding in both 2019 and 2023 but NOT in 2021
print(i_2023_and_2019_pc.difference(full_intersect_pc))

{'ENST00000343811', 'ENST00000400440', 'ENST00000421643', 'ENST00000528513', 'ENST00000522719'}


## Number of transcripts per gene in 2023 annotation

In [36]:
# Transcript count per gene in the 2023 annotation, sorted by count, written as TSV

# every gene
n_tx_per_gene_2023 = (
    df2023.groupby(['gene_id'])
    .size()
    .reset_index(name="n_tx")
    .sort_values(by=['n_tx'])
)
n_tx_per_gene_2023.to_csv(
    "../../tables/annotation_comparison/n_tx_per_gene_2023.tsv", sep='\t', index=False
)

# genes whose gene biotype is not protein_coding
n_tx_per_gene_2023_npc = (
    df2023.loc[df2023['gene_biotype'] != 'protein_coding']
    .groupby(['gene_id'])
    .size()
    .reset_index(name="n_tx")
    .sort_values(by=['n_tx'])
)
n_tx_per_gene_2023_npc.to_csv(
    "../../tables/annotation_comparison/n_tx_per_npc_gene_2023.tsv", sep='\t', index=False
)

# genes whose gene biotype is protein_coding
n_tx_per_gene_2023_pc = (
    df2023.loc[df2023['gene_biotype'] == 'protein_coding']
    .groupby(['gene_id'])
    .size()
    .reset_index(name="n_tx")
    .sort_values(by=['n_tx'])
)
n_tx_per_gene_2023_pc.to_csv(
    "../../tables/annotation_comparison/n_tx_per_pc_gene_2023.tsv", sep='\t', index=False
)

In [37]:
# Genes with 60 or more annotated transcripts in 2023 (the threshold is inclusive)
n_tx_per_gene_2023_over_60 = n_tx_per_gene_2023[n_tx_per_gene_2023['n_tx'].ge(60)].copy()
n_tx_per_gene_2023_over_60.to_csv(
    "../../tables/annotation_comparison/n_tx_per_gene_2023_over_60.tsv",
    sep='\t',
    index=False,
)

In [38]:
# Report how many genes fall in each isoform-count bucket (all genes, 2023)
for _label, _in_bucket in (
    ('Number of genes with a single annotated isoform', n_tx_per_gene_2023['n_tx'] == 1),
    ('Number of genes with >= 10 annotated isoforms', n_tx_per_gene_2023['n_tx'] >= 10),
    ('Number of genes with >= 60 annotated isoforms', n_tx_per_gene_2023['n_tx'] >= 60),
):
    print(_label)
    print(len(n_tx_per_gene_2023.loc[_in_bucket, 'gene_id']))

Number of genes with a single annotated isoform
38690
Number of genes with >= 10 annotated isoforms
7255
Number of genes with >= 60 annotated isoforms
155


In [39]:
# Report how many protein-coding genes fall in each isoform-count bucket (2023)
for _label, _in_bucket in (
    ('Number of pc genes with a single annotated isoform', n_tx_per_gene_2023_pc['n_tx'] == 1),
    ('Number of pc genes with >= 10 annotated isoforms', n_tx_per_gene_2023_pc['n_tx'] >= 10),
    ('Number of pc genes with >= 60 annotated isoforms', n_tx_per_gene_2023_pc['n_tx'] >= 60),
):
    print(_label)
    print(len(n_tx_per_gene_2023_pc.loc[_in_bucket, 'gene_id']))

Number of pc genes with a single annotated isoform
2922
Number of pc genes with >= 10 annotated isoforms
6162
Number of pc genes with >= 60 annotated isoforms
94


### Mean and Median number of RNA isoforms per gene by year

In [40]:
# Stack every yearly annotation into one frame
all_years = pd.concat(
    [df2014, df2015, df2016, df2017, df2018, df2019, df2020, df2021, df2022, df2023],
    ignore_index=True,
)

# Count transcripts per (gene, year), attach each year's median and mean of
# those counts to every row, then collapse to one row per distinct combination.
grouped = (
    all_years.groupby(['gene_id', 'Year'])
    .size()
    .reset_index(name='n_tx')
    .assign(
        median_n_tx_per_year=lambda counts: counts.groupby('Year')['n_tx'].transform('median'),
        mean_n_tx_per_year=lambda counts: counts.groupby('Year')['n_tx'].transform('mean'),
    )
    [['Year', 'median_n_tx_per_year', 'mean_n_tx_per_year']]
    .drop_duplicates()
)
grouped

Unnamed: 0,Year,median_n_tx_per_year,mean_n_tx_per_year
0,2014,1.0,3.306531
1,2015,1.0,3.276455
2,2016,1.0,3.410828
3,2017,1.0,3.421325
4,2018,1.0,3.489032
5,2019,1.0,3.541319
6,2020,1.0,3.754664
7,2021,1.0,3.863663
8,2022,1.0,4.004923
9,2023,1.0,4.031223


### Mean and Median number of RNA isoforms per protein-coding gene by year

In [41]:
# Stack every yearly annotation into one frame
all_years = pd.concat(
    [df2014, df2015, df2016, df2017, df2018, df2019, df2020, df2021, df2022, df2023],
    ignore_index=True,
)

# Keep only transcripts that belong to protein-coding genes
pc_all_years = all_years.loc[all_years['gene_biotype'] == 'protein_coding']

# Count transcripts per (gene, year), attach each year's median and mean of
# those counts to every row, then collapse to one row per distinct combination.
pc_grouped = (
    pc_all_years.groupby(['gene_id', 'Year'])
    .size()
    .reset_index(name='n_tx')
    .assign(
        median_n_tx_per_year=lambda counts: counts.groupby('Year')['n_tx'].transform('median'),
        mean_n_tx_per_year=lambda counts: counts.groupby('Year')['n_tx'].transform('mean'),
    )
    [['Year', 'median_n_tx_per_year', 'mean_n_tx_per_year']]
    .drop_duplicates()
)
pc_grouped

Unnamed: 0,Year,median_n_tx_per_year,mean_n_tx_per_year
0,2014,6.0,7.175713
1,2015,6.0,7.240555
2,2016,6.0,7.283453
3,2017,6.0,7.375983
4,2018,6.0,7.50904
5,2019,6.0,7.611242
6,2020,6.0,7.660206
7,2021,6.0,7.986124
8,2022,6.0,8.308092
9,2023,6.0,8.509278
