In [1]:
import pandas as pd

In [11]:
# Havana protein-coding gene tables, one file per species, read from the
# notebook's working directory.
human_genes_path = "human_protein_coding_genes_havana.txt"
mouse_genes_path = "mouse_protein_coding_genes_havana.txt"

human_genes = pd.read_csv(human_genes_path)
mouse_genes = pd.read_csv(mouse_genes_path)

  human_genes = pd.read_csv("human_protein_coding_genes_havana.txt")


For every human and mouse gene, select the furthest UTR boundaries across all of its rows (the smallest start and the largest end of each UTR). There are 16573 human genes and 945 mouse genes in the Havana dataset.

In [12]:
def furthest_utr_boundaries(genes):
    """Return the outermost 5' and 3' UTR boundaries for every gene.

    Rows of ``genes`` are grouped by "Gene stable ID"; for each UTR the
    smallest start and the largest end found in the group are kept.

    Parameters
    ----------
    genes : pandas.DataFrame
        Table with the columns "Gene stable ID", "5' UTR start",
        "5' UTR end", "3' UTR start" and "3' UTR end".

    Returns
    -------
    pandas.DataFrame
        One row per gene (indexed by "Gene stable ID") with the columns
        ``utr5_start``, ``utr5_end``, ``utr3_start`` and ``utr3_end``.
        A value is NaN when the gene has no non-missing entry for it.
    """
    # The reductions are given as strings rather than the builtins min/max:
    # pandas >= 2.1 warns when builtins are passed to agg and plans to call
    # them directly instead of the NaN-skipping group reductions.
    return genes.groupby(["Gene stable ID"]).agg(
        utr5_start=("5' UTR start", "min"),
        utr5_end=("5' UTR end", "max"),
        utr3_start=("3' UTR start", "min"),
        utr3_end=("3' UTR end", "max"),
    )


human_genes_utrs = furthest_utr_boundaries(human_genes)
mouse_genes_utrs = furthest_utr_boundaries(mouse_genes)

# Number of distinct genes per species.
print(len(human_genes_utrs))
print(len(mouse_genes_utrs))

16573
945


About 16% of the human genes in the Havana dataset don't have annotated UTR boundaries (16.4% lack 5' UTR boundaries and 16.0% lack 3' UTR boundaries).

In [4]:
# Fraction of human genes with a missing value, per UTR boundary column
# (the mean of a boolean mask is the share of True entries).
human_genes_utrs.isna().mean()

utr5_start    0.164225
utr5_end      0.164225
utr3_start    0.159946
utr3_end      0.159946
dtype: float64

Almost 25% of the mouse genes in the Havana dataset don't have annotated 5' UTR boundaries and almost 20% of the mouse genes in the Havana dataset don't have annotated 3' UTR boundaries.

In [5]:
# Fraction of mouse genes with a missing value, per UTR boundary column
# (the mean of a boolean mask is the share of True entries).
mouse_genes_utrs.isna().mean()

utr5_start    0.249735
utr5_end      0.249735
utr3_start    0.198942
utr3_end      0.198942
dtype: float64

Keep only the genes that have both their 5' and 3' UTR boundaries annotated.

In [6]:
# Discard every human gene (row) that has at least one missing value.
human_genes_utrs = human_genes_utrs.dropna(axis="index", how="any")

In [7]:
# Discard every mouse gene (row) that has at least one missing value.
mouse_genes_utrs = mouse_genes_utrs.dropna(axis="index", how="any")

Calculate the length of the 3' UTR and 5' UTR for every gene.

In [8]:
# Length columns shared by both species: the distance between the furthest
# end and the furthest start of each UTR. `assign` calls each function with
# the frame it is applied to.
# NOTE(review): if the coordinates are inclusive, the nucleotide count would
# be end - start + 1 -- confirm against the data source.
utr_length_columns = {
    "utr5_len": lambda utrs: utrs["utr5_end"] - utrs["utr5_start"],
    "utr3_len": lambda utrs: utrs["utr3_end"] - utrs["utr3_start"],
}

human_genes_utrs = human_genes_utrs.assign(**utr_length_columns)
mouse_genes_utrs = mouse_genes_utrs.assign(**utr_length_columns)

75% of the human protein-coding genes in the Havana dataset have 5' UTR length at most 6504 nucleotides. 75% of the human protein-coding genes in the Havana dataset have 3' UTR length at most 2113 nucleotides.

In [9]:
# Summary statistics of the human UTR lengths, rounded to two decimals.
(
    human_genes_utrs[["utr5_len", "utr3_len"]]
    .describe()
    .round(2)
)

Unnamed: 0,utr5_len,utr3_len
count,12685.0,12685.0
mean,12783.65,5199.4
std,54112.14,29879.33
min,0.0,0.0
25%,124.0,337.0
50%,877.0,887.0
75%,6504.0,2113.0
max,1948259.0,1501408.0


75% of the mouse protein-coding genes in the Havana dataset have 5' UTR length at most about 1860 nucleotides. 75% of the mouse protein-coding genes in the Havana dataset have 3' UTR length at most 2396 nucleotides.

In [10]:
# Summary statistics of the mouse UTR lengths, rounded to two decimals.
(
    mouse_genes_utrs[["utr5_len", "utr3_len"]]
    .describe()
    .round(2)
)

Unnamed: 0,utr5_len,utr3_len
count,659.0,659.0
mean,6635.72,6124.83
std,61019.7,23821.47
min,1.0,1.0
25%,65.0,195.0
50%,259.0,728.0
75%,1860.5,2396.0
max,1359782.0,374901.0


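For both human and mouse genes in Havana, the maximum UTR length exceeds one million nucleotides! Note that each length is the distance between the furthest start and the furthest end recorded for a gene, so it measures the span covered by the gene's UTR annotations rather than the length of any single UTR.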