# Figure 4c - `Nonsense-mediated decay in sex-biased alternative splicing`

This notebook generates an interactive HTML plot for Figure 4c. This notebook is seperated from the other Figure 4 notebooks because a python kernel is required whereas the other plots require an R kernel.

In [2]:
library(Biostrings)

In [6]:
event.type='SE'
fromGtfFileName <- paste0('../data/fromGTF.',event.type,'.txt')
from.gtf<-read.table(file = fromGtfFileName, sep = "\t", quote = "\"", header = T, stringsAsFactors = F)
res<-matrix(c(rep(0,nrow(from.gtf)),rep(0,nrow(from.gtf)),rep('',nrow(from.gtf))),ncol=3)
colnames(res)<-c('num.nmd','num.transcripts','nmd.ids')

In [None]:


fasta.file='/Users/karleg/STAR/STAR/bin/MacOSX_x86_64/data/GRCh38/sequence/GRCh38_r91.all.fa'


finished=0

for (chr in as.character(unique(from.gtf$chr)))
{
  
  if (chr=='chrY')
    
    next
  
  system(paste0('grep ^',gsub('chr','',chr),' /Users/karleg/STAR/STAR/bin/MacOSX_x86_64/data/GRCh38/annotation/Homo_sapiens.GRCh38.91.gtf > cur_chrom.gtf'))
  
  cur.gtf<-read.table('cur_chrom.gtf',sep='\t',quote="")
  
  for (exon.itr in ((1:nrow(from.gtf))[from.gtf$chr==chr]))
  {
    exon.rows<-which((cur.gtf$V4==from.gtf$exonStart_0base[exon.itr]+1) & (cur.gtf$V5==from.gtf$exonEnd[exon.itr]) & cur.gtf$V3=='exon')
    
    for (exon.row in exon.rows)
    {
      
      transcript.first.row<-max(which((cur.gtf$V3=='transcript') & ((1:nrow(cur.gtf))<exon.row)  ))
      
      if (sum((cur.gtf$V3 %in% c('transcript','gene')) & ((1:nrow(cur.gtf))>exon.row)  )==0)
      {
        transcript.last.row<-nrow(cur.gtf)
      }else{
      transcript.last.row<-min(which((cur.gtf$V3 %in% c('transcript','gene')) & ((1:nrow(cur.gtf))>exon.row)  ))-1
      }
      
      out.gtf<-cur.gtf[transcript.first.row:transcript.last.row,]
      
      if (sum(c(sum((out.gtf$V4==from.gtf$upstreamES[exon.itr]+1) & out.gtf$V5==from.gtf$upstreamEE[exon.itr])>0, 
      
      sum((out.gtf$V4==from.gtf$downstreamES[exon.itr]+1) & out.gtf$V5==from.gtf$downstreamEE[exon.itr])>0))<2)
        
        next
      
      if (sum(out.gtf$V3=='CDS')<3)
        
        next
      
      write.table(out.gtf,"transcript.gtf",sep='\t',col.names = FALSE,row.names = FALSE,quote = FALSE)
      
      system(paste('./gffread -y gene.fa -g ',fasta.file,' transcript.gtf',sep=''))
      
      seq<-readAAStringSet('gene.fa')
      
      if (length(seq)==0)
        
        next
      
      l.inc<-length(seq[[1]])
      
      out.gtf<-out.gtf[(out.gtf$V4!=(from.gtf$exonStart_0base[exon.itr]+1)) & (out.gtf$V5!=(from.gtf$exonEnd[exon.itr])),]
      
      write.table(out.gtf,"transcript.gtf",sep='\t',col.names = FALSE,row.names = FALSE,quote = FALSE)
      
      system(paste('./gffread -y gene.fa -g ',fasta.file,' transcript.gtf',sep=''))
      
      seq<-readAAStringSet('gene.fa')
      
      l.skip<-length(seq[[1]])
      
      skip.exon.aa.length<-(from.gtf$exonEnd[exon.itr]-(from.gtf$exonStart_0base[exon.itr]+1))/3
        
      res[exon.itr,2]<-as.integer(res[exon.itr,2])+1
      
      if (l.inc<(l.skip+skip.exon.aa.length-1))
      {
        res[exon.itr,1]<-as.integer(res[exon.itr,1])+1
        
        res[exon.itr,3]<-paste(res[exon.itr,3],cur.gtf$V9[transcript.first.row],sep='***')
      
      }
    }
    
  }
  
  system('rm cur_chrom.gtf')
  
  finished<-finished+sum(from.gtf$chr==chr)
  
  print(paste0("Finished: ",finished))
  
}


write.table(res,"NMD_summary.txt",sep='\t',row.names = TRUE,col.names = TRUE,quote = FALSE)

## Loading dependencies

In [2]:
import pandas as pd

from plotly.offline import iplot, init_notebook_mode
import plotly.offline as offline
import plotly.graph_objs as go
import plotly.io as pio

import os
import numpy as np

## Figure 4c
3-dimensional plot illustrating the relationship between  gene  expression  and  inclusion  and  exclusion  counts  for  an  exon  skipping  event  in  CDKN2A  in  mammary  tissue.  Females  show  both  higher  gene  expression  as  well  as  higher  skip  counts  than  males.  The  skipped  exon  is  present in isoform 5 of CDKN2A (NM_001195132.1) and causes a frameshift that is predicted to induce NMD.

### Load the input data

In [3]:
data3d = pd.read_csv('../dimorphAS/figures/figure3/3dtable.txt',sep='\t',dtype='object')

### Produce the HTML scatter plot

In [4]:
#init_notebook_mode(connected=True)

Male = go.Scatter3d(
    x = data3d.loc[data3d['Sex']=='male',data3d.columns[0]],
    y = data3d.loc[data3d['Sex']=='male',data3d.columns[1]],
    z=  data3d.loc[data3d['Sex']=='male',data3d.columns[2]],
    mode = 'markers',name='Male', marker=dict(
        size=12,color='rgba(60,84,136,1)',

        opacity=0.8
    )
)

Female = go.Scatter3d(
    x = data3d.loc[data3d['Sex']=='female',data3d.columns[0]],
    y = data3d.loc[data3d['Sex']=='female',data3d.columns[1]],
    z=  data3d.loc[data3d['Sex']=='female',data3d.columns[2]],
    mode = 'markers',name='Female', marker=dict(
        size=12,color='rgba(220,0,0,1)',
        opacity=0.8
    )
)


layout = go.Layout(
    autosize=True,
    font=dict(family='Courier New, monospace',size=18,color='black'),
    scene = dict(
        xaxis = dict(
            title=list(data3d.columns.values)[0]),
        yaxis = dict(
            title=list(data3d.columns.values)[1]),
        zaxis = dict(
            title=list(data3d.columns.values)[2]),),
    width=2000,
    height=1000,
    legend=dict(x=0.45,y=0.3,font=dict(
            family='sans-serif',
            size=36)
    ),
    margin=dict(r=50, b=25,l=25, t=30),
)

data = [Male,Female]

fig = go.Figure(data=data, layout=layout)

offline.plot(fig, filename='basic-scatter')

#init_notebook_mode(connected=True)
#pio.write_image(fig, 'fig3d.pdf')

#pio.write_image(fig, '3d-scatter-colorscale.pdf')


Your filename `basic-scatter` didn't end with .html. Adding .html to the end of your file.



'basic-scatter.html'

## Metadata

For replicability and reproducibility purposes, we also print the following metadata:

1. Checksums of **'artefacts'**, files generated during the analysis and stored in the folder directory **`data`**
2. List of environment metadata, dependencies, versions of libraries using `conda list`

### 1. Checksums with the sha256 algorithm

In [5]:
figure_id = "figure_4c"
os.system("echo true")

print("Generating sha256 checksums of the artefacts in the `..data/` directory .. ")
os.system(f"cd ../data/ && sha256sum * > ../metadata/{figure_id}_sha256sums.txt")
print("Done!\n")

pd.read_csv(f"../metadata/{figure_id}_sha256sums.txt")

Generating sha256 checksums of the artefacts in the `..data/` directory .. 
Done!



Unnamed: 0,ec38fac35613da014f73140da90c95294ba52c6e7923e380a5236762f1ca3793 GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct
0,65683ffcf6df3a68ce9e3401f2f66408231b9b9acd1c04...
1,0133c46eac7fde518f6d851287cd1933443ae3ea759d22...
2,cbabf87ae994cf76eb9b47709f6efb59e43a52af0a65f9...
3,8291be77c7ad6cd73d9f7797658e3f1ffc197532a23385...
4,5b1d46a8d2a5a2556e81d5262a50aa1ff2f31ee621193f...
...,...
4854,295ebbc27fa4e169b131781d772b4e69d528b82e728243...
4855,84f06a1cb756a11cd0306595d171d8739a8cf7fcebcc70...
4856,14d846231f11b4ff7acee9858fe6b39ecaad4079b3c1c5...
4857,66083588f477e50baa7229ee5ca9c34fbf53f7dad7940e...


### 2. Libraries metadata

In [5]:
figure_id = "figure_4c"

print(f"Saving `conda list` packages in ../metadata/{figure_id}_conda_list.txt  ..")
os.system(f"conda list > ../metadata/{figure_id}_conda_list.txt")
print("Done!\n")

Saving `conda list` packages in ../metadata/figure_4c_conda_list.txt  ..
Done!

