In [21]:
events.tab<-read.table('data/fromGTF.SE.txt',sep='\t',header=T)

up.intron.s<-events.tab$upstreamEE

up.intron.e<-events.tab$exonStart_0base

down.intron.s<-events.tab$exonEnd

down.intron.e<-events.tab$downstreamES

out.tab<-cbind(as.character(events.tab$chr),up.intron.s,up.intron.e,as.character(events.tab$geneSymbol),
               up.intron.e-up.intron.s-1,as.character(events.tab$strand),events.tab$ID)

out.tab<-rbind(out.tab,cbind(as.character(events.tab$chr),down.intron.s,down.intron.e,as.character(events.tab$geneSymbol),
               down.intron.e-down.intron.s-1,as.character(events.tab$strand),events.tab$ID))

write.table(out.tab,'flanking_introns.bed',sep='\t',col.names = F,row.names = F,quote = F)


In [22]:
system('conda install -c bioconda bedtools')

In [23]:
system('bedtools intersect -wa -wb -s -F 1 -a flanking_introns.bed -b repeats.bed > intron_alu_intersections.bed')

all.intron.alu.intr<-read.table('intron_alu_intersections.bed',sep='\t',header = F)

colnames(all.intron.alu.intr)<-c('chr','up_intr_s','up_intr_e','gene','up_intr_l','strand','ID','chr','alu_s','alu_e','alu','blank','strand')


In [24]:
all.intron.alu.intr<-all.intron.alu.intr[grepl('^Alu',all.intron.alu.intr$alu),]

In [25]:
all.intron.alu.intr=all.intron.alu.intr[!duplicated(paste0(all.intron.alu.intr$ID,all.intron.alu.intr$alu)),]

In [26]:
tissue.list=read.csv('data/tissue_list.csv',header=FALSE)

sb.events=c()

all.events=c()

for (tissue in tissue.list$V2)
{
    
    all.tissue.events=read.csv(paste0('data/se_',tissue,'_AS_model_B_sex_as_events.csv'))
  
    tissue.all.event.ids=lapply(lapply(lapply(as.character(rownames(all.tissue.events)),strsplit,split='-'),unlist)
                                ,'[[',2)
    
    all.events=unique(c(all.events,tissue.all.event.ids))
                        
    tissue.sb.events=read.csv(paste0('data/se_',tissue,'_AS_model_B_sex_as_events_refined.csv'))
    
    tissue.sb.event.ids=lapply(lapply(lapply(as.character(rownames(tissue.sb.events)),strsplit,split='-'),unlist)
                               ,'[[',2)
  
    sb.events=unique(c(sb.events,tissue.sb.event.ids))
    
}

In [59]:
alu.enrich=matrix(ncol=7,nrow=0)

colnames(alu.enrich)<-c('Alu','AluASCount','AluSBCount','TotalAS','TotalSB','PVal','FDR')

num.sb.events=length(sb.events) #total number of sex-biased events

num.as.events=length(all.events) #total number of alternative splicing events

all.events.per.alu=table(all.intron.alu.intr$alu[all.intron.alu.intr$ID %in% all.events])   #number of events per Alu family

sb.events.per.alu=table(all.intron.alu.intr$alu[all.intron.alu.intr$ID %in% sb.events]) #number of sex-biased events per Alu family

for (i in (1:length(all.events.per.alu)))    
{
  
  cur.alu<-names(all.events.per.alu)[i]  #get the name of the next Alu family
  
  if (!cur.alu %in% names(sb.events.per.alu) | all.events.per.alu[cur.alu]<10 | !grepl('^Alu',cur.alu))
    
    next
  
  alu.sb<-sb.events.per.alu[cur.alu]  #sex-biased events with this Alu family
  
  alu.as<-all.events.per.alu[cur.alu]  #all AS events with this Alu family
  
  pval<-1-exp(phyper(q=alu.sb, m=alu.as, n=num.as.events-alu.as, k=num.sb.events,log.p = T))
  
  alu.enrich<-rbind(alu.enrich, c(cur.alu,alu.as,alu.sb,num.as.events,num.sb.events,pval,0))
  
}



In [60]:
alu.enrich[,'FDR']=p.adjust(alu.enrich[,'PVal'],method='BH')
alu.enrich[order(alu.enrich[,'PVal']),]

Alu,AluASCount,AluSBCount,TotalAS,TotalSB,PVal,FDR
AluSx3,74,16,39970,3948,0.0007370365364345,0.0125182490698012
AluSq,53,12,39970,3948,0.0015900662971025,0.0125182490698012
AluJr,136,24,39970,3948,0.0017070339640638,0.0125182490698012
AluY,162,26,39970,3948,0.0048060426980332,0.0220953626227518
AluSx,242,36,39970,3948,0.0050216733233526,0.0220953626227518
AluSp,139,21,39970,3948,0.0180048318219698,0.0660177166805559
AluJb,202,28,39970,3948,0.0261116687331677,0.0820652445899556
AluSz6,126,18,39970,3948,0.0410455072282917,0.109617340347962
AluSx1,187,25,39970,3948,0.0471120529244781,0.109617340347962
AluSx4,21,4,39970,3948,0.0498260637945281,0.109617340347962


In [29]:
esr.tab<-read.table('data/all-byclass-noM-plusminus-hits-uniq.txt',sep=' ',header = T)

chr<-unlist(lapply(lapply(lapply(lapply(as.character(esr.tab$item_coordinates),strsplit,split=':'),'[[',1),'[[',1),'[',1))

start<-unlist(lapply(lapply(lapply(as.character(esr.tab$item_coordinates),strsplit,split=':'),'[[',1),'[',2))

start<-unlist(lapply(lapply(lapply(start,strsplit,split='-'),'[[',1),'[',1))

end<-unlist(lapply(lapply(lapply(as.character(esr.tab$item_coordinates),strsplit,split=':'),'[[',1),'[',2))

end<-unlist(lapply(lapply(lapply(end,strsplit,split='-'),'[[',1),'[',2))

end<-unlist(lapply(lapply(lapply(end,strsplit,split=','),'[[',1),'[',1))

strand<-unlist(lapply(lapply(lapply(as.character(esr.tab$item_coordinates),strsplit,split=':'),'[[',1),'[',2))

strand<-unlist(lapply(lapply(lapply(strand,strsplit,split=','),'[[',1),'[',2))

out.tab<-cbind(chr,start,end,rep('NA',length(chr)),as.integer(end)-as.integer(start)-1,strand,as.character(esr.tab$sequence_name),esr.tab$item_antisense)

write.table(out.tab,'alu_with_esr.bed',sep='\t',col.names = F,row.names = F,quote = F)

system('bedtools intersect -wa -wb -s -F 1 -a flanking_introns.bed -b alu_with_esr.bed > intron_esr_intersections.bed')


In [30]:
rm(out.tab)

rm(esr.tab)

rm(chr)

rm(start)

rm(end)

rm(strand)

In [55]:
alu.esr.tab<-read.table('intron_esr_intersections.bed',sep='\t',header = F)

alu.esr.tab<-alu.esr.tab[grepl('^Alu',alu.esr.tab$V14),]

alu.esr.tab<-alu.esr.tab[alu.esr.tab$V15==TRUE,]

alu.esr.tab<-alu.esr.tab[!duplicated(paste0(alu.esr.tab$V7,alu.esr.tab$V14)),]

alu.esr.tab<-alu.esr.tab[alu.esr.tab$V7 %in% all.events,]

alu.types<-table(as.character(alu.esr.tab$V14))

sb.esr.tab<-table(as.character(alu.esr.tab$V14[alu.esr.tab$V7 %in% sb.events]))

alu.esr.enrich<-matrix(ncol=7,nrow=0)

colnames(alu.esr.enrich)<-c('Alu','CountAS','CountSB','TotalAS','TotalSB','PVal','FDR')

for (i in (1:length(alu.types)))
{
  
  cur.cat<-names(alu.types)[i]  
  
  count.sb<-sb.esr.tab[cur.cat]
  
  count.as<-alu.types[cur.cat]
  
  if (is.na(count.sb) || all.events.per.alu[cur.cat]+sb.events.per.alu[cur.cat]<25 || count.sb<25)
    
    next
          
  pval<-1exp(phyper(q=count.sb, m=count.as, n=all.events.per.alu[cur.cat]-count.as, k=sb.events.per.alu[cur.cat],log.p = T))
  
  alu.esr.enrich<-rbind(alu.esr.enrich, c(cur.cat,count.as,count.sb,all.events.per.alu[cur.cat],sb.events.per.alu[cur.cat],pval,0))
}



In [37]:
alu.esr.enrich[order(alu.esr.enrich[,'PVal']),]

Alu,CountAS,CountSB,TotalAS,TotalSB,PVal,FDR
AluJo,98,14,130,14,0.0,0
AluSc,78,9,79,9,0.0,0
AluSc8,54,8,57,8,0.0,0
AluSg,90,8,90,8,0.0,0
AluSg4,17,1,17,1,0.0,0
AluSg7,23,3,24,3,0.0,0
AluSq,52,12,53,12,0.0,0
AluSq2,136,16,137,16,0.0,0
AluSq4,4,2,4,2,0.0,0
AluYf1,7,2,8,2,0.0,0


In [51]:

cur.cat='AluSc'
 
  
  count.sb<-sb.esr.tab[cur.cat]
  
  count.as<-alu.types[cur.cat]
  
  pval<-(phyper(q=count.sb, m=count.as, n=all.events.per.alu[cur.cat]-count.as, k=sb.events.per.alu[cur.cat],log.p = T,lower.tail = F))
  
print(count.as)

AluSc 
   78 


In [35]:
alu.types


   AluJb    AluJo    AluJr   AluJr4    AluSc   AluSc5   AluSc8    AluSg 
     182       98      106       32       78       16       54       90 
  AluSg4   AluSg7    AluSp    AluSq  AluSq10   AluSq2   AluSq4    AluSx 
      17       23      138       52        5      136        4      227 
  AluSx1   AluSx3   AluSx4    AluSz   AluSz6     AluY   AluYb8    AluYc 
     184       69       17      181      119       82        1        1 
  AluYf1 AluYh3a3   AluYj4  AluYk11 
       7        7        1        1 