In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.stats import stats
from statannot import add_stat_annotation
from Bio.SeqUtils import GC

## Import of Data

In [None]:
#Load the RIP-seq table of enriched transcripts.
#header=[1]: the column names are read from the second row (0-based index 1) of the sheet.
df = pd.read_excel(
    r"C:\...\Enriched transcripts.xlsx",
    header=[1],
)

In [None]:
#Co-regulon gene IDs listed on the "NS APEAL RIP up 0.8" sheet.
#Missing cells are dropped and each ID is kept once, in order of first appearance.
df2 = pd.read_excel(
    r"C:\...\co_regulons.xlsx",
    sheet_name="NS APEAL RIP up 0.8",
)
co_regulon_ids = df2['Co-regulons'].dropna().unique()
coRegs_mock = list(co_regulon_ids)

In [None]:
#Co-regulon gene IDs listed on the "HS APEAL RIP up 0.8" sheet.
#Missing cells are dropped and each ID is kept once, in order of first appearance.
#NOTE: `df2` is rebound here, so it no longer holds the "NS" sheet after this cell runs.
df2 = pd.read_excel(
    r"C:\...\co_regulons.xlsx",
    sheet_name="HS APEAL RIP up 0.8",
)
co_regulon_ids = df2['Co-regulons'].dropna().unique()
coRegs_heat = list(co_regulon_ids)

In [None]:
#Import of 5' UTR sequences downloaded from TAIR
#Code block may be used to import 3' UTR and CDS sequences
from collections import defaultdict


def read_fasta(path):
    """Read a FASTA file into a ``{record name: sequence}`` mapping.

    Parameters
    ----------
    path : str
        Location of the FASTA file.

    Returns
    -------
    collections.defaultdict
        Maps each record name to its sequence, with multi-line sequences
        joined into one string. The record name is the header text between
        ">" and the first "|", with surrounding whitespace removed.
        (Assumes headers look like ">AT1G01010.1 | ..." -- confirm against
        the downloaded file.)

    Notes
    -----
    Blank lines are skipped rather than treated as end-of-file, so a stray
    empty line can no longer silently truncate the import. Sequence text that
    appears before the first header is ignored.
    """
    sequences=defaultdict(str)
    name=None
    with open(path) as handle:
        for raw_line in handle:
            line=raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                name=line[1:].split("|",1)[0].strip()
                continue
            if name is None:
                continue
            sequences[name]+=line
    return sequences


dic=read_fasta(r"C:\...\Araport11_5_utr_20220504")

#Keep only the ".1" isoform of every gene and strip that suffix, so that the keys
#can be looked up with the gene IDs of the RIP-seq table.
#endswith() is used so that isoforms such as ".10" or ".11" are not picked up by mistake.
UTR5=dict(dic)
UTR5={transcript_id[:-2]: sequence
      for transcript_id,sequence in UTR5.items()
      if transcript_id.endswith(".1")}

## Data generation for plots

In [None]:
#LogFC lookup (gene ID -> log ratio) in mock conditions for the whole transcriptome.
#binning() reads this `LogFC` mapping when it collects the values of each bin.
ids=df["geneID"].to_list()
mock_LogFC=df["Log ratio mock"].to_list()

LogFC={gene_id: log_ratio for gene_id,log_ratio in zip(ids,mock_LogFC)}

In [None]:
#LogFC lookup (gene ID -> log ratio) in heat conditions for the whole transcriptome.
#To be used instead of the mock cell above when analysing heat.
#NOTE: this rebinds the same `LogFC` name, so after a top-to-bottom run of the notebook
#`LogFC` holds the heat values; run only the cell that matches the condition being analysed.
ids=df["geneID"].to_list()
heat_LogFC=df["Log ratio heat"].to_list()

LogFC={gene_id: log_ratio for gene_id,log_ratio in zip(ids,heat_LogFC)}

In [None]:
#Genes enriched in PBs in mock conditions: log ratio above the cutoff of 1.
mock_genes=df.loc[df["Log ratio mock"]>1,"geneID"].to_list()

#GC% of the 5' UTR for every enriched gene that has a sequence in UTR5
#(genes without a sequence are skipped). To calculate Length use len() instead of GC().
mock_enriched={gene_id: GC(UTR5[gene_id]) for gene_id in mock_genes if gene_id in UTR5}

In [None]:
#Genes enriched in PBs in heat conditions: log ratio above the cutoff of 1.
heat_genes=df.loc[df["Log ratio heat"]>1,"geneID"].to_list()

#GC% of the 5' UTR for every enriched gene that has a sequence in UTR5
#(genes without a sequence are skipped). To calculate Length use len() instead of GC().
heat_enriched={gene_id: GC(UTR5[gene_id]) for gene_id in heat_genes if gene_id in UTR5}

In [None]:
#GC% for every gene of the RIP-seq table that has a sequence in UTR5
#(genes without a sequence are skipped).
all_genes={gene_id: GC(UTR5[gene_id]) for gene_id in df["geneID"].to_list() if gene_id in UTR5}

In [None]:
#GC% for the mock co-regulon genes that have a sequence in UTR5. To calculate Length use len() instead of GC().
#Mind the capitalisation: `coRegs_mock` is the list of gene IDs, `CoRegs_mock` is the {gene ID: GC%} mapping.
CoRegs_mock={gene_id: GC(UTR5[gene_id]) for gene_id in coRegs_mock if gene_id in UTR5}

In [None]:
#GC% for the heat co-regulon genes that have a sequence in UTR5. To calculate Length use len() instead of GC().
#Mind the capitalisation: `coRegs_heat` is the list of gene IDs, `CoRegs_heat` is the {gene ID: GC%} mapping.
CoRegs_heat={gene_id: GC(UTR5[gene_id]) for gene_id in coRegs_heat if gene_id in UTR5}

## Binning of Data

In [None]:
#A single function that bins an input dictionary of sequences either based on GC% or Sequence Length

#Inner bin edges for every supported category.
#Edges (e0, e1, ..., ek) produce the bins "<e0", "e0-e1", ..., "ek<".
_BIN_EDGES={
    "CDS":          (42,44,46),
    "UTR_5":        (35,40,45),
    "UTR_3":        (30,35,40),
    "CDS_len":      (800,1600,2400),
    "UTR_5_len":    (100,200,300),
    "UTR_3_len":    (150,300,450),
    "CDS_coRegs":   (44,46),
    "UTR_5_coRegs": (35,40),
    "UTR_3_coRegs": (30,35),
}


def binning(inputs,category="CDS",logfc=None):
    """Group the LogFC values of genes into bins of a per-gene metric.

    Parameters
    ----------
    inputs : dict
        Maps gene ID to the metric that is binned (GC% or sequence length).
    category : str
        Selects the bin edges; must be one of the keys of ``_BIN_EDGES``.
    logfc : dict, optional
        Maps gene ID to LogFC. When omitted, the notebook-level ``LogFC``
        mapping is used.

    Returns
    -------
    collections.defaultdict
        Maps bin label to the list of LogFC values of the genes in that bin.
        Labels are inserted from the lowest to the highest bin and every
        label is present even when its bin is empty. Within a bin the values
        follow the iteration order of ``inputs``. Lower edges are inclusive
        and upper edges exclusive.

    Raises
    ------
    ValueError
        If ``category`` is not a supported category.
    KeyError
        If a binned gene has no entry in the LogFC mapping.
    """
    if category not in _BIN_EDGES:
        raise ValueError(
            f"Unknown category {category!r}; expected one of {sorted(_BIN_EDGES)}"
        )
    edges=_BIN_EDGES[category]

    if logfc is None:
        logfc=LogFC

    lowest,highest=edges[0],edges[-1]
    bins=defaultdict(list)

    bins[f"<{lowest}"]=[logfc[gene] for gene,value in inputs.items() if value<lowest]
    for low,high in zip(edges,edges[1:]):
        bins[f"{low}-{high}"]=[logfc[gene] for gene,value in inputs.items() if low<=value<high]
    bins[f"{highest}<"]=[logfc[gene] for gene,value in inputs.items() if highest<=value]
    return bins

In [None]:
#Worked example: PB-enriched transcripts (mock) binned by 5' UTR GC%.
#Swap `inputs` and `category` to produce any of the other types of binning.
binned=binning(
    category="UTR_5",
    inputs=mock_enriched,
)

## Statistical Analysis

In [None]:
#Non-parametric ANOVA (Kruskal-Wallis) to check median differences across bins before using a post-hoc test.
#Every bin of `binned` is passed, in dictionary order, so the cell works for any binning category.
#Looking bins up by hard-coded labels would, for another category, test empty samples and
#insert empty phantom bins into `binned`, because `binned` is a defaultdict.
stats.kruskal(*binned.values())

In [None]:
#Dunn's test to find significant pairwise differences in median values
import scikit_posthocs as sp

#Pairwise p-value matrix (Bonferroni-adjusted); only the upper triangle above the diagonal is kept.
results=np.triu(np.asarray(sp.posthoc_dunn(list(binned.values()), p_adjust = 'bonferroni')),1)

#Take every upper-triangle position by index instead of "every non-zero entry": a p-value that
#is exactly 0.0 would otherwise be dropped and shift all following p-values to the wrong bin pair.
#The positions come in row-major order, (0,1),(0,2),...,(1,2),..., which is the order that
#itertools.combinations yields the bin pairs in the figure cell.
indices=np.triu_indices_from(results,k=1)
p_vals=results[indices].tolist()

In [None]:
#Custom Dunn's p-value text annotations // Values greater than 0.05 appear as ns
def format_p_value(p,alpha=0.05):
    """Return the annotation text for one p-value.

    Values that are not below ``alpha`` (including NaN) are shown as "ns".
    Smaller values are written in mathtext scientific notation with two
    decimals, e.g. 1.23e-05 becomes "$1.23\\cdot10^{-05}$".
    """
    if not p<alpha:
        return "ns"
    mantissa,exponent=f"{p:.2e}".split("e")
    return rf"${mantissa}\cdot10^{{{exponent}}}$"


custom_text=[format_p_value(p) for p in p_vals]

## Data Visualization

In [None]:
#Strip-plot points are limited to the range [-3, 4] so that outliers do not affect the y-range values.
#The bin labels and their order are the same as in `binned`.
points={
    label: [value for value in values if -3<=value<=4]
    for label,values in binned.items()
}

In [None]:
#Template for figure generation
#Top panel: box plot of LogFC per bin with the strip-plot points and the Dunn's test annotations.
#Bottom panel: scatter plot of LogFC against the binned metric (GC%) for the same genes.
from itertools import combinations

from matplotlib import gridspec

gs=gridspec.GridSpec(2,1,height_ratios=[4,1])

ax0=plt.subplot(gs[0,0])
#NOTE(review): set_style() is called after ax0 exists, so it presumably only styles ax1 -- confirm this is intended.
sns.set_style("whitegrid")

PROPS = {
    'boxprops':{'facecolor':'#d0d9dd', 'edgecolor':(0,0,0,0.7)},
    'medianprops':{'color':'k'},
    'whiskerprops':{'color':'k'},
    'capprops':{'color':'k'}
}

sns.boxplot(data=list(binned.values()),showfliers=False,ax=ax0,**PROPS)
sns.stripplot(data=list(points.values()),alpha=.8,color="#020202",linewidth=0.1,dodge=True,jitter=True,size=5,ax=ax0)

#box_pairs must follow the same order as p_vals / custom_text: combinations() yields the pairs
#(0,1),(0,2),...,(1,2),..., i.e. the upper triangle of the Dunn's matrix read row by row.
test_results=add_stat_annotation(ax=ax0,x=list(binned.keys()),
                                           y=list(map(lambda x: np.median(x),binned.values())),
                                           box_pairs=list(combinations(list(binned.keys()),2)),
                                           pvalues=p_vals,
                                           text_annot_custom=custom_text,
                                           text_format="simple",
                                           loc="inside",
                                           verbose=1,
                                           perform_stat_test=False,
                                           line_offset_to_box=0.58,
                                           line_height=0.005,
                                           line_offset=0.08)
ax0.set_xticklabels(list(binned.keys()))
ax0.set_ylim([0,4])
ax0.set_title("5' UTR mock")
ax0.set_ylabel("LogFC")

ax1=plt.subplot(gs[1,0])
#One point per binned gene: x is its LogFC, y is its GC%.
#`mock_enriched` must be the same dictionary that was passed to binning() to build `binned`;
#change it here when the template is reused for another input.
scatter_genes=list(mock_enriched)
sns.scatterplot(x=[LogFC[gene] for gene in scatter_genes],
                y=[mock_enriched[gene] for gene in scatter_genes],
                color="k",ax=ax1)
ax1.set_xlim([0.6,4.3])
ax1.set_xlabel("LogFC")
ax1.set_ylabel("GC%")

sns.despine(left=True, bottom=False)
#plt.savefig("plot.tiff")