In [None]:
import pandas as pd
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import gridspec
from scipy.stats import pearsonr
from scipy import stats

### Import of Data

In [None]:
# Enriched-transcript table; spreadsheet row 0 is skipped when loading.
df = pd.read_excel(
    r"C:\...\Enriched transcripts.xlsx",
    skiprows=[0],
)

In [None]:
# Two sheets of the same workbook: "HS ..." is loaded as the heat table,
# "NS ..." as the mock table.
coR_Heat = pd.read_excel(
    r"C:\...\co_regulons_new_andria.xlsx",
    sheet_name="HS APEAL RIP up 0.8",
)
coR_Mock = pd.read_excel(
    r"C:\...\co_regulons_new_andria.xlsx",
    sheet_name="NS APEAL RIP up 0.8",
)

In [None]:
# Interactome table; the first three spreadsheet rows are skipped when loading.
df2 = pd.read_excel(
    r"C:\...\P bodies interactome meta analytics_ANDRIA CORRECT values final.xlsx",
    skiprows=[0, 1, 2],
)

In [None]:
#CDS sequences downloaded from TAIR
# Parse the FASTA file into {record name: concatenated sequence}.
# The record name is the header text between ">" and the first "|".
dic=defaultdict(str)
with open("C:/Users/nwntas/Araport11_cds_20220914") as f:
    name=None
    # Iterate to the real end of file: the previous `while line` loop stopped
    # at the first blank line and silently dropped everything after it.
    for line in f:
        line=line.strip()
        if not line:
            continue
        if line.startswith(">"):
            name=line[1:].split("|",1)[0].strip()
            continue
        if name is None:
            raise ValueError("sequence data found before the first FASTA header")
        dic[name]+=line

In [None]:
def _is_usable_cds(name,seq):
    """Return True for a ".1" record whose sequence is a complete, unambiguous ORF.

    A sequence qualifies when it is longer than one codon, its length is a
    multiple of three, it starts with ATG, ends with a stop codon and contains
    only A/C/G/T.
    """
    return (
        # Fixed: the former substring test (".1" in name) also let ".10", ".11", ...
        # through, although later lookups only ever use keys ending in ".1".
        name.endswith(".1")
        and len(seq)>3
        and len(seq)%3==0
        and seq[:3]=="ATG"
        and seq[-3:] in ("TGA","TAA","TAG")
        and set(seq)<={"A","C","G","T"}
    )

# cds: record name -> nucleotide string; codons: record name -> list of triplets.
cds={name:seq for name,seq in dic.items() if _is_usable_cds(name,seq)}
codons={name:[seq[i:i+3] for i in range(0,len(seq),3)] for name,seq in cds.items()}

In [None]:
#Protein sequences downloaded from TAIR
# Parse the FASTA file into {record name: concatenated sequence}.
# The record name is the header text between ">" and the first "|".
dic=defaultdict(str)
with open(r"C:\Users\nwntas\Araport11_pep_20220914") as f:
    name=None
    # Iterate to the real end of file: the previous `while line` loop stopped
    # at the first blank line and silently dropped everything after it.
    for line in f:
        line=line.strip()
        if not line:
            continue
        if line.startswith(">"):
            name=line[1:].split("|",1)[0].strip()
            continue
        if name is None:
            raise ValueError("sequence data found before the first FASTA header")
        dic[name]+=line

In [None]:
# Keep only records whose name ends in ".1" and key them by the name without
# that suffix. Fixed: the former substring test (".1" in name) also matched
# ".10", ".11", ..., for which stripping two characters left a malformed key.
# The last character of every sequence is dropped -- presumably the trailing
# "*" stop symbol of the peptide FASTA; confirm against the input file.
proteins={
    name[:-2]:seq[:-1]
    for name,seq in dic.items()
    if name.endswith(".1")
}

### Calculations

In [None]:
# Unique, non-missing entries of the "Co-regulons" column of each sheet.
coRegs_heat, coRegs_mock = (
    set(sheet["Co-regulons"].dropna().to_list())
    for sheet in (coR_Heat, coR_Mock)
)

In [None]:
# Gene IDs whose log ratio exceeds 1 in the mock / heat column respectively.
mock_enriched = df.loc[df["Log ratio mock"] > 1, "geneID"].to_list()
heat_enriched = df.loc[df["Log ratio heat"] > 1, "geneID"].to_list()

In [None]:
# Proteins with a log ratio of at least 0.8 in either the AP or the PDL column
# ("30 ..." columns feed up_hs, "22 ..." columns feed up_ns).
up_hs = set(
    df2.loc[
        (df2["30 AP LOGratio"] >= 0.8) | (df2["30 PDL LOGratio"] >= 0.8),
        "protein",
    ].to_list()
)

up_ns = set(
    df2.loc[
        (df2["22 AP LOGratio"] >= 0.8) | (df2["22 PDL LOGratio"] >= 0.8),
        "protein",
    ].to_list()
)

In [None]:
# Proteins with a log ratio below -0.8 in either the AP or the PDL column
# ("30 ..." columns feed down_hs, "22 ..." columns feed down_ns).
down_hs = set(
    df2.loc[
        (df2["30 AP LOGratio"] < -0.8) | (df2["30 PDL LOGratio"] < -0.8),
        "protein",
    ].to_list()
)

down_ns = set(
    df2.loc[
        (df2["22 AP LOGratio"] < -0.8) | (df2["22 PDL LOGratio"] < -0.8),
        "protein",
    ].to_list()
)

In [None]:
# Three-letter amino-acid name -> list of DNA codons; "Ter" holds the stop codons.
Gen_code = dict(
    Phe=["TTT", "TTC"],
    Leu=["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],
    Ile=["ATT", "ATC", "ATA"],
    Met=["ATG"],
    Val=["GTT", "GTC", "GTA", "GTG"],
    Ser=["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],
    Pro=["CCT", "CCC", "CCA", "CCG"],
    Thr=["ACT", "ACC", "ACA", "ACG"],
    Ala=["GCT", "GCC", "GCA", "GCG"],
    Tyr=["TAT", "TAC"],
    Ter=["TAA", "TAG", "TGA"],
    His=["CAT", "CAC"],
    Gln=["CAA", "CAG"],
    Asn=["AAT", "AAC"],
    Lys=["AAA", "AAG"],
    Asp=["GAT", "GAC"],
    Glu=["GAA", "GAG"],
    Cys=["TGT", "TGC"],
    Trp=["TGG"],
    Arg=["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],
    Gly=["GGT", "GGC", "GGA", "GGG"],
)

In [None]:
# Per-amino-acid score keyed by three-letter name.
# Source as recorded by the author: calculations from
# "On the physical basis of the Amino Acid PR".
PYR_density = dict(
    Phe=5.0,
    Leu=4.9,
    Ile=4.9,
    Met=5.3,
    Val=5.6,
    Ser=7.5,
    Pro=6.6,
    Thr=6.6,
    Ala=7.0,
    Tyr=5.4,
    His=8.4,
    Gln=8.6,
    Asn=10.0,
    Lys=10.1,
    Asp=13.0,
    Glu=12.5,
    Cys=4.8,
    Trp=5.2,
    Arg=9.1,
    Gly=7.9,
)

In [None]:
# PUR affinity per amino acid, keyed by three-letter name.
# Source as recorded by the author: calculations from
# "Evidence of direct complementary interactions between messenger RNAs and their cognate proteins"
PUR_density = dict(
    Phe=0.11,
    Leu=0.03,
    Ile=0.05,
    Met=-0.12,
    Val=-0.02,
    Ser=0.04,
    Pro=-0.06,
    Thr=0.03,
    Ala=0.02,
    Tyr=0.09,
    His=0.18,
    Gln=0.04,
    Asn=-0.03,
    Lys=-0.07,
    Asp=-0.01,
    Glu=-0.15,
    Cys=-0.13,
    Trp=-0.16,
    Arg=-0.01,
    Gly=-0.02,
)

In [None]:
# One-letter amino-acid code -> three-letter name (the 20 standard residues only).
amino_translation = dict(
    A="Ala",
    C="Cys",
    D="Asp",
    E="Glu",
    F="Phe",
    G="Gly",
    H="His",
    I="Ile",
    K="Lys",
    L="Leu",
    M="Met",
    N="Asn",
    P="Pro",
    Q="Gln",
    R="Arg",
    S="Ser",
    T="Thr",
    V="Val",
    W="Trp",
    Y="Tyr",
)

In [None]:
#This is basically the master function for offseting
def offsetting(seq,prot_seq,offset=0,window=21):
    """Sliding-window profile of protein PYR_density versus mRNA pyrimidine content.

    The coding sequence ``cds[seq + ".1"]`` is cut into triplets starting at
    nucleotide ``offset``; the last triplet of the sequence is left out (with
    ``offset=0`` that is the stop codon).  Triplets are paired, position by
    position, with the residues of ``proteins[prot_seq]`` and each pair becomes
    ``(PYR_density of the residue, (C + T count of the triplet) / 3)``.  The
    pairs are then averaged over runs of ``window`` consecutive positions.

    Parameters
    ----------
    seq : str
        Gene identifier; ".1" is appended to look up the coding sequence.
    prot_seq : str
        Key into ``proteins``.
    offset : int, default 0
        Nucleotide shift applied to the start of every triplet.
    window : int, default 21
        Number of consecutive positions averaged per output element.

    Returns
    -------
    list of numpy.ndarray
        ``len(pairs) - window`` arrays of length 2
        (``[mean PYR_density, mean pyrimidine fraction]``); empty when there are
        not more than ``window`` pairs.  The loop bound is unchanged from the
        original, so the final full window is not emitted.

    Raises
    ------
    KeyError
        If the gene is absent from ``cds``/``proteins`` or a residue has no
        entry in ``amino_translation``.
    ValueError
        If ``window`` is smaller than 1.
    """
    if window<1:
        raise ValueError("window must be a positive integer")
    # Look the transcript up once; it used to be fetched and re-sliced on every
    # iteration of the comprehension.
    transcript=cds[seq+".1"]
    span=len(transcript[offset:])
    codons_new=[transcript[k+offset:k+offset+3] for k in range(0,span-3,3)]
    # Lazy translation: zip() stops at the shorter of residues / triplets.
    residues=(amino_translation[aa] for aa in proteins[prot_seq])
    pairs=[
        (PYR_density[res],(codon.count("C")+codon.count("T"))/3)
        for res,codon in zip(residues,codons_new)
    ]
    return [np.mean(pairs[i:i+window],axis=0) for i in range(len(pairs)-window)]

### Generating plots

In [None]:
def generate_plot(gene,location=1,save=False):
    """Draw the mRNA / protein profile of one gene plus their normalised distance.

    Upper panel: the windowed mRNA pyrimidine fraction (left axis) and the
    windowed protein PYR_density (right axis, inverted), both taken from
    ``offsetting(gene, gene)``; the title carries Pearson's R between the two.
    Lower panel: absolute difference between the two curves after each has been
    min-max scaled to [0, 1] (the protein curve is additionally flipped).

    Parameters
    ----------
    gene : str
        Gene identifier passed to ``offsetting`` for both arguments.
    location : int or str, default 1
        Forwarded as ``loc`` to ``plt.legend``.
    save : bool, default False
        When true the figure is written to ``"<gene>_cognate.tiff"`` at 600 dpi.

    Raises
    ------
    KeyError
        Propagated from ``offsetting`` for unknown genes or residues.
    ValueError
        From ``pearsonr`` / ``min`` when the profile is too short.
    """
    # Compute the profile once; it was previously evaluated twice.
    profile=offsetting(gene,gene)
    density=[d for d,_ in profile]
    content=[c for _,c in profile]

    gs=gridspec.GridSpec(2,1,height_ratios=[4,1])
    ax=plt.subplot(gs[0,0])
    line1, = ax.plot(list(range(len(content))),content)
    ax.set_ylabel("mRNA PYR density")
    ax.set_yticks(np.arange(0.2,1,0.2))
    ax_new = ax.twinx()
    line2, = ax_new.plot(list(range(len(density))),density,color="orange")
    # Inverted so that high protein values line up visually with low mRNA values.
    ax_new.invert_yaxis()
    #ax_new.set_yticks(np.arange(5,10.5,1.5))
    ax_new.set_ylabel("Protein PYR affinity")

    plt.legend(handles=[line1,line2],labels=["mRNA","Protein"],loc=location)
    r_value=pearsonr(density,content)[0]
    ax.set_title(f"{gene} | pearson's R:{r_value:.3f}")

    ax1=plt.subplot(gs[1,0])
    d_min,d_max=min(density),max(density)
    c_min,c_max=min(content),max(content)
    norm_density=np.array([1-(d-d_min)/(d_max-d_min) for d in density])
    norm_content=np.array([(c-c_min)/(c_max-c_min) for c in content])
    similarity_score=np.abs(norm_density-norm_content)

    # Target axes named explicitly instead of relying on pyplot's current axes.
    sns.lineplot(x=range(len(similarity_score)),y=similarity_score,ax=ax1)
    ax1.set_ylim([0,0.7])
    ax1.set_ylabel("Absolute Distance")
    ax1.set_xlabel("Sequence bins")

    if save:
        plt.savefig(f"{gene}_cognate.tiff",dpi=600)

In [None]:
#Example plot for a single gene. To write the figure to disk, change save=False to save=True.
generate_plot("AT1G78080",location=4,save=False)

In [None]:
#Here either mock_enriched or coRegs_mock may be used
# One Pearson's R per gene, between windowed mRNA pyrimidine content and
# windowed protein PYR density.
l=[]
for coReg in mock_enriched:
    try:
        # Fixed: the helper is offsetting(); "offsettings" raised a NameError
        # that the KeyError handler below did not catch.
        profile=offsetting(coReg,coReg)
    except KeyError:
        # Gene missing from the CDS / protein tables, or an unmapped residue.
        continue
    if len(profile)<2:
        # pearsonr needs at least two points; skip sequences that are too short.
        continue
    density=[d for d,_ in profile]
    content=[c for _,c in profile]
    l.append(pearsonr(content,density)[0])

In [None]:
#Here either heat_enriched or coRegs_heat may be used
# One Pearson's R per gene, between windowed mRNA pyrimidine content and
# windowed protein PYR density.
l1=[]
for coReg in heat_enriched:
    try:
        # Fixed: the helper is offsetting(); "offsettings" raised a NameError
        # that the KeyError handler below did not catch.
        profile=offsetting(coReg,coReg)
    except KeyError:
        # Gene missing from the CDS / protein tables, or an unmapped residue.
        continue
    if len(profile)<2:
        # pearsonr needs at least two points; skip sequences that are too short.
        continue
    density=[d for d,_ in profile]
    content=[c for _,c in profile]
    l1.append(pearsonr(content,density)[0])

In [None]:
#Either l or l1 may be used
# Histogram of the per-gene Pearson's R values.
# NOTE(review): the title text is fixed; adjust it by hand when plotting l1.
fig, ax = plt.subplots()
sns.histplot(data=l, ax=ax)
ax.set_xlabel("Pearson's R correlation")
ax.set_title("PYR content/ PYR density\nCo-Regulons NS")

In [None]:
def PUR_PR(x,window=21):
    """Sliding-window mean of PUR_density along the protein ``proteins[x]``.

    Parameters
    ----------
    x : str
        Key into ``proteins``.
    window : int, default 21
        Number of consecutive residues averaged per output element.

    Returns
    -------
    list of float
        ``len(proteins[x]) - window`` window means; empty when the protein has
        not more than ``window`` residues.  The loop bound is unchanged from the
        original, so the final full window is not emitted.

    Raises
    ------
    KeyError
        If ``x`` is not in ``proteins`` or a residue inside one of the windows
        has no entry in ``amino_translation``.
    ValueError
        If ``window`` is smaller than 1.
    """
    if window<1:
        raise ValueError("window must be a positive integer")
    sequence=proteins[x]
    n_windows=len(sequence)-window
    if n_windows<=0:
        return []
    # Score each residue once instead of once per window.  Only residues that
    # fall inside at least one window are translated, exactly as before.
    covered=sequence[:n_windows+window-1]
    scores=[PUR_density[amino_translation[aa]] for aa in covered]
    return [sum(scores[i:i+window])/window for i in range(n_windows)]

In [None]:
#In this cell up_hs may be changed with down_hs
# Windowed PUR scores, one list per protein of the heat set.
ph=[]
for i in up_hs:
    try:
        # Fixed: results were appended to nh, leaving ph empty (and raising
        # NameError on a fresh kernel, where nh does not exist yet).
        ph.append(PUR_PR(i))
    except KeyError:
        # Report identifiers that could not be scored and move on.
        print(i)
        continue

In [None]:
#In this cell up_ns may be changed with down_ns
# Windowed PUR scores, one list per protein of the mock set.
nh = []
for i in up_ns:
    try:
        nh.append(PUR_PR(i))
    except KeyError:
        # Report identifiers that could not be scored and move on.
        print(i)

In [None]:
# Flatten the per-protein window scores into one array per condition
# (a from ph, b from nh).
a, b = map(np.concatenate, (ph, nh))

In [None]:
def _format_pvalue(p):
    r"""Return "ns" for p >= 0.05, else p as a LaTeX math string.

    Example: 1.23e-05 becomes ``$1.23\cdot10^{-05}$``.  The value is split at
    the "e" of its ``.2e`` rendering, so a p-value that underflows to 0.0 is
    formatted instead of raising ValueError as the former ``index("-")``
    lookup did.  NaN compares false against 0.05 and is reported as "ns".
    """
    if not p<0.05:
        return "ns"
    mantissa,exponent=f"{p:.2e}".split("e")
    return "$"+mantissa+r"\cdot10^{"+exponent+"}$"

# Single-element list: the annotation text for the Mann-Whitney U test of a vs b.
ps=[_format_pvalue(stats.mannwhitneyu(a,b)[1])]

In [None]:
# Main panel: overlaid histograms (50 bins, with KDE) of the windowed PUR scores
# in a and b, plus a dashed vertical line at each median.
fig, ax=plt.subplots(figsize=(10,5))
sns.set_style("white")
sns.histplot([a,b],bins=50,kde=True,palette=["#87CEEB","#FFA500"])
# The label list is reversed before being handed to the legend, i.e. the
# legend receives ["Mock", "Heat"].
ax.legend(labels=['Heat',"Mock"][::-1],loc=2)
ax.set_title('Heat/Mock')
ax.set_xlabel("bin PUR affinity score")
# Inset axes; the list is [left, bottom, width, height] in figure coordinates.
ax_sub=fig.add_axes([0.66, 0.4,0.2,0.3])
ax.axvline(np.median(a),color="#87CEEB",linestyle="--")
ax.axvline(np.median(b),color="#FFA500",linestyle="--")

#ax.set_ylim([1,44000])
sns.set_style("whitegrid")
# Box plot of the same two samples without outlier markers. No ax= is given,
# so seaborn draws on the current axes -- presumably ax_sub, the most recently
# created one; confirm.
sec=sns.boxplot(data=[a,b], 
            showfliers=False,palette=["#87CEEB","#FFA500"])

ax_sub.set_xticklabels(['Heat',"Mock"])

# NOTE(review): add_stat_annotation is not imported anywhere in the visible
# cells, so this call raises NameError on a fresh kernel. It looks like the
# helper from the third-party "statannot" package -- confirm and add the import
# to the first cell.
# The Mann-Whitney U p-value is passed in directly (perform_stat_test=False)
# and the text drawn is the preformatted string in ps.
add_stat_annotation(ax=sec,x=['Heat',"Mock"],
                                           y=[np.median(a),np.median(b)],
                                           box_pairs=[('Heat',"Mock")],
                                           pvalues=[stats.mannwhitneyu(a,b)[1]],
                                           text_format="simple",
                                           text_annot_custom=ps,
                                           loc="inside",
                                           verbose=1,
                                           perform_stat_test=False,
                                           line_offset_to_box=0.8)

# Uncomment to write the figure to disk.
#plt.savefig("name.tiff", dpi=600)