In [1]:
import pandas as pd
import os
import numpy as np
from scipy.stats import pearsonr
from tqdm import tqdm

In [2]:
def get_esm_score_for_с(df1, df2):
    scores = []
    for protein, pos, aa1, aa2 in tqdm(zip(df2["Protein"], df2["Pos.C"], df2["AA1.C"], df2["AA2.C"])):
        try:
            score = df1[(df1.uniprot_id == protein) & (df1.row == aa1 + ' ' + str(pos)) & (df1.column == aa2)]['esm1b_score'].values[0]
        except:
            score = None
        scores.append(score)
    return scores

In [3]:
def get_esm_score_for_a(df1, df2):
    scores = []
    for protein, pos, aa1, aa2 in tqdm(zip(df2["Protein"], df2["Pos.A"], df2["AA1.A"], df2["AA2.A"])):
        try:
            score = df1[(df1.uniprot_id == protein) & (df1.row == aa1 + ' ' + str(pos)) & (df1.column == aa2)]['esm1b_score'].values[0]
        except:
            score = None
        scores.append(score)
    return scores

In [4]:
res = pd.read_csv("result.native.llr.tsv")
res['esm1b_score'] = pd.to_numeric(res['esm1b_score'], errors='coerce')
res = res[~res['esm1b_score'].isna()]


# UBC9

In [14]:
ubc9 = pd.read_csv("data_per_dataset/pairs_ubc9.tsv", sep='\t')

In [15]:
ubc9["Score.A'.Esm1b"] = None

In [16]:
%%time

scores_c = get_esm_score_for_с(res, ubc9)
scores_a = get_esm_score_for_a(res, ubc9)

ubc9['Score.C.Esm1b'] = scores_c
ubc9['Score.A.Esm1b'] = scores_a

2302it [00:24, 94.81it/s]
2302it [00:24, 93.83it/s]

CPU times: user 48.5 s, sys: 376 ms, total: 48.8 s
Wall time: 48.8 s





In [17]:
%%time

for i, row in enumerate(ubc9.iterrows()):
    _, uniprot_id, pos_c, aa1_c, aa2_c, score_c, pos_a, aa1_a, aa2_a, score_a, score_a_dash, score_delta, score_a_dash, score_c_esm, score_a_esm = row[1]
    try:
        data = pd.read_csv(f'res_all_scores/ubc9/ubc9_all/ubc9_all/UBC9_{aa1_c}{pos_c}{aa2_c}.modified.tsv')
        ubc9.loc[i, "Score.A'.Esm1b"] = data[(data['row'] == aa1_a + ' ' + str(pos_a)) & (data.column == aa2_a)]['esm1b_score'].values[0]
    except:
        ('NO')

CPU times: user 3.99 s, sys: 73.1 ms, total: 4.07 s
Wall time: 4.26 s


In [19]:
ubc9["Score.A'.Esm1b"].isna().sum()

1

In [28]:
ubc9.to_csv('ubc9_all.tsv', sep='\t', index=False)

### YAP1

In [30]:
yap1 = pd.read_csv("data_per_dataset/pairs_yap1.tsv", sep='\t')

In [31]:
yap1.shape

(38720, 12)

In [33]:
yap1["Score.A'.Esm1b"] = None

In [41]:
%%time

for i, row in tqdm(enumerate(yap1.iterrows())):
    _, uniprot_id, pos_c, aa1_c, aa2_c, score_c, pos_a, aa1_a, aa2_a, score_a, score_a_dash, score_delta, score_a_dash, score_c_esm, score_a_esm = row[1]
    try:
        data = pd.read_csv(f'res_all_scores/yap1/yap1_all/YAP1_{aa1_c}{pos_c}{aa2_c}.modified.tsv')
        yap1.loc[i, "Score.A'.Esm1b"] = data[(data['row'] == aa1_a + ' ' + str(pos_a)) & (data.column == aa2_a)]['esm1b_score'].values[0]
    except:
        ('NO')

38720it [02:28, 260.97it/s]

CPU times: user 2min 25s, sys: 2.3 s, total: 2min 28s
Wall time: 2min 28s





In [42]:
yap1["Score.A'.Esm1b"].isna().sum()

0

In [32]:
%%time

scores_c = get_esm_score_for_с(res, yap1)
scores_a = get_esm_score_for_a(res, yap1)

yap1['Score.C.Esm1b'] = scores_c
yap1['Score.A.Esm1b'] = scores_a

38720it [06:52, 93.86it/s]
38720it [06:56, 92.86it/s]

CPU times: user 13min 43s, sys: 4.83 s, total: 13min 48s
Wall time: 13min 49s





In [43]:
yap1.to_csv('res_fin_df/yap1_all.tsv', sep='\t', index=False)

### BRCA1 db 6

In [44]:
brca1_db6 = pd.read_csv("data_per_dataset/pairs_brca1_db6.tsv", sep='\t')

In [46]:
%%time

scores_c = get_esm_score_for_с(res, brca1_db6)
scores_a = get_esm_score_for_a(res, brca1_db6)

brca1_db6['Score.C.Esm1b'] = scores_c
brca1_db6['Score.A.Esm1b'] = scores_a

4304it [00:46, 92.17it/s]
4304it [00:46, 92.16it/s]

CPU times: user 1min 32s, sys: 722 ms, total: 1min 33s
Wall time: 1min 33s





In [47]:
brca1_db6["Score.A'.Esm1b"] = None

In [48]:
%%time

for i, row in tqdm(enumerate(brca1_db6.iterrows())):
    _, uniprot_id, pos_c, aa1_c, aa2_c, score_c, pos_a, aa1_a, aa2_a, score_a, score_a_dash, score_delta, score_a_dash, score_c_esm, score_a_esm = row[1]
    try:
        data = pd.read_csv(f'res_all_scores/brca1_db6/brca1_db6_all/BRCA1_db6_{aa1_c}{pos_c}{aa2_c}.modified.tsv')
        brca1_db6.loc[i, "Score.A'.Esm1b"] = data[(data['row'] == aa1_a + ' ' + str(pos_a)) & (data.column == aa2_a)]['esm1b_score'].values[0]
    except:
        ('NO')

4304it [00:52, 81.88it/s] 

CPU times: user 50.5 s, sys: 972 ms, total: 51.4 s
Wall time: 52.6 s





In [48]:
brca1_db6["Score.A'.Esm1b"].notna().sum()

4218

In [34]:
brca1_db6.shape

(4304, 13)

In [49]:
brca1_db6.to_csv('res_fin_df/brca1_db6_all.tsv', sep='\t', index=False)

### BRCA1 db 8

In [3]:
brca1_db8 = pd.read_csv("data_per_dataset/pairs_brca1_db8.tsv", sep='\t')

In [12]:
%%time

scores_c = get_esm_score_for_с(res, brca1_db8)
scores_a = get_esm_score_for_a(res, brca1_db8)

brca1_db8['Score.C.Esm1b'] = scores_c
brca1_db8['Score.A.Esm1b'] = scores_a

8426it [01:36, 87.30it/s]
8426it [01:34, 89.07it/s]

CPU times: user 3min 8s, sys: 1.5 s, total: 3min 10s
Wall time: 3min 11s





In [13]:
brca1_db8['Score.A.Esm1b'].isna().sum()

76

In [14]:
brca1_db8["Score.A'.Esm1b"] = None

In [17]:
%%time

for i, row in tqdm(enumerate(brca1_db8.iterrows())):
    _, uniprot_id, pos_c, aa1_c, aa2_c, score_c, pos_a, aa1_a, aa2_a, score_a, score_a_dash, score_delta, score_a_dash, score_c_esm, score_a_esm = row[1]
    try:
        data = pd.read_csv(f'res_all_scores/brca1_db8_all_fin/BRCA1_db8_{aa1_c}{pos_c}{aa2_c}.modified.tsv')
        brca1_db8.loc[i, "Score.A'.Esm1b"] = data[(data['row'] == aa1_a + ' ' + str(pos_a)) & (data.column == aa2_a)]['esm1b_score'].values[0]
    except:
        ('NO')

8426it [01:36, 87.21it/s] 

CPU times: user 1min 33s, sys: 1.21 s, total: 1min 34s
Wall time: 1min 36s





In [18]:
brca1_db8["Score.A'.Esm1b"].isna().sum()

152

In [21]:
brca1_db8.to_csv('res_fin_df/brca1_db8_all.tsv', sep='\t', index=False)

### PABP

In [50]:
pabp = pd.read_csv("data_per_dataset/pairs_pabp.tsv", sep='\t')

In [51]:
%%time

scores_c = get_esm_score_for_с(res, pabp)
scores_a = get_esm_score_for_a(res, pabp)

pabp['Score.C.Esm1b'] = scores_c
pabp['Score.A.Esm1b'] = scores_a

73022it [13:04, 93.04it/s]
73022it [13:01, 93.41it/s]

CPU times: user 25min 53s, sys: 11.9 s, total: 26min 5s
Wall time: 26min 6s





In [52]:
pabp["Score.A'.Esm1b"] = None

In [53]:
%%time

for i, row in tqdm(enumerate(pabp.iterrows())):
    _, uniprot_id, pos_c, aa1_c, aa2_c, score_c, pos_a, aa1_a, aa2_a, score_a, score_a_dash, score_delta, score_a_dash, score_c_esm, score_a_esm = row[1]
    try:
        data = pd.read_csv(f'res_all_scores/pabp_2/pabp_all/PABP_{aa1_c}{pos_c}{aa2_c}.modified.tsv')
        pabp.loc[i, "Score.A'.Esm1b"] = data[(data['row'] == aa1_a + ' ' + str(pos_a)) & (data.column == aa2_a)]['esm1b_score'].values[0]
    except:
        ('NO')

73022it [05:03, 240.84it/s]

CPU times: user 4min 58s, sys: 4.34 s, total: 5min 2s
Wall time: 5min 3s





In [54]:
pabp["Score.A'.Esm1b"].isna().sum()

0

In [55]:
pabp.to_csv('res_fin_df/pabp_all.tsv', sep='\t', index=False)

### UBE4B

In [56]:
ube4b = pd.read_csv("data_per_dataset/pairs_ube4b.tsv", sep='\t')

In [57]:
ube4b.shape

(103798, 12)

In [58]:
%%time

scores_c = get_esm_score_for_с(res, ube4b)
scores_a = get_esm_score_for_a(res, ube4b)

ube4b['Score.C.Esm1b'] = scores_c
ube4b['Score.A.Esm1b'] = scores_a

103798it [18:33, 93.18it/s]
103798it [18:32, 93.28it/s]

CPU times: user 36min 50s, sys: 17.3 s, total: 37min 7s
Wall time: 37min 6s





In [61]:
ube4b["Score.A'.Esm1b"] = None

In [62]:
%%time

for i, row in tqdm(enumerate(ube4b.iterrows())):
    _, uniprot_id, pos_c, aa1_c, aa2_c, score_c, pos_a, aa1_a, aa2_a, score_a, score_a_dash, score_delta, score_a_dash, score_c_esm, score_a_esm = row[1]
    try:
        data = pd.read_csv(f'res_all_scores/ube4b_all/UBE4B_{aa1_c}{pos_c}{aa2_c}.modified.tsv')
        ube4b.loc[i, "Score.A'.Esm1b"] = data[(data['row'] == aa1_a + ' ' + str(pos_a)) & (data.column == aa2_a)]['esm1b_score'].values[0]
    except:
        ('NO')

103798it [13:21, 129.57it/s]

CPU times: user 13min 10s, sys: 11.3 s, total: 13min 21s
Wall time: 13min 21s





In [63]:
ube4b["Score.A'.Esm1b"].isna().sum()

0

In [64]:
ube4b.to_csv('res_fin_df/ube4b_all.tsv', sep='\t', index=False)

# SPG1

In [12]:
spg1 = pd.read_csv("data_per_dataset/pairs_spg1.tsv", sep='\t')

In [13]:
spg1

Unnamed: 0,#Dataset,Protein,Pos.C,AA1.C,AA2.C,Score.C,Pos.A,AA1.A,AA2.A,Score.A,Score.A',Score.Delta
0,maveDB_15,SPG1_STRSG,228,Q,A,0.602,229,Y,A,-0.788,-3.782,-2.994
1,maveDB_15,SPG1_STRSG,228,Q,A,0.602,229,Y,C,-0.699,-2.562,-1.863
2,maveDB_15,SPG1_STRSG,228,Q,A,0.602,229,Y,D,-6.644,-6.436,0.208
3,maveDB_15,SPG1_STRSG,228,Q,A,0.602,229,Y,E,-6.796,-7.562,-0.766
4,maveDB_15,SPG1_STRSG,228,Q,A,0.602,229,Y,F,0.076,-0.108,-0.184
...,...,...,...,...,...,...,...,...,...,...,...,...
1071829,maveDB_15,SPG1_STRSG,282,E,Y,-2.396,281,T,R,-0.088,-0.126,-0.038
1071830,maveDB_15,SPG1_STRSG,282,E,Y,-2.396,281,T,S,-0.252,-0.310,-0.058
1071831,maveDB_15,SPG1_STRSG,282,E,Y,-2.396,281,T,V,-0.580,-0.515,0.065
1071832,maveDB_15,SPG1_STRSG,282,E,Y,-2.396,281,T,W,-0.326,-0.796,-0.470


In [16]:
%%time

scores_c = get_esm_score_for_с(res, spg1)
scores_a = get_esm_score_for_a(res, spg1)

spg1['Score.C.Esm1b'] = scores_c
spg1['Score.A.Esm1b'] = scores_a

1071834it [3:17:18, 90.54it/s]
1071834it [3:17:44, 90.34it/s]


CPU times: user 6h 29min 45s, sys: 2min 6s, total: 6h 31min 51s
Wall time: 6h 35min 3s


In [19]:
spg1.isna().sum()

#Dataset             0
Protein              0
Pos.C                0
AA1.C                0
AA2.C                0
Score.C              0
Pos.A                0
AA1.A                0
AA2.A                0
Score.A              0
Score.A'             0
Score.Delta          0
Score.C.Esm1b    19484
Score.A.Esm1b    19484
dtype: int64

In [17]:
spg1.to_csv('res_fin_df/spg1_a_c.tsv', sep='\t', index=False)

In [21]:
spg1["Score.A'.Esm1b"] = None

In [22]:
%%time

for i, row in tqdm(enumerate(spg1.iterrows())):
    _, uniprot_id, pos_c, aa1_c, aa2_c, score_c, pos_a, aa1_a, aa2_a, score_a, score_a_dash, score_delta, score_a_dash, score_c_esm, score_a_esm = row[1]
    try:
        data = pd.read_csv(f'res_all_scores/spg1/SPG1_{aa1_c}{pos_c}{aa2_c}.modified.tsv')
        spg1.loc[i, "Score.A'.Esm1b"] = data[(data['row'] == aa1_a + ' ' + str(pos_a)) & (data.column == aa2_a)]['esm1b_score'].values[0]
    except:
        ('NO')

1071834it [58:41, 304.40it/s]


CPU times: user 57min 40s, sys: 51.2 s, total: 58min 31s
Wall time: 58min 41s


In [24]:
spg1.to_csv('res_fin_df/spg1_all.tsv', sep='\t', index=False)

In [23]:
spg1["Score.A'.Esm1b"].isna().sum()

38968