In [1]:
import pandas as pd
from tqdm import tqdm
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(8,5)})

In [2]:
single = pd.read_csv("../data/single.main.tsv", sep='\t')

In [3]:
single = single.rename(columns={'#Dataset': 'Dataset'})

In [4]:
result = pd.read_csv("../result.native.llr.tsv")
result['esm1b_score'] = pd.to_numeric(result['esm1b_score'], errors='coerce')

In [5]:
def get_esm_score(df1, df2):
    scores = []
    for protein, pos, aa1, aa2 in tqdm(zip(df2["Protein"], df2["Pos.A"], df2["AA1.A"], df2["AA2.A"])):
        try:
            score = df1[(df1.uniprot_id == protein) & (df1.row == aa1 + ' ' + str(pos)) & (df1.column == aa2)]['esm1b_score'].values[0]
            scores.append(score)
        except:
            score = None
            scores.append(score)
    return scores

In [6]:
scores = get_esm_score(result, single)

8520it [01:33, 91.54it/s]


In [7]:
len(scores)

8520

In [8]:
single['Score.Esm1b'] = scores

In [9]:
single["Score.Esm1b"].isna().sum()

29

In [10]:
single[single["Score.Esm1b"].isna()]

Unnamed: 0,Dataset,Protein,Pos.A,AA1.A,AA2.A,Score.A,Score.Esm1b
1026,maveDB_15,SPG1_STRSG,228,Q,A,0.602,
1027,maveDB_15,SPG1_STRSG,228,Q,C,0.034,
1028,maveDB_15,SPG1_STRSG,228,Q,D,-0.134,
1029,maveDB_15,SPG1_STRSG,228,Q,E,-0.071,
1030,maveDB_15,SPG1_STRSG,228,Q,F,0.693,
1031,maveDB_15,SPG1_STRSG,228,Q,G,0.367,
1032,maveDB_15,SPG1_STRSG,228,Q,H,0.294,
1033,maveDB_15,SPG1_STRSG,228,Q,I,0.345,
1034,maveDB_15,SPG1_STRSG,228,Q,K,0.413,
1035,maveDB_15,SPG1_STRSG,228,Q,L,0.458,


In [11]:
single.Protein.unique()

array(['SPG1_STRSG', 'UBE4B_MOUSE', 'PABP_YEAST', 'YAP1_HUMAN',
       'BRCA1_HUMAN', 'UBC9_HUMAN'], dtype=object)

In [19]:
single_not_na = single.dropna()

In [20]:
single_not_na['Score.Esm1b'].isna().sum()

0

In [22]:
mask_6 = (single_not_na['Protein'] == 'BRCA1_HUMAN') & (single_not_na['Dataset'] == 'maveDB_6')
mask_8 = (single_not_na['Protein'] == 'BRCA1_HUMAN') & (single_not_na['Dataset'] == 'maveDB_8')

In [23]:
single_not_na.loc[mask_6, 'Protein'] = 'BRCA1_HUMAN_1'
single_not_na.loc[mask_8, 'Protein'] = 'BRCA1_HUMAN_2'

In [24]:
for prot in single_not_na.Protein.unique():
    print(prot)

SPG1_STRSG
UBE4B_MOUSE
PABP_YEAST
YAP1_HUMAN
BRCA1_HUMAN_2
BRCA1_HUMAN_1
UBC9_HUMAN


In [25]:
corr = pd.DataFrame(columns=[single_not_na.Protein.unique()], index=['spearmanr', 'p_value'])

for prot in single_not_na.Protein.unique():
    corr_res, p_value = spearmanr(single_not_na[single_not_na['Protein'] == prot]['Score.Esm1b'], single_not_na[single_not_na['Protein'] == prot]['Score.A'])
    corr.loc['spearmanr', prot] = corr_res
    corr.loc['p_value', prot] = p_value

corr

Unnamed: 0,SPG1_STRSG,UBE4B_MOUSE,PABP_YEAST,YAP1_HUMAN,BRCA1_HUMAN_2,BRCA1_HUMAN_1,UBC9_HUMAN
spearmanr,0.273374,0.434379,0.616402,0.561296,0.045095,0.266067,0.410937
p_value,0.0,0.0,0.0,0.0,0.0237,0.0,0.0


In [71]:
spearmanr(single_not_na['Score.A'], single_not_na['Score.Esm1b'])

SignificanceResult(statistic=0.37120783532463936, pvalue=1.163958234888451e-275)