In [33]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
sns.set(rc={'figure.figsize':(8,5)})
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr
from tqdm import tqdm
from scipy.special import expit

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [34]:
def get_pos_diff(df):
    pos_diff = np.abs(df['Pos.C'] - df['Pos.A'])
    df['pos_diff'] = pos_diff
    return df

In [35]:
def get_spearmanr_le(df, n):
    return spearmanr(df[df.pos_diff <= n]["Score.Delta"], df[df.pos_diff <= n]["Score.Delta.Esm1b"])

In [36]:
def get_corr(df, list_of_n):
    corr = pd.DataFrame(columns=['count', 'spearmanr', 'p_value'], index=list_of_n)
    for n in list_of_n:
        count = df[df.pos_diff <= n].shape[0]
        corr.loc[n, 'count'] = count
        corr_res, p_value = get_spearmanr_le(df, n)
        corr.loc[n, 'spearmanr'] = corr_res
        corr.loc[n, 'p_value'] = p_value
    return corr

In [37]:
list_of_n = [1, 2, 3, 5, 6, 7, 10, 15, 20, 30, 40, 50, 100, 200, 500, 1000]

In [38]:
data = pd.read_csv('all_data.tsv', sep='\t')

In [39]:
data.Protein.unique()

array(['UBC9_HUMAN', 'YAP1_HUMAN', 'PABP_YEAST', 'UBE4B_MOUSE',
       'BRCA1_HUMAN_db6', 'BRCA1_HUMAN_db8', 'SPG1_STRSG'], dtype=object)

# UBC9

In [40]:
ubc9 = data[data['Protein'] == 'UBC9_HUMAN']

In [41]:
ubc9 = get_pos_diff(ubc9)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pos_diff'] = pos_diff


In [42]:
spearmanr(ubc9["Score.Delta"], ubc9["Score.Delta.Esm1b"])

SignificanceResult(statistic=-0.040485150072776824, pvalue=0.052114732475482546)

In [43]:
ubc9_corr = get_corr(ubc9, list_of_n)
ubc9_corr

Unnamed: 0,count,spearmanr,p_value
1,12,0.014135,0.965224
2,34,0.301691,0.082917
3,54,0.293042,0.031517
5,96,0.141549,0.168927
6,116,0.151266,0.105042
7,136,0.045189,0.601394
10,234,-0.027807,0.672169
15,378,-0.046557,0.366709
20,476,-0.01557,0.734736
30,764,-0.023288,0.520394


# YAP1

In [44]:
yap1 = data[data['Protein'] == 'YAP1_HUMAN']

In [45]:
yap1.shape

(38720, 16)

In [46]:
yap1 = get_pos_diff(yap1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pos_diff'] = pos_diff


In [47]:
yap1_corr = get_corr(yap1, list_of_n)
yap1_corr

Unnamed: 0,count,spearmanr,p_value
1,2258,0.146644,0.0
2,4458,0.147983,0.0
3,6552,0.120417,0.0
5,10542,0.119323,0.0
6,12452,0.109549,0.0
7,14316,0.091174,0.0
10,19554,0.086669,0.0
15,26758,0.076697,0.0
20,32268,0.082633,0.0
30,38244,0.087428,0.0


# PABP

In [48]:
pabp = data[data['Protein'] == 'PABP_YEAST']

In [49]:
pabp = get_pos_diff(pabp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pos_diff'] = pos_diff


In [50]:
pabp_corr = get_corr(pabp, list_of_n)
pabp_corr

Unnamed: 0,count,spearmanr,p_value
1,5854,0.032411,0.013141
2,11422,0.067858,0.0
3,16830,0.071734,0.0
5,27026,0.07932,0.0
6,31696,0.073312,0.0
7,36150,0.067922,0.0
10,47920,0.055634,0.0
15,62574,0.055593,0.0
20,70828,0.053376,0.0
30,73022,0.049232,0.0


# UBE4B

In [51]:
ube4b = data[data['Protein'] == 'UBE4B_MOUSE']

In [52]:
ube4b = get_pos_diff(ube4b)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pos_diff'] = pos_diff


In [53]:
ube4b_corr = get_corr(ube4b, list_of_n)
ube4b_corr

Unnamed: 0,count,spearmanr,p_value
1,2230,0.025844,0.22248
2,4334,0.040332,0.00792
3,6452,0.049497,7e-05
5,10648,0.040569,2.8e-05
6,12656,0.041314,3e-06
7,14654,0.043254,0.0
10,20266,0.037793,0.0
15,28836,0.028761,1e-06
20,36574,0.025094,2e-06
30,49096,0.014661,0.00116


# BRCA1_db6

In [54]:
brca1_6 = data[data['Protein'] == 'BRCA1_HUMAN_db6']

In [55]:
brca1_6 = get_pos_diff(brca1_6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pos_diff'] = pos_diff


In [56]:
brca1_6_corr = get_corr(brca1_6, list_of_n)
brca1_6_corr

Unnamed: 0,count,spearmanr,p_value
1,170,0.09137,0.236012
2,306,0.07611,0.184226
3,418,0.128158,0.008711
5,638,0.084782,0.032264
6,746,0.090394,0.013517
7,842,0.1077,0.00175
10,1152,0.077905,0.008161
15,1350,0.080003,0.003266
20,1462,0.088747,0.000681
30,1670,0.078992,0.001235


# BRCA1_db8

In [57]:
brca1_8 = data[data['Protein'] == 'BRCA1_HUMAN_db8']

In [58]:
brca1_8 = get_pos_diff(brca1_8)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pos_diff'] = pos_diff


In [59]:
brca1_8_corr = get_corr(brca1_8, list_of_n)
brca1_8_corr

Unnamed: 0,count,spearmanr,p_value
1,328,-0.08114,0.142563
2,564,-0.02883,0.494419
3,792,-0.011945,0.737132
5,1240,-0.012688,0.655346
6,1458,-0.018302,0.484986
7,1632,-0.006958,0.778794
10,2228,-0.01715,0.418461
15,2646,-0.012143,0.532382
20,2860,-0.018197,0.330651
30,3290,-0.018172,0.297398


# SPG1

In [60]:
spg1 = data[data['Protein'] == 'SPG1_STRSG']

In [61]:
spg1 = get_pos_diff(spg1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pos_diff'] = pos_diff


In [62]:
spg1_corr = get_corr(spg1, list_of_n)
spg1_corr

Unnamed: 0,count,spearmanr,p_value
1,38266,0.101934,0.0
2,75808,0.060746,0.0
3,112628,0.069043,0.0
5,184104,0.078392,0.0
6,218760,0.079182,0.0
7,252678,0.069402,0.0
10,350140,0.057104,0.0
15,497990,0.04439,0.0
20,627836,0.044039,0.0
30,833466,0.023979,0.0
