In [1]:
import torch
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import os

# All relative paths in this notebook ('./data', './saves/...') are resolved
# against the project root. The root defaults to the original author's checkout
# but can be redirected with the PSGAT_ROOT environment variable, so the
# notebook is not tied to a single machine.
PROJECT_ROOT = os.environ.get('PSGAT_ROOT', '/home/lyz/co-phase-separation/PSGAT/')
os.chdir(PROJECT_ROOT)
DATA_ROOT = './data'

In [3]:
# Run configuration: N is the number of per-model artifacts (model_0 .. model_{N-1})
# that the cells below iterate over; SAVE_ROOT is the directory holding them.
N = 10
SAVE_ROOT = './saves/INTEGRATE_ProSE80d_pos1neg1/'

# Load the saved predictions and the gene list, in that order.
preds, genes = (
    pd.read_pickle(f'{SAVE_ROOT}{stem}.pkl') for stem in ('preds', 'genes')
)

In [4]:
# Split the condensate-formation protein table into three disjoint sets of
# UniProt entries, by label source, with priority literature > phasepred > deepphase.
phasep_df = pd.read_pickle('./data/Label/human.condensate-formation.proteins-v1.pkl')

entries_by_source = {
    source: set(phasep_df.loc[phasep_df['source'] == source, 'uniprot_entry'])
    for source in ('literature', 'phasepred', 'deepphase')
}

lt = entries_by_source['literature']
# Entries already covered by a higher-priority source are removed.
pr = entries_by_source['phasepred'] - lt
dp = entries_by_source['deepphase'] - pr - lt

In [5]:
# Load the PPI edge list, attach each model's edge cosine similarity, average
# them into 'pred', and label each edge by whether both endpoints are
# literature-supported proteins.
ppi = 'integrate'

ppi_path = os.path.join(
    DATA_ROOT,
    f'PPIN/{ppi.upper()}.csv'
)
edges = pd.read_csv(ppi_path)
# Order-independent pair key: the two endpoint names sorted, then concatenated
# (no separator is inserted between them).
edges['C'] = np.sort(edges[['A', 'B']].values).sum(axis=1)

# One column per model, named by the integer model index.
model_cols = list(range(N))
for i in model_cols:
    model_edge_cosim = pd.read_pickle(f'{SAVE_ROOT}edge_cosine/model_{i}.pkl')
    # NOTE(review): if 'cosim' is a pandas Series it is aligned to `edges` on
    # index, not on position -- confirm the pickles share the CSV's row index.
    edges[i] = model_edge_cosim['cosim']

# Average only the model columns, selected by name. Selecting by position
# (everything from the 4th column on) would silently pick up any additional
# column present in the CSV.
edges['pred'] = edges[model_cols].mean(axis=1)

# 'ps.ppi': both endpoints are in the literature set `lt`; everything else 'nps.ppi'.
edges.loc[(edges['A'].isin(lt))&(edges['B'].isin(lt)), 'type'] = 'ps.ppi'
edges['type'] = edges['type'].fillna('nps.ppi')

# Calculation of the average cosine similarity for all pairs

In [6]:
# Build `cosim_tril_df`: one row per unordered gene pair (strict lower triangle
# of the gene x gene matrix) with one clamped cosine-similarity column per model.
#
# Columns, in order: 'idx0', 'idx1' (row/column positions in the matrix),
# 'cos.sim.0', 0 and 1 (gene names for idx0/idx1), 2 (order-independent pair
# key), then 'cos.sim.1' .. 'cos.sim.{N-1}'.

# Loop-invariant pieces: the gene labels and the lower-triangle positions are
# the same for every model, so they are computed once instead of per iteration.
gene_names = pd.Index(genes).to_numpy()
n_genes = len(gene_names)
idx0, idx1 = np.tril_indices(n_genes, -1)  # k=-1 excludes the diagonal (self-pairs)

for n in tqdm(range(N)):
    embed = torch.load(f'{SAVE_ROOT}embeddings/model_{n}.pt')
    cosim_matrix = cosine_similarity(embed)
    if cosim_matrix.shape != (n_genes, n_genes):
        raise ValueError(
            f'model_{n}: similarity matrix has shape {cosim_matrix.shape}, '
            f'expected {(n_genes, n_genes)} to match the gene list'
        )

    # Clamp to [0, 1]: negative similarities are floored at 0 and values above 1
    # are capped at 1. NaN entries, if any, are left unchanged.
    pair_cosim = np.clip(cosim_matrix[idx0, idx1], 0.0, 1.0)

    if n == 0:
        cosim_tril_df = pd.DataFrame({
            'idx0': idx0,
            'idx1': idx1,
            f'cos.sim.{n}': pair_cosim
        })
        cosim_tril_df[0] = gene_names[idx0]
        cosim_tril_df[1] = gene_names[idx1]
        # Pair key: the two gene names sorted, then concatenated (no separator).
        cosim_tril_df[2] = np.sort(cosim_tril_df[[0, 1]].values).sum(axis=1)
    else:
        cosim_tril_df[f'cos.sim.{n}'] = pair_cosim

100%|██████████| 10/10 [02:02<00:00, 12.25s/it]


In [7]:
# Keep the gene-name columns (0, 1), the pair key (2) and the per-model
# similarities, then add their row-wise mean across the N models.
cos_sim_cols = [f'cos.sim.{n}' for n in range(N)]
dat1 = cosim_tril_df[[0, 1, 2] + cos_sim_cols].copy()
dat1['mean.cos.sim'] = dat1[cos_sim_cols].mean(axis=1)

In [8]:
# Reduce to the pair identifiers plus the mean similarity, then rank every pair
# by that mean: 'pct.rank' is the percentile rank, 'rank' the plain rank.
dat2 = dat1.loc[:, [0, 1, 2, 'mean.cos.sim']].copy()
for rank_col, as_pct in (('pct.rank', True), ('rank', False)):
    dat2[rank_col] = dat2['mean.cos.sim'].rank(pct=as_pct)

In [9]:
# Persist the ranked pair list next to the other run artifacts.
co_phase_list_path = f'{SAVE_ROOT}05-co_phase_list.pkl'
dat2.to_pickle(co_phase_list_path)