In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from tqdm import tqdm

In [2]:
phasep_df = pd.read_pickle('./data/Label/human.condensate-formation.proteins-v1.pkl')
lt = set(phasep_df[phasep_df['source']=='literature']['uniprot_entry'])
dp = set(phasep_df[phasep_df['source']=='deepphase']['uniprot_entry'])
dp = dp - lt
pr = set(phasep_df[phasep_df['source']=='phasepred']['uniprot_entry'])
pr = pr - dp - lt

In [3]:
edges = pd.read_pickle('./saves/INTEGRATE_unweight_ProSE80d_mcc/edge_cosine/model_0.pkl')
edges = edges[['A', 'B', 'cosim']].copy()

In [4]:
edges.head()

Unnamed: 0,A,B,cosim
0,Q9H2S6,Q9NPE6,0.914216
1,Q9H2S6,Q9BXK5,0.909164
2,Q9H2S6,O60238,0.193015
3,Q9H2S6,P20138,0.62093
4,Q9H2S6,Q9UM44,0.010836


In [5]:
import networkx as nx
def dataframe2graph(interactome_df, mcc=True):
    G = nx.Graph()
    G.add_edges_from(interactome_df.to_numpy())
    if mcc:
        c = max(nx.connected_components(G), key=len)
        G_mcc = G.subgraph(c)
        return G_mcc
    else:
        return G

In [6]:
def get_adjmat(e, pset, fill_negEdges_val=-1):
    e.loc[(e['A'].isin(pset))&(e['B'].isin(pset)), 'type'] = 1
    e['type'] = e['type'].fillna(0).astype(int)
    nodes = np.sort(pd.concat([e['A'], e['B']]).drop_duplicates())
    graph = dataframe2graph(e[['A', 'B']])
    
    adj_mat = nx.adjacency_matrix(graph, nodelist=nodes).toarray()
    adj_mat_df = pd.DataFrame(adj_mat)
    adj_mat_df.columns, adj_mat_df.index = nodes, nodes
    
    for _, i in tqdm(e.iterrows()):
        a, b, ty = i['A'], i['B'], i['type']
        if ty==1:
            continue
        elif ty==0:
            adj_mat_df[a][b] = fill_negEdges_val
            adj_mat_df[b][a] = fill_negEdges_val
    
    return adj_mat_df

In [7]:
df1 = get_adjmat(edges, lt)

241464it [04:09, 966.70it/s] 


In [12]:
adjmat_csr1 = sp.sparse.csr_matrix(df1.to_numpy())

In [16]:
df2 = get_adjmat(edges, set(phasep_df['uniprot_entry']))

241464it [03:20, 1203.69it/s]


In [18]:
adjmat_csr2 = sp.sparse.csr_matrix(df2.to_numpy())

In [22]:
prose_df = pd.read_pickle('./data/NodeFeat/SeqEmb/seqemb_80d.pkl')

In [29]:
genes = set(pd.concat([edges['A'], edges['B']]).drop_duplicates())

In [36]:
prose_array = prose_df[prose_df['entry'].isin(genes)].sort_values(by='entry').set_index('entry').to_numpy()

In [47]:
prose_array_mat = np.asmatrix(prose_array)

In [55]:
# help(np.savez)

In [60]:
# save pickles
sp.sparse.save_npz('../230422_adjmat_lt.npz', adjmat_csr1)
sp.sparse.save_npz('../230422_adjmat_lt_pr_dp.npz', adjmat_csr2)
np.savez('../230422_nodefeat.npz', a=prose_array_mat)

In [62]:
sp.sparse.load_npz('../230422_adjmat_lt.npz')

<15939x15939 sparse matrix of type '<class 'numpy.int64'>'
	with 482928 stored elements in Compressed Sparse Row format>

In [65]:
np.load('../230422_nodefeat.npz')['a']

array([[-0.94118451,  1.60268738,  0.19945823, ...,  0.19491316,
        -0.43772279, -0.08016426],
       [-0.29100994, -2.06819021,  0.29064222, ..., -0.22262475,
        -0.29430415,  0.20296665],
       [-0.55109096, -0.29558413, -0.11974005, ..., -0.00221224,
         0.10755734, -0.07246632],
       ...,
       [-0.61434225, -0.21582841, -0.33786499, ...,  0.07044926,
        -0.0911999 , -0.02410985],
       [ 1.02569928,  1.77695801,  2.14784547, ...,  0.24515249,
        -0.07967637,  0.02876148],
       [ 0.14874894,  0.56108616,  1.09394704, ...,  0.09033816,
         0.10259608,  0.11066866]])