In [1]:
import os
import sys
sys.path.append('..')  
import random
import pickle

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from scipy.stats import hypergeom

In [4]:
from scregat.data_process import prepare_model_input, sum_counts, plot_edge, ATACGraphDataset
from run_scReGAT import *

In [5]:
# Load the pickled graph dataset from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only run this cell against a file from a trusted source.
with open('../data/dataset_atac_core_MFG.pkl', mode='rb') as graph_file:
    dataset_atac = pickle.load(graph_file)

In [6]:
from tqdm import tqdm

# Keep only the entries of array_peak whose name starts with 'chr'.
peaks_list = [name for name in dataset_atac.array_peak if name.startswith('chr')]

# Flatten every graph's node-feature tensor `x` to 1-D and stack the results,
# giving one row per graph in list_graph.
data_mt = np.stack([
    torch.flatten(graph.x).numpy()
    for graph in tqdm(dataset_atac.list_graph, desc="Processing Graphs")
])

Processing Graphs: 100%|██████████| 2130/2130 [00:00<00:00, 103126.72it/s]


In [7]:
def cosine_similarity_matrix(matrix):
    """Return the pairwise cosine similarity between the columns of ``matrix``.

    Each column is scaled to unit Euclidean length and the Gram matrix of the
    scaled columns is returned, so entry ``[a, b]`` is the cosine of the angle
    between column ``a`` and column ``b``.

    Parameters
    ----------
    matrix : array-like
        2-D input of shape ``(n_rows, n_cols)``; similarities are computed
        between columns.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_cols, n_cols)``. A column that is entirely zero
        has no direction, so its similarity to every column (itself included)
        is reported as 0 rather than NaN.

    Notes
    -----
    The result is dense and square in the number of columns, so memory use
    grows quadratically with ``n_cols``.
    """
    norm_matrix = np.linalg.norm(matrix, axis=0)
    # Dividing an all-zero column by its zero norm would give 0/0 = NaN and a
    # RuntimeWarning; substituting 1 leaves the column at zero, which yields a
    # similarity of 0 after the dot product.
    safe_norm = np.where(norm_matrix == 0, np.ones_like(norm_matrix), norm_matrix)
    normalized_matrix = matrix / safe_norm
    similarity_matrix = np.dot(normalized_matrix.T, normalized_matrix)
    return similarity_matrix

In [8]:
# Cosine similarity between every pair of columns of data_mt; the result is a
# dense square matrix with one row/column per column of data_mt.
similarity_matrix = cosine_similarity_matrix(data_mt)

In [9]:
# Length of the first graph's `y_exp` along axis 0. Later used to restrict the
# similarity search to the first `top_samples` columns of similarity_matrix.
# NOTE(review): presumably the number of expression targets occupying the
# leading node indices — confirm against the dataset builder.
top_samples = dataset_atac.list_graph[0].y_exp.shape[0]
top_samples

1348

In [10]:
# Edge table for the dataset; later cells read its `peak` column and its index.
# NOTE(review): get_edge_info is not imported by name in this notebook —
# presumably it arrives via `from run_scReGAT import *`; confirm.
edge_info = get_edge_info(dataset_atac)

In [11]:
# Number of rows of edge_info for each distinct value of its `peak` column.
peak_count = edge_info.peak.value_counts()

In [12]:
# Read the edge-selection table, using the file's first column as the index.
# The first remaining column is used later both to filter edge_info rows by
# index label and to pick columns of raw_edge.
# NOTE(review): this assumes those values are positional edge indices shared
# by edge_info and raw_edge — confirm.
edge_index = pd.read_csv("../data/Edge_index_with_Specifici_Hi-C.txt",index_col=0)
edge_index.shape

(10698, 1)

In [13]:
# Edge index of the first graph only, used below as the source of the edges
# that are kept.
# NOTE(review): assumes every graph in list_graph shares this edge_index — confirm.
raw_edge = dataset_atac.list_graph[0].edge_index

In [14]:
# Rows of edge_info whose index label is NOT listed in the first column of
# edge_index, counted per value of the `peak` column.
listed_edge_ids = edge_index.iloc[:, 0].values
is_unlisted_edge = ~edge_info.index.isin(listed_edge_ids)
edge_peak_count = edge_info.loc[is_unlisted_edge].peak.value_counts()

In [15]:
# Wrap array_peak in a frame so its default integer index gives each entry's
# position, then keep the positions whose name appears in edge_peak_count.
df_index = pd.DataFrame(dataset_atac.array_peak)
has_counted_peak = df_index.iloc[:, 0].isin(edge_peak_count.index.values)
peak_index = df_index.loc[has_counted_peak].index.values

In [16]:
# Accumulates (similar_column_index, peak_index) tuples.
pairs = []

# Names of the selected peaks, in the same order as peak_index.
peak_names = dataset_atac.array_peak[peak_index].tolist()

# Walk positions and names together, with a tqdm progress bar.
peak_iter = tqdm(
    zip(peak_index, peak_names),
    total=len(peak_index),
    desc="Building similarity pairs",
)
for peak_pos, peak_name in peak_iter:
    # How many edges to create for this peak.
    n_edges = edge_peak_count.loc[peak_name]

    # Similarity of this peak to the first `top_samples` columns, ranked from
    # most to least similar via a full descending argsort.
    sims_to_candidates = similarity_matrix[peak_pos, :top_samples]
    ranked_candidates = np.argsort(-sims_to_candidates)

    # Pair the peak with its n_edges most similar candidates.
    pairs.extend((candidate, peak_pos) for candidate in ranked_candidates[:n_edges])


Building similarity pairs: 100%|██████████| 50976/50976 [00:05<00:00, 9818.30it/s] 


In [17]:
# Split the (candidate, peak) tuples into two parallel lists:
# index1 holds the peak positions, index2 the candidate positions.
index1, index2 = [], []
for candidate_pos, peak_pos in pairs:
    index1.append(peak_pos)
    index2.append(candidate_pos)

# Similarity-derived edges as a 2 x n_pairs tensor (row 0: peaks, row 1: candidates).
similarity_edges = torch.tensor([index1, index2])

# Original edges whose column position is listed in edge_index's first column.
kept_raw_edges = raw_edge[:, edge_index.iloc[:, 0].values]

# Final edge set: kept original edges followed by the similarity edges.
new_edge = torch.cat([kept_raw_edges, similarity_edges], dim=1)

In [18]:
# Point every graph at the rebuilt edge set. All graphs reference the same
# new_edge tensor object; no per-graph copy is made.
for graph in dataset_atac.list_graph:
    graph.edge_index = new_edge

In [19]:
# Persist the dataset (now carrying the rebuilt edge_index on every graph)
# to a new pickle next to the input data.
file_atac_test = os.path.join('../data/', 'dataset_atac_cosine_MFG.pkl')
with open(file_atac_test, 'wb') as w_pkl:
    # Stream the pickle straight into the file handle. pickle.dumps would
    # first build the entire serialized dataset as one bytes object in memory.
    pickle.dump(dataset_atac, w_pkl)