In [None]:
import pandas as pd
from node2vec import Node2Vec
import networkx as nx

def Runnode2vec(filepath):
    """Embed a weighted, undirected edge list with an ``n2.SparseOTF`` node2vec model.

    NOTE(review): ``n2`` is never imported in the visible notebook, so this
    version raises ``NameError`` when called. The ``SparseOTF`` / ``read_edg`` /
    ``embed`` calls look like PecanPy's API (presumably
    ``from pecanpy import pecanpy as n2``) -- confirm and add the import.
    NOTE(review): a later cell defines ``Runnode2vec`` again, so on a
    top-to-bottom run that definition replaces this one.

    Parameters
    ----------
    filepath : str
        Tab-separated edge table with a header row and a ``combined_score``
        column.

    Returns
    -------
    pandas.DataFrame
        One row per node: an ``ensembl`` column holding the node ID, followed
        by ``network_<i>`` embedding columns.

    Raises
    ------
    AttributeError
        If the model object exposes no ``nodes`` attribute.

    Notes
    -----
    Writes the rescaled edge table to ``weighted_edges.txt`` in the working
    directory as a side effect.
    """
    n2v = n2.SparseOTF(p=1, q=0.5, workers=4, verbose=True)

    df = pd.read_csv(filepath, sep="\t")

    # Rescale the score column; assumes a 0-1000 scale -- TODO confirm.
    df["combined_score"] /= 1000

    # Written tab-separated because read_edg is called without a delimiter and
    # PecanPy splits edge lines on tabs; a space-separated file is not parsed.
    df.to_csv("weighted_edges.txt", sep="\t", header=False, index=False)

    # read_edg loads the graph into the model object itself; its return value
    # is not needed.
    n2v.read_edg("weighted_edges.txt", weighted=True, directed=False)

    emd = n2v.embed(dim=128, num_walks=15, walk_length=60, window_size=10, epochs=15)

    # Node IDs are needed to label the embedding rows.
    if not hasattr(n2v, "nodes"):
        raise AttributeError("SparseOTF object does not have a method or attribute to access node IDs.")

    n2v_emd = pd.DataFrame(emd, index=n2v.nodes)
    n2v_emd.columns = ['network_' + str(col) for col in n2v_emd.columns]

    return n2v_emd.reset_index().rename(columns={"index": "ensembl"})



In [None]:

import pandas as pd
from node2vec import Node2Vec
import networkx as nx

def Runnode2vec(filepath, dimensions=128, walk_length=60, num_walks=15,
                p=1, q=0.5, workers=4, window=10):
    """Compute node2vec embeddings for a tab-separated, header-less edge list.

    Parameters
    ----------
    filepath : str
        Edge file with columns ``source``, ``target`` and, optionally,
        ``weight``. When the third column is absent (two-column file) every
        edge gets weight 1.0, i.e. the graph is treated as unweighted.
    dimensions, walk_length, num_walks, p, q, workers :
        Passed to ``Node2Vec``; defaults match the previously hard-coded values.
    window : int
        Passed to ``Node2Vec.fit`` as ``window``.

    Returns
    -------
    pandas.DataFrame
        One row per graph node: an ``ensembl`` column holding the node ID,
        followed by ``network_0`` ... ``network_<dimensions-1>``.

    Raises
    ------
    ValueError
        If only some edges lack a weight; the random walks cannot use NaN
        weights, so this is reported up front.
    """
    df = pd.read_csv(filepath, sep="\t", header=None, names=["source", "target", "weight"])

    missing_weight = df["weight"].isna()
    if missing_weight.all():
        # Two-column edge file: pandas fills the unnamed third column with NaN.
        # NaN weights would turn the transition probabilities into NaN, so fall
        # back to an unweighted graph.
        df["weight"] = 1.0
    elif missing_weight.any():
        raise ValueError(
            f"{int(missing_weight.sum())} edge(s) in {filepath!r} have no weight"
        )
    else:
        # Rescale the score; assumes a 0-1000 scale -- TODO confirm.
        df["weight"] = df["weight"] / 1000

    # Undirected graph carrying the weight as an edge attribute.
    G = nx.from_pandas_edgelist(df, "source", "target", ["weight"], create_using=nx.Graph())

    node2vec = Node2Vec(
        G,
        dimensions=dimensions,
        walk_length=walk_length,
        num_walks=num_walks,
        workers=workers,
        p=p,
        q=q,
    )

    model = node2vec.fit(window=window, min_count=1, batch_words=4)

    # The model's vocabulary is keyed by str(node).
    nodes = list(G.nodes())
    emd = pd.DataFrame([model.wv[str(node)] for node in nodes], index=nodes)
    emd.columns = [f'network_{i}' for i in range(emd.shape[1])]

    # Move the node IDs from the index into an "ensembl" column.
    return emd.reset_index().rename(columns={"index": "ensembl"})

In [None]:
import pandas as pd

def ImportDGN(gda_path="Liver_Summary_GDA_ALL.csv", dictionary_path="gda_dictionary.csv",
              score_threshold=0.02, ei_threshold=0.7):
    """Load gene-disease associations and return the genes passing both thresholds.

    Parameters
    ----------
    gda_path : str
        CSV with at least ``gene``, ``evidenceIndexGDA`` and ``scoreGDA`` columns.
    dictionary_path : str
        CSV joined on ``gene``; must provide an ``ensembl`` column.
    score_threshold : float
        Rows are kept when ``scoreGDA >= score_threshold``.
    ei_threshold : float
        Rows are kept when ``evidenceIndexGDA > ei_threshold`` (strict).

    Returns
    -------
    pandas.DataFrame
        Columns ``ensembl`` and ``gda_score``; ``gda_score`` is the constant 1
        for every returned row (the original score is not carried through).
    """
    dgn = pd.read_csv(gda_path)
    dgn_dict = pd.read_csv(dictionary_path, index_col=None)

    # NaN compares False on both sides, so rows with a missing score or
    # evidence index are excluded.
    passes = (dgn['scoreGDA'] >= score_threshold) & (dgn['evidenceIndexGDA'] > ei_threshold)

    # Only the gene key is needed for the join; the merge returns a fresh
    # frame, so the label column is added without touching a filtered slice.
    selected = dgn.loc[passes, ['gene']].merge(dgn_dict, on="gene")
    selected['gda_score'] = 1

    return selected[['ensembl', 'gda_score']]

In [None]:
import pandas as pd
import re

def parse_fraction(cell):
    """Parse a cell such as ``"12 / 371 (3.23%)"`` into the ratio of its first two numbers.

    Numbers may be plain (``371``), decimal (``3.23``) or use comma thousands
    separators (``1,234``). Only the first two numbers found are used; anything
    after them (e.g. a percentage) is ignored.

    Parameters
    ----------
    cell : object
        Raw cell value; it is converted with ``str`` before parsing.

    Returns
    -------
    float or None
        ``numerator / denominator``, or ``None`` when the cell is missing,
        holds fewer than two numbers, or the denominator is zero.
    """
    if pd.isna(cell):
        return None

    # First alternative: comma-grouped thousands ("1,234", "12,345,678"); the
    # lookahead rejects groups followed by another digit so "1,2345" is still
    # read as two numbers. Second alternative: plain integer or decimal.
    nums = re.findall(r'\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?', str(cell))

    if len(nums) < 2:
        return None

    num = float(nums[0].replace(',', ''))
    den = float(nums[1].replace(',', ''))

    return num / den if den != 0 else None


def ImportGDC(file_path):
    """Read a GDC gene table, convert its fraction columns and tidy the column names.

    The four fraction-style columns are passed cell by cell through
    ``parse_fraction``; columns are then renamed and the descriptive columns
    that are present are dropped.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The converted table, with the gene identifier column named ``ensembl``.
    """
    # Source column -> output name, for the columns holding "a / b" style text.
    fraction_names = {
        'SSM Affected Cases in Cohort': 'nih_ssm_in_cohort',
        'SSM Affected Cases Across the GDC': 'nih_ssm_across_gdc',
        'CNV_Gain': 'nih_cnv_gain',
        'CNV_Loss': 'nih_cnv_loss',
    }
    other_names = {
        'Gene_ID': 'ensembl',
        'Mutations': '# Mutations',
    }

    table = pd.read_csv(file_path)

    for source_name in fraction_names:
        table[source_name] = table[source_name].apply(parse_fraction)

    table = table.rename(columns={**fraction_names, **other_names})

    # Drop only those descriptive columns that actually exist in this export.
    descriptive = ['Symbol', 'Name', 'Cytoband', 'Type', 'Annotations']
    present = [name for name in descriptive if name in table.columns]

    return table.drop(columns=present)

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD

def _term_indicator(annotations, row_index):
    """Build a 0/1 row-by-term DataFrame from columns holding lists of term strings."""
    vocabulary = {}  # term -> column position, in order of first appearance
    rows, cols = [], []
    for i, term_lists in enumerate(annotations.itertuples(index=False, name=None)):
        for terms in term_lists:
            for term in terms:
                if term:  # skip the '' left behind by missing annotations
                    rows.append(i)
                    cols.append(vocabulary.setdefault(term, len(vocabulary)))

    indicator = np.zeros((len(annotations), len(vocabulary)), dtype=np.int64)
    if rows:
        indicator[rows, cols] = 1

    return pd.DataFrame(indicator, index=row_index, columns=list(vocabulary))


def ImportHPA(path=r"hpa_gene_features.tsv", n_components=200, random_state=42):
    """Load HPA gene features: SVD-reduced annotation terms plus z-scored expression.

    Parameters
    ----------
    path : str
        Tab-separated HPA export. Must contain ``Gene``, ``Ensembl``, the five
        annotation columns and the four ``[NX]`` expression columns listed below.
    n_components : int
        Number of TruncatedSVD components kept from the annotation indicator
        matrix.
    random_state : int or None
        Seed for TruncatedSVD so reruns give the same embedding; pass ``None``
        for an unseeded run.

    Returns
    -------
    pandas.DataFrame
        ``ensembl``, then ``hpa_0`` ... ``hpa_<n_components-1>``, then the four
        ``nx_*`` expression columns (standardised to zero mean / unit std).
    """
    hpa = pd.read_csv(path, sep='\t').drop_duplicates(subset='Gene')

    identifiers = [
        "Gene",
        "Ensembl"
    ]
    discrete_features = [
        "Protein class",
        "Biological process",
        "Molecular function",
        "Disease involvement",
        "Subcellular location",
    ]
    continuous_features = [
        "Tissue RNA - liver [NX]",
        "Single Cell Type RNA - Mucus-secreting cells [NX]",
        "Single Cell Type RNA - Ito cells [NX]",
        "Single Cell Type RNA - Kupffer cells [NX]"
    ]

    # .copy() so the normalisation below writes to an independent frame rather
    # than to a slice of `hpa`.
    hpa_features = hpa.loc[:, hpa.columns.isin(identifiers + discrete_features + continuous_features)].copy()

    # Normalization (z-score per expression column)
    for col in continuous_features:
        hpa_features[col] = (hpa_features[col] - hpa_features[col].mean()) / hpa_features[col].std()

    # Each annotation cell is a comma-separated list; spaces are stripped so
    # "A, B" and "A,B" give the same terms. Missing cells become [''].
    annotations = hpa[discrete_features].fillna('')
    for ft in discrete_features:
        annotations[ft] = annotations[ft].apply(lambda x: x.replace(' ', '').split(','))

    # One column per distinct term, filled directly in an array. Writing into
    # the rows yielded by DataFrame.iterrows() is not guaranteed to reach the
    # frame, and labels repeated across categories produced duplicate columns.
    RowFeatures = _term_indicator(annotations, hpa['Ensembl'].fillna(''))

    # Truncated SVD
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svdModel = svd.fit(RowFeatures)
    visits_emb = svdModel.transform(RowFeatures)
    hpa_reduced = pd.DataFrame(data=visits_emb, index=RowFeatures.index).reset_index(names="Ensembl")

    continuous_data = hpa_features[['Ensembl'] + continuous_features].drop_duplicates()
    hpa_final = pd.merge(hpa_reduced, continuous_data, on='Ensembl', how='left')

    # Prefix everything with "hpa_", then give the identifier and expression
    # columns their final names.
    hpa_final.columns = ['hpa_' + str(col) for col in hpa_final.columns]
    hpa_final = hpa_final.rename({
        'hpa_Ensembl': 'ensembl',
        'hpa_Tissue RNA - liver [NX]': 'nx_tissue_rna_liver',
        'hpa_Single Cell Type RNA - Mucus-secreting cells [NX]': 'nx_single_cell_type_mucus_secreting_cells',
        'hpa_Single Cell Type RNA - Ito cells [NX]': 'nx_single_cell_type_ito_cells',
        'hpa_Single Cell Type RNA - Kupffer cells [NX]': 'nx_single_cell_type_kupffer_cells'
    }, axis=1)

    return hpa_final


In [None]:
import pandas as pd

def remove_redun(el, verbose=False):
    """Collapse an edge list so each undirected edge appears exactly once.

    The first two columns of ``el`` are treated as the edge endpoints. Each row
    is put in canonical orientation (smaller endpoint first) and repeated
    pairs are dropped, keeping the first occurrence. An edge that is listed
    only as ``(B, A)`` is therefore kept as ``(A, B)`` instead of being lost.

    Parameters
    ----------
    el : pandas.DataFrame
        Edge list; columns 0 and 1 are the endpoints, any further columns are
        edge attributes carried through unchanged. Not modified.
    verbose : bool
        Print the row counts before and after de-duplication.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``el``, one row per unordered endpoint pair, with a
        fresh ``RangeIndex``.
    """
    if verbose:
        print("Original Size: ", len(el))

    first_col, second_col = el.columns[0], el.columns[1]
    left, right = el.iloc[:, 0], el.iloc[:, 1]
    left_vals, right_vals = left.to_numpy(), right.to_numpy()

    # True where the row is stored in reverse (larger endpoint first).
    swap = (left_vals > right_vals).astype(bool)

    el_new = el.copy()
    el_new[first_col] = left.where(~swap, right_vals)
    el_new[second_col] = right.where(~swap, left_vals)

    el_new = el_new.drop_duplicates(subset=[first_col, second_col]).reset_index(drop=True)
    if verbose:
        print("After Dropping Duplicates: ", len(el_new), "(-", len(el) - len(el_new), ")")
        print()

    return el_new

def map_IDs(el, gmap, verbose = False, dropNaNvalues = True):
    """Translate the two endpoint columns of an edge list through an alias table.

    Parameters
    ----------
    el : pandas.DataFrame
        Edge list; columns 0 and 1 hold the IDs to translate. Not modified.
    gmap : pandas.DataFrame
        Mapping table with ``#string_protein_id`` (key) and ``alias`` (value)
        columns. When a key occurs more than once its first alias is used.
    verbose : bool
        Print the number of unmapped IDs per column and the resulting size.
    dropNaNvalues : bool
        Drop edges for which either endpoint has no alias.

    Returns
    -------
    pandas.DataFrame
        Copy of ``el`` (index reset) with both endpoint columns replaced by
        their aliases; unmapped IDs are NaN unless dropped.
    """
    gp_map_f = gmap.set_index('#string_protein_id')['alias']
    # Series.map needs a uniquely indexed mapper; keep the first alias per ID.
    gp_map_f = gp_map_f[~gp_map_f.index.duplicated(keep='first')]

    el_converted = el.reset_index(drop=True)
    id_cols = list(el_converted.columns[:2])

    for col in id_cols:
        el_converted[col] = el_converted[col].map(gp_map_f)

    if verbose:
        print("NaN values per Column:", el_converted[id_cols[0]].isna().sum(), el_converted[id_cols[1]].isna().sum())

    if dropNaNvalues:
        # Restrict to the ID columns so an edge with a missing attribute value
        # is not discarded along with the unmapped ones.
        el_converted = el_converted.dropna(subset=id_cols)
        if verbose:
            print("New edge list size:", len(el_converted), "( -", len(el)-len(el_converted), ")")

    return el_converted

def ImportSTRING():
    """Load the STRING links file and return it with IDs mapped through the alias file.

    Reads the alias table (tab-separated) and the links table
    (space-separated), keeps only alias rows whose ``source`` is
    ``Ensembl_gene``, then passes the links through ``remove_redun`` and
    ``map_IDs`` (both in verbose mode).

    Returns
    -------
    pandas.DataFrame
        The edge list returned by ``map_IDs``.
    """
    alias_table = pd.read_csv(r"9606.protein.aliases.v12.0.txt", sep="\t")
    links = pd.read_csv(r"9606.protein.links.v12.0_sc.txt", sep=" ")

    # Only the Ensembl gene aliases are used for the ID translation.
    gene_aliases = alias_table.loc[alias_table.source == 'Ensembl_gene']

    unique_links = remove_redun(links, True)
    return map_IDs(unique_links, gene_aliases, verbose=True)

In [None]:
# Load the GDC table from "cnv_data.csv" via ImportGDC and preview the first rows.
gdc = ImportGDC("cnv_data.csv")
gdc.head()

In [None]:
# Build the HPA feature table with ImportHPA's defaults and display it.
hpa = ImportHPA()
hpa

In [None]:
# Load the STRING edge list (ImportSTRING prints its progress) and display it.
el = ImportSTRING()
el

In [None]:
# Node table: inner join (pandas' default `how`) of the HPA and GDC tables on
# "ensembl", so only genes present in both are kept.
master = hpa.merge(gdc, on="ensembl")
master

In [None]:
# Every gene that appears as an endpoint of at least one edge.
el_allgenes = pd.concat([el['protein1'], el['protein2']]).drop_duplicates()
# Restrict the node table to genes that occur in the edge list.
master = master.loc[master['ensembl'].isin(el_allgenes)]
master

In [None]:
# Keep only edges whose first endpoint is a gene in `master`: inner merge of the
# first three edge-list columns against master["ensembl"], then drop the helper column.
el_intersect = (
    el.iloc[:, :3] 
    .merge(master["ensembl"], right_on="ensembl", left_on='protein1')
    .drop("ensembl", axis=1)
)
# Same filter for the second endpoint; the endpoint columns are then renamed
# to gene1 / gene2.
el_intersect = (
    el_intersect
    .merge(master["ensembl"], right_on="ensembl", left_on='protein2')
    .drop("ensembl", axis=1)
    .rename(columns={'protein1': 'gene1', 'protein2': 'gene2'})
)

# Join the filtered pairs back to the full edge list and drop the duplicated key
# columns. Non-key columns present on both sides get pandas' default _x / _y
# suffixes (a later cell reads `combined_score_x`).
# NOTE(review): this rebinds `el` to a frame without protein1/protein2, so the
# cell presumably cannot be re-run without first re-running ImportSTRING -- confirm.
el = el_intersect.merge(el, right_on=['protein1', 'protein2'], left_on=['gene1', 'gene2']).drop(['protein1', 'protein2'], axis=1)
el

In [None]:
# Edge file later read by Runnode2vec("hcc_edge_list_latest.edg"), which parses
# three tab-separated, header-less columns (source, target, weight). The score
# column is written as the third field so the weights are not read back as NaN.
el[['gene1', 'gene2', 'combined_score_x']].to_csv('hcc_edge_list_latest.edg', index=False, header=False, sep='\t')
# Full edge table (all columns, with header) for reference.
el.to_csv('hcc_edge_list_features_latest.edg', index=False)

In [None]:
import pandas as pd
import numpy as np

# Integer-encoded copy of the edge list for numpy consumers.
# The gene IDs are strings matched against master["ensembl"], so coercing them
# with pd.to_numeric turns every ID into NaN and the following dropna removes
# all rows. Instead, each distinct gene gets an integer code, and the work is
# done on separate objects so `el` itself is left untouched.
edge_weights = pd.to_numeric(el['combined_score_x'], errors='coerce')
has_all_fields = edge_weights.notna() & el['gene1'].notna() & el['gene2'].notna()
edges_valid = el.loc[has_all_fields, ['gene1', 'gene2']]

# Factorize both endpoint columns together so a gene has the same code whether
# it appears as gene1 or gene2; gene_index[code] gives back the gene ID.
gene_codes, gene_index = pd.factorize(
    pd.concat([edges_valid['gene1'], edges_valid['gene2']], ignore_index=True)
)
n_edges = len(edges_valid)

# Columns: code of gene1, code of gene2, weight (cast to int64 as before).
genes_array = np.column_stack(
    [gene_codes[:n_edges], gene_codes[n_edges:], edge_weights[has_all_fields].to_numpy()]
).astype(np.int64)

np.save('hcc_edge_list_latest.npy', genes_array)

print("Numpy array saved with weights!")

In [None]:
# Load the gene-disease associations and keep only genes present in the node table.
dgn = ImportDGN()
dgn = dgn.loc[dgn['ensembl'].isin(master['ensembl'])]
dgn

In [None]:
# Compute node embeddings from the edge file written by an earlier cell.
network = Runnode2vec("hcc_edge_list_latest.edg")
# Attach the embedding columns to the node table (inner join on "ensembl").
# NOTE(review): `master` is rebound here, so re-running this cell merges the
# embeddings a second time -- re-run from the cell that builds `master` instead.
master = master.merge(network, on='ensembl')
master.to_csv("node_node2vec_data_latest.csv", index=None)
master

In [None]:
# Label column: 1 for genes listed in `dgn`, NaN for every other gene.
master["gda_score"] = np.where(master["ensembl"].isin(dgn["ensembl"]), 1, np.nan)

In [None]:
# Sanity check: how many rows carry the value 1 in the final column of `master`.
num_ones = master.iloc[:, -1].eq(1).sum()
print("Number of 1s in the last column:", num_ones)

In [None]:
# Write the final node table (features, embeddings and label) without the index column.
master.to_csv("200_HCC_node_network_embeddings_latest.csv", index=None)