In [None]:
import pandas as pd
from node2vec import Node2Vec
import networkx as nx

def Runnode2vec(filepath):
    """Embed the weighted edge list stored at ``filepath``.

    The tab-separated file is loaded with pandas, its ``combined_score``
    column is divided by 1000, and the frame is written out again as the
    space-separated, header-less file ``weighted_edges.txt``. That file is
    read back through the ``SparseOTF`` object's ``read_edg`` (weighted,
    undirected) and embedded with 128 dimensions.

    NOTE(review): ``n2`` is not bound by the imports visible next to this
    function (only ``Node2Vec`` and ``networkx`` are imported) -- confirm
    where it is defined before running on a fresh kernel.

    Parameters
    ----------
    filepath : str
        Tab-separated edge list with a header row containing
        ``combined_score``.

    Returns
    -------
    pandas.DataFrame
        An ``ensembl`` column holding the node IDs followed by the embedding
        columns ``network_<i>``.

    Raises
    ------
    AttributeError
        If the graph object exposes no ``nodes`` attribute.
    """
    n2v = n2.SparseOTF(p=1, q=0.5, workers=4, verbose=True)

    # Rescale the weights and hand them over via an intermediate file.
    scored_edges = pd.read_csv(filepath, sep="\t")
    scored_edges["combined_score"] = scored_edges["combined_score"] / 1000
    scored_edges.to_csv("weighted_edges.txt", sep=" ", header=False, index=False)

    # Load the graph, then run the walks and train the embedding.
    n2v.read_edg("weighted_edges.txt", weighted=True, directed=False)
    emd = n2v.embed(dim=128, num_walks=15, walk_length=60, window_size=10, epochs=15)

    # Diagnostic output
    print("Embedding generated. Type:", type(emd))
    print("Embedding content:", emd)
    print("SparseOTF attributes:", dir(n2v))

    # Without node IDs the embedding rows cannot be labelled.
    if not hasattr(n2v, "nodes"):
        raise AttributeError("SparseOTF object does not have a method or attribute to access node IDs.")

    labelled = pd.DataFrame(emd, index=n2v.nodes).add_prefix('network_')
    return labelled.reset_index().rename(columns={"index": "ensembl"})



In [None]:
import pandas as pd

def ImportDGN():
    """Load the gene-disease associations that pass both score cut-offs.

    Reads ``disease_gda_summary.tsv`` (tab-separated) and
    ``gda_dictionary.csv``, keeps rows with ``Score_gda >= 0.02`` and
    ``EI_gda > 0.7``, joins them to the dictionary on ``Gene`` and sets
    ``gda_score`` to 1 for every surviving row.

    Returns
    -------
    pandas.DataFrame
        Columns ``ensembl`` and ``gda_score`` (always 1).
    """
    score_threshold = 0.02
    ei_threshold = 0.7

    summary = pd.read_csv("disease_gda_summary.tsv", sep='\t')
    gene_lookup = pd.read_csv("gda_dictionary.csv", index_col=None)

    associations = summary[['Gene', 'EI_gda', 'Score_gda']]
    passes_cutoffs = (
        (associations['Score_gda'] >= score_threshold)
        & (associations['EI_gda'] > ei_threshold)
    )

    labelled = (
        associations.loc[passes_cutoffs]
        .rename(columns={'Score_gda': 'gda_score'})
        .merge(gene_lookup, on="Gene")
        .drop(columns=['Gene'])
    )
    # The numeric score is only used for filtering; the label itself is binary.
    labelled['gda_score'] = 1

    return labelled[['ensembl', 'gda_score']]

In [None]:
import pandas as pd
import re

def _gdc_case_fraction(counts):
    """Turn ``"<affected> / <total> ..."`` strings into ``affected / total``.

    Thousands separators (commas) are removed first. The denominator is the
    first numeric run of the third space-separated field, so trailing text
    after the total (for example a parenthesised percentage) is ignored
    rather than being concatenated onto the total.
    """
    parts = counts.replace({',': ''}, regex=True).str.split(' ', n=2, expand=True)
    affected = parts[0].astype(float)
    total = parts[2].str.extract(r'(\d+(?:\.\d+)?)', expand=False).astype(float)
    return affected / total


def ImportGDC(file_path):
    """Load a GDC gene table and convert its count columns to fractions.

    Parameters
    ----------
    file_path : str
        Path to the comma-separated GDC export.

    Returns
    -------
    pandas.DataFrame
        The input table without the ``Symbol``, ``Name``, ``Cytoband``,
        ``Type``, ``Annotations`` and ``Survival`` columns. The four
        ``"x / y"`` count columns are replaced by ``x / y`` as floats, and the
        columns are renamed to ``ensembl`` / ``nih_*``.

    Raises
    ------
    KeyError
        If an expected column is missing from the file.
    """
    gdc = pd.read_csv(file_path)

    # Every one of these columns uses the same "<affected> / <total>" layout.
    fraction_columns = [
        '# SSM Affected Cases in Cohort',
        '# CNV Gain',
        '# CNV Loss',
        '# SSM Affected Cases Across the GDC',
    ]
    for column in fraction_columns:
        gdc[column] = _gdc_case_fraction(gdc[column])

    gdc = gdc.drop(columns=['Symbol', 'Name', 'Cytoband', 'Type', 'Annotations', 'Survival'])

    return gdc.rename(columns={'# SSM Affected Cases in Cohort': 'nih_ssm_in_cohort',
                               '# SSM Affected Cases Across the GDC': 'nih_ssm_across_gdc',
                               '# CNV Gain': 'nih_cnv_gain',
                               '# CNV Loss': 'nih_cnv_loss',
                               'Gene ID': 'ensembl',
                               '#Mutations': 'nih_tot_mutations'})

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD

def ImportHPA(n_components=200, random_state=0):
    """Build the per-gene Human Protein Atlas feature table.

    Reads ``hpa_gene_features.tsv`` (tab-separated) and keeps one row per
    ``Gene``. The comma-separated annotation columns are one-hot encoded and
    that indicator matrix is compressed with truncated SVD. The z-scored
    expression columns are then joined back on ``Ensembl``.

    Parameters
    ----------
    n_components : int, default 200
        Number of SVD components to keep.
    random_state : int or None, default 0
        Seed forwarded to ``TruncatedSVD`` so repeated runs produce the same
        components. Pass ``None`` to leave the solver unseeded.

    Returns
    -------
    pandas.DataFrame
        ``ensembl``, the SVD columns ``hpa_0`` ... ``hpa_<n_components - 1>``
        and the four z-scored ``nx_*`` expression columns.
    """
    hpa = pd.read_csv(r"hpa_gene_features.tsv", sep='\t').drop_duplicates(subset='Gene')

    identifiers = [
        "Gene",
        "Ensembl"
    ]
    discrete_features = [
        "Protein class",
        "Biological process",
        "Molecular function",
        "Disease involvement",
        "Subcellular location",
    ]
    continuous_features = [
        "Tissue RNA - esophagus [NX]",
        "Single Cell Type RNA - Mucus-secreting cells [NX]",
        "Single Cell Type RNA - Basal glandular cells [NX]",
        "Single Cell Type RNA - Undifferentiated cells [NX]"
    ]

    # Explicit copy: the columns are overwritten below, and assigning into a
    # slice of `hpa` would trigger pandas' chained-assignment warning.
    hpa_features = hpa.loc[:, hpa.columns.isin(identifiers + discrete_features + continuous_features)].copy()

    # Normalization of continuous features (z-score per column)
    for col in continuous_features:
        hpa_features[col] = (hpa_features[col] - hpa_features[col].mean()) / hpa_features[col].std()

    def split_terms(feature):
        # "a, b,c" -> ["a", "b", "c"]; an empty cell becomes [""]
        return feature.apply(lambda x: x.replace(' ', '').split(','))

    hpa_clean = hpa.fillna('')
    for ft in discrete_features:
        hpa_clean[ft] = split_terms(hpa_clean[ft])

    # One column per distinct term of each annotation category, in category
    # order. A term that occurs in several categories gets several columns.
    GO_features = np.concatenate([hpa_clean[ft].explode().unique() for ft in discrete_features])

    column_positions = {}
    for position, term in enumerate(GO_features):
        column_positions.setdefault(term, []).append(position)

    # Fill the indicator matrix directly instead of writing into the rows
    # yielded by DataFrame.iterrows(): those rows may be copies, in which case
    # the writes never reach the frame and the matrix stays all zeros.
    indicator = np.zeros((len(hpa_clean), len(GO_features)), dtype=np.int64)
    per_gene_terms = zip(*(hpa_clean[ft] for ft in discrete_features))
    for row_number, term_lists in enumerate(per_gene_terms):
        for terms in term_lists:
            for term in terms:
                if term:  # skip the placeholder produced by empty cells
                    indicator[row_number, column_positions[term]] = 1

    RowFeatures = pd.DataFrame(indicator, index=hpa_clean['Ensembl'], columns=GO_features)

    # Truncated SVD (seeded so that re-running the notebook is reproducible)
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svdModel = svd.fit(RowFeatures)
    visits_emb = svdModel.transform(RowFeatures)
    hpa_reduced = pd.DataFrame(data=visits_emb, index=RowFeatures.index).reset_index(names="Ensembl")

    # Merging
    continuous_data = hpa_features[['Ensembl'] + continuous_features].drop_duplicates()
    hpa_final = pd.merge(hpa_reduced, continuous_data, on='Ensembl', how='left')

    # Rename columns: prefix everything, then give the known columns their final names
    hpa_final.columns = ['hpa_' + str(col) for col in hpa_final.columns]
    hpa_final = hpa_final.rename(columns={
        'hpa_Ensembl': 'ensembl',
        'hpa_Tissue RNA - esophagus [NX]': 'nx_tissue_rna_esophagus',
        'hpa_Single Cell Type RNA - Mucus-secreting cells [NX]': 'nx_single_cell_type_mucus_secreting_cells',
        'hpa_Single Cell Type RNA - Basal glandular cells [NX]': 'nx_single_cell_type_basal_glandular_cells',
        'hpa_Single Cell Type RNA - Undifferentiated cells [NX]': 'nx_single_cell_type_undifferentiated_cells'
    })

    return hpa_final


In [None]:
import pandas as pd

def remove_redun(el, verbose=False):
    """Drop the mirrored copy of every undirected edge.

    The first two columns of ``el`` are the edge endpoints. Each pair is put
    into ascending order and de-duplicated, and only the rows of ``el`` whose
    endpoints already appear in that ascending order are returned. An edge
    that is present solely in descending order is therefore dropped.

    The ordering is computed column-wise rather than with a per-row Python
    ``sorted`` call, which matters on edge lists with millions of rows and
    also lets an empty edge list pass through.

    Parameters
    ----------
    el : pandas.DataFrame
        Edge list; columns 0 and 1 hold the endpoints, further columns are
        carried along unchanged.
    verbose : bool, default False
        Print the row counts before and after each step.

    Returns
    -------
    pandas.DataFrame
        The retained rows of ``el`` with a fresh ``RangeIndex``.
    """
    if verbose:
        print("Original Size: ", len(el))

    # Positional copies of the endpoints so index labels play no role.
    first = el.iloc[:, 0].reset_index(drop=True)
    second = el.iloc[:, 1].reset_index(drop=True)
    swap = first > second

    # Column 0 gets the smaller endpoint, column 1 the larger one.
    el_new = pd.DataFrame({0: first.mask(swap, second), 1: second.mask(swap, first)})
    el_new = el_new.drop_duplicates()

    postDrop = len(el_new)
    if verbose:
        print("After Dropping Duplicates: ", len(el_new), "(-", len(el)-postDrop, ")")

    el_new = el_new.merge(el, left_on=[0, 1], right_on=[el.columns[0], el.columns[1]])
    if verbose:
        print("After Merging: ", len(el_new), "(-", postDrop-len(el_new), ")")
        print()

    # Drop the two helper key columns, keeping el's own columns.
    return el_new.iloc[:, 2:]

def map_IDs(el, gmap, verbose = False, dropNaNvalues = True):
    """Translate the endpoint IDs of an edge list through an alias table.

    Parameters
    ----------
    el : pandas.DataFrame
        Edge list whose first two columns hold the IDs to translate. It is
        not modified.
    gmap : pandas.DataFrame
        Alias table with the columns ``#string_protein_id`` and ``alias``.
        When an ID is listed more than once, its first alias is used.
    verbose : bool, default False
        Print how many endpoints could not be mapped and the resulting size.
    dropNaNvalues : bool, default True
        Drop every row that contains a missing value after mapping.

    Returns
    -------
    pandas.DataFrame
        Copy of ``el`` (index reset before mapping) with both endpoint
        columns replaced by their aliases.
    """
    gp_map_f = gmap.set_index('#string_protein_id')['alias']
    # Series.map raises on a lookup Series with a non-unique index, so keep a
    # single (the first) alias per protein ID.
    gp_map_f = gp_map_f[~gp_map_f.index.duplicated(keep='first')]

    el_converted = el.reset_index(drop=True)

    source_col, target_col = el_converted.columns[0], el_converted.columns[1]
    el_converted[source_col] = el_converted[source_col].map(gp_map_f)
    el_converted[target_col] = el_converted[target_col].map(gp_map_f)

    if verbose:
        print("NaN values per Column:", el_converted[source_col].isna().sum(), el_converted[target_col].isna().sum())

    if dropNaNvalues:
        el_converted = el_converted.dropna()
        if verbose:
            print("New edge list size:", len(el_converted), "( -", len(el)-len(el_converted), ")")

    return el_converted

def ImportSTRING():
    """Load the STRING links and return them with mapped endpoint IDs.

    Reads the alias table (tab-separated) and the link table
    (space-separated), keeps only alias rows whose ``source`` is
    ``Ensembl_gene``, removes mirrored edges with ``remove_redun`` and maps
    both endpoints with ``map_IDs``. Both helpers run in verbose mode.

    Returns
    -------
    pandas.DataFrame
        The edge list returned by ``map_IDs``.
    """
    alias_table = pd.read_csv(r"9606.protein.aliases.v12.0.txt", sep="\t")
    links = pd.read_csv(r"9606.protein.links.v12.0_sc.txt", sep=" ")

    gene_aliases = alias_table[alias_table['source'] == 'Ensembl_gene']

    unique_links = remove_redun(links, True)
    return map_IDs(unique_links, gene_aliases, verbose=True)



In [None]:
# Load the GDC gene table through ImportGDC; the bare name on the last line
# displays the resulting frame.
gdc = ImportGDC('gdc_esca_fin.csv')
gdc

In [None]:
# Build the HPA feature table through ImportHPA and display it.
hpa = ImportHPA()
hpa

In [None]:
# Load the STRING edge list through ImportSTRING (endpoints already mapped via
# the alias table) and display it.
el = ImportSTRING()
el

In [None]:
# Combine the HPA and GDC tables on "ensembl". DataFrame.merge defaults to an
# inner join, so only genes present in both tables remain.
master = hpa.merge(gdc, on="ensembl")
master

In [None]:
# Every gene that appears as either endpoint of an edge.
el_allgenes = pd.concat([el[endpoint] for endpoint in ('protein1', 'protein2')]).drop_duplicates()

# Keep only the feature rows of genes that are part of the network.
master = master[master['ensembl'].isin(el_allgenes)]
master

In [None]:
# Keep only edges whose two endpoints both have a row in `master`: each
# endpoint column is inner-joined against master's "ensembl" column in turn.
el_intersect = (
    el.iloc[:, :3]  # endpoints + weight
    .merge(master["ensembl"], left_on='protein1', right_on="ensembl")
    .drop(columns="ensembl")
    .merge(master["ensembl"], left_on='protein2', right_on="ensembl")
    .drop(columns="ensembl")
    .rename(columns={'protein1': 'gene1', 'protein2': 'gene2'})
)

# Re-attach the remaining edge columns and drop the now redundant endpoint names.
el = (
    el_intersect
    .merge(el, left_on=['gene1', 'gene2'], right_on=['protein1', 'protein2'])
    .drop(columns=['protein1', 'protein2'])
)
el

In [None]:

# Runnode2vec() loads this file with pandas' default header handling and then
# looks up a "combined_score" column. A two-column, header-less edge list would
# make that lookup fail with a KeyError, so the weight column and a header row
# are written here.
(
    el[['gene1', 'gene2', 'combined_score_x']]
    .rename(columns={'combined_score_x': 'combined_score'})
    .to_csv('eso_edge_list_latest.edg', index=False, sep='\t')
)
# Full edge table with every remaining column, comma-separated.
el.to_csv('esca_features_latest.edg', index=False)


In [None]:
import pandas as pd
import numpy as np

# Coerce the endpoint and weight columns to numbers; values that cannot be
# parsed become NaN (errors='coerce').
# NOTE(review): this overwrites the global `el` in place. If gene1/gene2 hold
# non-numeric identifiers, every row turns into NaN here and is removed by the
# dropna below -- confirm the ID format this cell expects.
el['gene1'] = pd.to_numeric(el['gene1'], errors='coerce')
el['gene2'] = pd.to_numeric(el['gene2'], errors='coerce')
el['combined_score_x'] = pd.to_numeric(el['combined_score_x'], errors='coerce')

# Drops rows with a missing value in ANY column of `el`, not only the three
# converted above.
el.dropna(inplace=True)

# Cast to int64 (fractional weights are truncated) and store as a .npy array.
genes_array = el[['gene1', 'gene2', 'combined_score_x']].to_numpy(dtype=np.int64)

np.save('eso_edge_list_latest.npy', genes_array)


In [None]:
# Disease-gene associations, restricted to genes that have a row in `master`.
dgn = ImportDGN()
dgn = dgn[dgn['ensembl'].isin(master['ensembl'])]
dgn

In [None]:
# Embed the network, then append the embedding columns to the feature table
# (inner join on "ensembl") and save the result without the row index.
network = Runnode2vec("eso_edge_list_latest.edg")
master = pd.merge(master, network, on='ensembl')
master.to_csv("node_node2vec_data_latest.csv", index=False)
master

In [None]:
# Label column: 1 for genes present in `dgn`, NaN for every other gene.
master["gda_score"] = np.where(master["ensembl"].isin(dgn["ensembl"]), 1, np.nan)


In [None]:
# Sanity check: how many rows carry the positive label in the last column.
num_ones = master.iloc[:, -1].eq(1).sum()
print("No. of 1s in the last column:", num_ones)

In [None]:
# Persist the final `master` table as CSV; index=None is falsy, so the row
# index is not written.
master.to_csv("200_node_network_embeddings_latest.csv", index=None)