# Focused Entrez Converter
#### This file takes the raw gene files and converts them to the CARVA format, without the overhead of running many traits in parallel.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from collections import defaultdict
from tqdm import tqdm
import networkx as nx
from matplotlib_venn import venn2, venn3
from neteval import gene_mapper as gm
from neteval import query_ensembl as qe
from neteval import query_hgnc as qh
import obonet as obo
import re
from sklearn.metrics.pairwise import cosine_similarity

from pathlib import Path

  from pkg_resources import DistributionNotFound, get_distribution


### base settings
datadir is the directory at the base of the project

common data name is the trait for the common genes

In [2]:
# Project base directory: the parent of the current working directory
# (assumes the notebook is launched from a sub-folder of the project — TODO confirm).
datadir = Path.cwd().parent

In [11]:
# Trait prefix used to build the common-variant input/output file names in this notebook.
common_data_name = "lupus_canon"

#### Data Cleaning
In this part we clean the lupus files so that they match the CARVA pipeline.

Note: I am not sure we need the multi-gene filter below; we might want to keep associations that are connected to several genes.
If I want to keep them, the rows need to be split so that each row holds a single gene.

In [None]:
def clean_gwas_catalog_data(datafile, outfile, pval_th=5e-8, include_intergenic=False):
    """Clean the GWAS Catalog data and write to a new file.

    Rows are kept only if they pass the p-value threshold, have both a gene ID
    and a mapped trait URI, map to exactly one gene and one trait, and (unless
    ``include_intergenic`` is set) are not flagged as intergenic. A
    ``TRAIT_CODE`` column holding the last path component of
    ``MAPPED_TRAIT_URI`` is added before writing.

    Args:
        datafile (str or path-like): tab-separated GWAS Catalog associations file
        outfile (str or path-like): output file for cleaned data (tab-separated)
        pval_th (float): p-value threshold; rows with P-VALUE above it
            (or with a missing P-VALUE) are dropped
        include_intergenic (bool): whether to include intergenic associations;
            when True the upstream/downstream gene columns are also loaded

    Returns:
        None
    """
    cols = [
        'DATE', 'PUBMEDID', 'DISEASE/TRAIT', 'MAPPED_GENE', 'SNP_GENE_IDS', 'P-VALUE', 'OR or BETA',
        'MAPPED_TRAIT', 'MAPPED_TRAIT_URI', 'INTERGENIC', 'STUDY ACCESSION', 'SNP_ID_CURRENT',
        'INITIAL SAMPLE SIZE', 'GENOTYPING TECHNOLOGY',
    ]
    if include_intergenic:
        cols = cols + ['UPSTREAM_GENE_ID', 'DOWNSTREAM_GENE_ID', 'UPSTREAM_GENE_DISTANCE', 'DOWNSTREAM_GENE_DISTANCE']
    data = pd.read_csv(datafile, sep="\t", usecols=cols, low_memory=False)
    # filter on pval
    data = data[data["P-VALUE"] <= pval_th]
    # filter on gene and trait present (must precede the .str filters below,
    # which cannot be negated when they contain missing values)
    data = data.dropna(subset=['SNP_GENE_IDS', "MAPPED_TRAIT_URI"])
    # filter out intergenic
    if not include_intergenic:
        data = data[data["INTERGENIC"] == 0]
    # remove associations with multiple genes or multiple traits; the comma is
    # matched literally, so regex interpretation is switched off
    single_gene = ~data["SNP_GENE_IDS"].str.contains(",", regex=False)
    single_trait = ~data["MAPPED_TRAIT_URI"].str.contains(",", regex=False)
    # take an explicit copy so that adding a column below does not write into a
    # slice of the frame returned by read_csv (avoids SettingWithCopyWarning)
    data = data[single_gene & single_trait].copy()
    # create trait code: last path component of the trait URI
    data['TRAIT_CODE'] = data['MAPPED_TRAIT_URI'].str.rsplit('/', n=1).str[-1]
    # write the cleaned file
    data.to_csv(outfile, sep="\t", index=False)
    print("Wrote data to", outfile)

In [None]:
# Clean the raw common-variant file. pval_th=1 effectively disables the
# p-value filter (rows with a missing P-VALUE are still dropped).
raw_common_path = datadir / f"data/{common_data_name}_common.txt"
cleaned_common_path = datadir / "data" / f"{common_data_name}_common.txt.update"
clean_gwas_catalog_data(raw_common_path, cleaned_common_path, pval_th=1)

In [None]:
# Load the cleaned common-variant table and preview its first two rows.
cleaned_common_file = datadir / "data" / f"{common_data_name}_common.txt.update"
gwas_genes = pd.read_csv(cleaned_common_file, sep="\t")
display(gwas_genes.head(n=2))

In [None]:
# Adapted from notebook 1A (map identifiers).
# Step 1: look up the current Ensembl ID for every distinct gene ID in the GWAS table
# (presumably 'from' holds the queried ID and 'to' the latest one — confirm against neteval).
distinct_gene_ids = gwas_genes['SNP_GENE_IDS'].unique()
ensembl_map, missing = qe.get_latest_ensembl_id(distinct_gene_ids)
# Step 2: convert those Ensembl IDs to Entrez; '' marks IDs without an Entrez match.
ensembl_to_entrez, missing_entrez = gm.convert_node_ids(ensembl_map['to'].values, 'Ensembl', 'Entrez')
ensembl_map['Entrez'] = [ensembl_to_entrez.get(ens_id, '') for ens_id in ensembl_map['to']]
# Step 3: attach the Entrez column to the GWAS rows; the left join keeps unmapped rows too.
id_ensembl = gwas_genes.merge(
    ensembl_map[['from', 'Entrez']],
    left_on='SNP_GENE_IDS',
    right_on='from',
    how='left',
)

In [None]:
# Try mapping based on symbols for unsuccessful conversions.
# Rows count as unmapped when the Ensembl route produced no Entrez ID (NaN from
# the left merge, or '' when the Ensembl ID had no Entrez match).
unmapped = id_ensembl['Entrez'].isnull() | (id_ensembl['Entrez'] == '')
symbol_map, symbol_missing = qh.perform_hgnc_query(id_ensembl.loc[unmapped, 'MAPPED_GENE'].unique(), 'Symbol', 'Symbol')
symbol_to_entrez, missing = gm.convert_node_ids(list(symbol_map.values()), 'Symbol', 'Entrez')
symbol_map = pd.DataFrame(symbol_map.items(), columns=['from', 'to'])
symbol_map['Entrez'] = [symbol_to_entrez.get(x, '') for x in symbol_map['to']]
# Select the unmapped rows with the boolean mask. The previous
# `gwas_genes.iloc[~id_ensembl.index]` applied bitwise NOT to the integer index
# labels, yielding negative positions rather than "rows not mapped".
# The stale 'from'/'Entrez' columns are dropped so the merge can add fresh ones.
id_symbol = (
    id_ensembl.loc[unmapped]
    .drop(columns=['from', 'Entrez'])
    .merge(symbol_map[['from', 'Entrez']], left_on='MAPPED_GENE', right_on='from', how='inner')
)
# Keep only the rows that the Ensembl route mapped successfully.
id_ensembl = id_ensembl.loc[~unmapped]

In [None]:
# Stack the Ensembl-derived and symbol-derived mappings, drop rows that still
# have an empty Entrez ID, and save the result.
converted_gwas_genes = pd.concat([id_ensembl, id_symbol])
has_entrez = converted_gwas_genes['Entrez'].ne('')
converted_gwas_genes = converted_gwas_genes[has_entrez]
converted_common_path = datadir / "data" / f"{common_data_name}_common.txt.update.enterz"
converted_gwas_genes.to_csv(converted_common_path, sep="\t", index=False)

In [None]:
# Preview the Entrez ID and p-value of the first two converted rows.
preview_cols = ["Entrez", "P-VALUE"]
display(converted_gwas_genes[preview_cols].head(2))

In [None]:
# Reload the converted common-variant table from disk and preview it.
commons = pd.read_csv(datadir/ "data" / f"{common_data_name}_common.txt.update.enterz", sep="\t")
# head must be called: passing the bound method displays its repr instead of the first rows.
display(commons.head())

In [None]:
# List the available columns before selecting the ones to keep.
print(commons.columns)

In [None]:
# Keep only the ID, symbol, p-value and effect-size columns, renaming
# 'P-VALUE' to 'P-value'. The rename is chained (not inplace) so it never
# mutates a slice of `commons`, which would raise SettingWithCopyWarning.
commons_clean = commons[["Entrez", "MAPPED_GENE", "P-VALUE", "OR or BETA"]].rename(columns={'P-VALUE': 'P-value'})
display(commons_clean)

#### optional: set mapped gene as the entrez col

Why? For some reason my version of the networks uses gene symbols instead of Entrez IDs, but the code takes the Entrez column as its input.

In [12]:
# If the converted common-variant file already exists, read it directly
# instead of re-running the conversion.
commons_clean = pd.read_csv(datadir / "data" / "lupus_canon_entrez_cv.txt", sep="\t")
display(commons_clean)

Unnamed: 0,Entrez,Gene Symbol,P-value,Beta
0,57118,CAMK1D,3e-08,1.11
1,51347,TAOK3,3e-08,1.12
2,80205,CHD9,5e-08,1.15


In [13]:
# Move the numeric IDs to 'Entrez_old' and expose the gene symbol under the
# 'Entrez' name; whichever symbol column is present ('Gene Symbol' or
# 'MAPPED_GENE') gets renamed, and absent keys are ignored by rename.
rename_mapping = {
    'Entrez': 'Entrez_old',
    'Gene Symbol': 'Entrez',
    "MAPPED_GENE": "Entrez",
}
commons_renamed = commons_clean.rename(rename_mapping, axis="columns")

In [14]:
# Write the final common-variant table (tab-separated, without the index).
commons_renamed.to_csv(datadir/ "data" / f"{common_data_name}_cv.txt", sep="\t", index=False)

## Converting ravar genes

converting the rare genes file to carva standard format

In [None]:
# Load the rare-variant gene table and rename its p-value column to 'P-value'.
ravar_genes = (
    pd.read_csv(datadir / "data" / "lupus_rare_genes.txt", sep="\t")
    .rename(columns={"P-Value": "P-value"})
)
# note - skipped taking a logp col since gwas doesnt have it, if missing later come back here

In [None]:
# Step 1: look up the current Ensembl ID for every distinct rare-variant gene ID.
distinct_ravar_ids = ravar_genes['Ensembl ID'].unique()
ensembl_map, missing = qe.get_latest_ensembl_id(distinct_ravar_ids)
# Step 2: convert to Entrez; '' marks Ensembl IDs without an Entrez match.
ensembl_to_entrez, missing_entrez = gm.convert_node_ids(ensembl_map['to'].values, 'Ensembl', 'Entrez')
ensembl_map['Entrez'] = [ensembl_to_entrez.get(ens_id, '') for ens_id in ensembl_map['to']]
# Step 3: inner join keeps only rows present in the map, then drop rows with no Entrez ID.
id_ensembl = ravar_genes.merge(
    ensembl_map[['from', 'Entrez']],
    left_on='Ensembl ID',
    right_on='from',
    how='inner',
)
id_ensembl = id_ensembl[id_ensembl['Entrez'].ne('')]

In [None]:
# Fall back to gene symbols when some Ensembl IDs had no Entrez match.
if len(missing_entrez) > 0:
    # NOTE(review): missing_entrez comes from converting ensembl_map['to'], so it
    # presumably holds *latest* Ensembl IDs; rows whose ID was updated may not be
    # matched by this isin() — confirm against neteval.
    symbol_map, symbol_missing = qh.perform_hgnc_query(ravar_genes[ravar_genes['Ensembl ID'].isin(missing_entrez)]['Gene Symbol'].unique(), 'Symbol', 'Symbol')
    symbol_to_entrez, missing = gm.convert_node_ids(list(symbol_map.values()), 'Symbol', 'Entrez')
    symbol_map = pd.DataFrame(symbol_map.items(), columns=['from', 'to'])
    symbol_map['Entrez'] = [symbol_to_entrez.get(x, '') for x in symbol_map['to']]
    # Rows the Ensembl route did not convert. The previous
    # `ravar_genes.iloc[~id_ensembl.index]` applied bitwise NOT to integer index
    # labels, which selects rows by negative position rather than "not mapped".
    unmapped_rows = ravar_genes[~ravar_genes['Ensembl ID'].isin(id_ensembl['Ensembl ID'])]
    id_symbol = unmapped_rows.merge(symbol_map[['from', 'Entrez']], left_on='Gene Symbol', right_on='from', how='inner')
    # Drop symbol rows that still have no Entrez ID, matching the filter already
    # applied to the Ensembl-derived rows.
    id_symbol = id_symbol[id_symbol['Entrez'] != '']
    converted_ravar_genes = pd.concat([id_ensembl, id_symbol])
else:
    converted_ravar_genes = id_ensembl

In the following code I removed the PMID, trait label and reported trait columns, as I don't need them.

In [None]:
# Left-join the converted rows onto the de-duplicated key columns of the raw
# table. Every column of the right-hand frame is a join key, so no new columns
# are added (the PMID / trait columns were removed from this step).
ravar_key_cols = ['Ensembl ID', 'P-value', 'Location', 'Gene Symbol']
ravar_keys = ravar_genes[ravar_key_cols].drop_duplicates()
converted_ravar_genes = converted_ravar_genes.merge(
    ravar_keys,
    on=['Gene Symbol', 'Ensembl ID', 'P-value', 'Location'],
    how='left',
)

In [None]:
# Inspect the converted rare-variant table.
display(converted_ravar_genes)

In [None]:
# Save the converted rare-variant table (tab-separated, without the index).
converted_ravar_genes.to_csv(datadir/ "data/lupus_rare.txt.update.enterz", sep="\t", index=False)

Now reduce the columns to only those that the original pipeline produced.

In [None]:
# Reload the converted rare-variant table from disk and preview it.
ravars = pd.read_csv(datadir/ "data/lupus_rare.txt.update.enterz", sep="\t")
# head must be called: passing the bound method displays its repr instead of the first rows.
display(ravars.head())

In [None]:
# Restrict the rare-variant table to the ID, symbol and p-value columns.
rv_output_cols = ['Entrez', 'Gene Symbol', 'Ensembl ID', 'P-value']
ravar_clean = ravars[rv_output_cols]
display(ravar_clean)

In [None]:
# Save the reduced rare-variant table (tab-separated, without the index).
ravar_clean.to_csv(datadir/ "data/lupus_rv.txt", sep="\t", index=False)

In [15]:
# If the converted rare-variant file already exists, read it directly
# instead of re-running the conversion.
# NOTE(review): this reads "lupus_canon_entrez_rv.txt", not the "lupus_rv.txt"
# written by the earlier to_csv cell — confirm which file is intended.
ravar_clean = pd.read_csv(datadir / "data" / "lupus_canon_entrez_rv.txt", sep="\t")
display(ravar_clean)

Unnamed: 0,Entrez,Gene Symbol,Ensembl ID,P-value
0,9623,TCL1B,ENSG00000213231,1e-06
1,25948,KBTBD2,ENSG00000170852,9e-06
2,5799,PTPRN2,ENSG00000282185,7.1e-05
3,1553,CYP2A13,ENSG00000197838,7.3e-05
4,10206,TRIM13,ENSG00000204977,8.6e-05


In [16]:
# Move the numeric IDs to 'Entrez_old' and expose the gene symbol under the
# 'Entrez' name; keys that are not present as columns are ignored by rename.
rename_mapping = {
    'Entrez': 'Entrez_old',
    'Gene Symbol': 'Entrez',
    "MAPPED_GENE": "Entrez",
}
ravar_renamed = ravar_clean.rename(rename_mapping, axis="columns")

In [17]:
# Write the final rare-variant table (tab-separated, without the index).
ravar_renamed.to_csv(datadir/ "data" / f"{common_data_name}_rv.txt", sep="\t", index=False)