In [1]:
import pandas as pd
import os

In [2]:
# Folder that holds every input and output file referenced below
DATA_DIR = 'integration_data'

# text file generated from the HGNC website
gene_mapping_filepath = os.path.join(DATA_DIR, 'gene_name_map.txt')
# CUI to HGNC mapping from UMLS
cui_to_hgnc_filepath = os.path.join(DATA_DIR, 'HGNC_to_CUI.csv')
# where the formatted gene mapping is to be written
gene_mapping_output_filepath = os.path.join(DATA_DIR, 'gene_mapping.csv')

In [3]:
# Explode alias symbols so that every alias (and the approved symbol itself)
# gets its own row pointing at the gene's approved symbol.


def _symbols_for_gene(alias_field, approved_symbol):
    """Return the symbols that should map to one gene, aliases first.

    Parameters
    ----------
    alias_field : str or NaN
        Raw comma-separated content of the 'Alias symbols' cell.
    approved_symbol : str or NaN
        Content of the 'Approved symbol' cell.

    Returns
    -------
    list of str
        Whitespace-trimmed, non-empty symbols in first-seen order with
        duplicates removed. Empty when both inputs are missing.
    """
    candidates = []
    # Columns are read with dtype=str, so anything that is not a str is a
    # missing value; skipping it avoids emitting the literal alias 'nan'.
    if isinstance(alias_field, str):
        candidates.extend(alias_field.split(','))
    if isinstance(approved_symbol, str):
        # the approved symbol is added as an alias of itself for mapping
        candidates.append(approved_symbol)
    trimmed = (symbol.strip() for symbol in candidates)
    # dict.fromkeys de-duplicates while preserving insertion order
    return list(dict.fromkeys(symbol for symbol in trimmed if symbol))


gene_name_map_df = pd.read_csv(gene_mapping_filepath, sep="\t", dtype=str)

# Build the per-gene symbol lists; an explicit Series keeps them as one
# list object per row regardless of their lengths.
gene_name_map_df['Alias symbols'] = pd.Series(
    [
        _symbols_for_gene(alias_field, approved_symbol)
        for alias_field, approved_symbol in zip(
            gene_name_map_df['Alias symbols'],
            gene_name_map_df['Approved symbol'],
        )
    ],
    index=gene_name_map_df.index,
    dtype=object,
)

gene_name_map_df = (
    gene_name_map_df
    .explode('Alias symbols')  # one row per symbol; an empty list becomes NaN
    .rename(columns={'Alias symbols': 'Alias symbol'})
    .reset_index(drop=True)    # explode repeats index labels; make them unique
)

In [4]:
# Add CUI information to every alias row.
# The join key is read as text because the gene table's 'HGNC ID' is a string
# column; letting pandas infer a numeric dtype here would make the merge fail
# on mismatched key types.
cui_to_hgnc = pd.read_csv(cui_to_hgnc_filepath, dtype={'HGNC ID': str})
# Left join: genes without an entry in the CUI file are kept, with missing
# values in the columns contributed by that file.
# NOTE: this reassigns gene_name_map_df from itself, so re-running this cell
# without first re-running the cell that builds gene_name_map_df merges twice.
gene_name_map_df = gene_name_map_df.merge(cui_to_hgnc, how='left', on='HGNC ID')

In [5]:
# Write the finished mapping table to disk as CSV, leaving out the row index
gene_name_map_df.to_csv(path_or_buf=gene_mapping_output_filepath, index=False)