In [1]:
## Load libraries
import pandas as pd
import numpy as np
import mygene
import os

## Display all rows of pandas dataframes
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Opt into the future behavior for silent downcasting
pd.set_option('future.no_silent_downcasting', True)

In [2]:
def _first_ensembl_gene(raw_ensembl):
    """Return the first Ensembl gene ID in a raw MyGene ``ensembl`` value, else ``pd.NA``."""
    ## A lone mapping is treated like a one-element list of mappings
    if isinstance(raw_ensembl, dict):
        raw_ensembl = [raw_ensembl]
    if isinstance(raw_ensembl, (list, tuple)):
        for entry in raw_ensembl:
            if isinstance(entry, dict) and entry.get("gene"):
                return entry["gene"]
    ## NaN / None / anything unexpected -> no usable ID
    return pd.NA


def _annotate_from_column(mg, frame, key, prefix):
    """Left-join MyGene annotations for the values of ``frame[key]`` onto ``frame``.

    Adds two columns, ``{prefix}_ensembl_id`` and ``{prefix}_official_gene_symbol``,
    and returns a new dataframe with a fresh 0..n-1 index (same as ``DataFrame.merge``).
    """
    ensembl_col = f"{prefix}_ensembl_id"
    symbol_col = f"{prefix}_official_gene_symbol"

    ## Unique, non-missing values to look up
    queries = frame[key].dropna().drop_duplicates().to_list()

    hits = None
    if queries:
        hits = mg.querymany(queries, scopes='symbol,alias,name,ensembl.gene,ensembl.transcript', fields='symbol,ensembl.gene', species='human', as_dataframe=True, df_index=False, returnall=False)

    ## Nothing to query (column entirely missing) or nothing returned:
    ## add all-NA annotation columns instead of failing further down
    if hits is None or len(hits) == 0:
        return frame.reset_index(drop=True).assign(**{ensembl_col: pd.NA, symbol_col: pd.NA})

    ## Which of these columns exist depends on what the hits contained, so
    ## reindex to guarantee they are all present (absent ones become NaN).
    ## Then keep only the highest-scoring hit per query (NaN scores sort last).
    best = (
        hits.reindex(columns=["query", "_score", "symbol", "ensembl.gene", "ensembl"])
        .sort_values(by=["query", "_score"], ascending=[True, False])
        .drop_duplicates(subset="query", keep="first")
    )

    ## Prefer the scalar 'ensembl.gene'; otherwise fall back to the first gene
    ## listed in the raw 'ensembl' value. Missing values are standardized to pd.NA.
    lookup = pd.DataFrame(
        {
            key: best["query"].to_list(),
            ensembl_col: [
                gene if pd.notna(gene) else _first_ensembl_gene(raw)
                for gene, raw in zip(best["ensembl.gene"], best["ensembl"])
            ],
            symbol_col: [symbol if pd.notna(symbol) else pd.NA for symbol in best["symbol"]],
        },
        dtype=object,
    )

    return frame.merge(lookup, how="left", on=key)


def unify_gene_ids_and_symbols(df, mg=None):
    """Map the genes in ``df`` to one Ensembl gene ID and one official symbol via MyGene.

    The unique values of ``gene_id`` and of ``gene_symbol`` are each sent to
    ``querymany`` (human only). For every query the highest-scoring hit is kept.
    Annotations obtained through ``gene_id`` take precedence; annotations obtained
    through ``gene_symbol`` fill whatever is still missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``gene_id`` and ``gene_symbol``. It is not modified.
    mg : object, optional
        Client exposing ``querymany`` with the ``mygene.MyGeneInfo`` call signature.
        When omitted, a new ``mygene.MyGeneInfo()`` is created (original behavior).
        Passing one allows client reuse across calls and offline stubbing.

    Returns
    -------
    pandas.DataFrame
        ``ensembl_id`` and ``official_symbol`` first, followed by the remaining
        input columns; ``gene_id`` and ``gene_symbol`` are removed. Rows lacking
        both annotations are dropped, and only the FIRST row of each
        (``ensembl_id``, ``official_symbol``) pair is kept, so inputs with several
        rows per gene are collapsed to one row per gene.

    Raises
    ------
    KeyError
        If ``gene_id`` or ``gene_symbol`` is absent from ``df``.

    Notes
    -----
    NOTE(review): values are sent to MyGene exactly as stored. Confirm whether
    version-suffixed Ensembl IDs (e.g. ``ENSG00000004059.10``) resolve, or whether
    the caller is expected to strip the suffix first.
    """
    missing_columns = [col for col in ("gene_id", "gene_symbol") if col not in df.columns]
    if missing_columns:
        raise KeyError(f"unify_gene_ids_and_symbols: missing required column(s) {missing_columns}")

    if mg is None:
        mg = mygene.MyGeneInfo()

    ## Annotate by gene_id, then by gene_symbol (each step returns a new dataframe,
    ## so the caller's dataframe is never mutated)
    dff = _annotate_from_column(mg, df, key="gene_id", prefix="from_id")
    dff = _annotate_from_column(mg, dff, key="gene_symbol", prefix="from_symbol")

    ## Combine results from both approaches: id-derived first, symbol-derived as fallback
    dff["ensembl_id"] = dff["from_id_ensembl_id"].fillna(dff["from_symbol_ensembl_id"])
    dff["official_symbol"] = dff["from_id_official_gene_symbol"].fillna(dff["from_symbol_official_gene_symbol"])

    ## Drop intermediate columns and the original identifiers
    dff = dff.drop(columns=["from_id_ensembl_id", "from_id_official_gene_symbol", "from_symbol_ensembl_id", "from_symbol_official_gene_symbol", "gene_id", "gene_symbol"])

    ## Drop anything that is missing both ensembl_id and official_symbol
    dff = dff.dropna(subset=["ensembl_id", "official_symbol"], how="all")

    ## Keep one row per gene. A prior full-row de-duplication would be redundant:
    ## any fully duplicated row also repeats its (ensembl_id, official_symbol) pair.
    dff = dff.drop_duplicates(subset=["ensembl_id", "official_symbol"])

    ## Make ensembl id and official symbol the first two columns (selected by name,
    ## not by position)
    leading = ["ensembl_id", "official_symbol"]
    trailing = [col for col in dff.columns if col not in leading]

    return dff[leading + trailing].copy()

In [3]:
## Import data: the CSV path is relative, so it resolves against the current working directory
MARQUES_COELHO_CSV = "../../data/raw_data/meta-analysis/marques-coelho_et_al.csv"
df = pd.read_csv(MARQUES_COELHO_CSV)


In [6]:
## Preview the first five rows of the imported table
df.head(n=5)

Unnamed: 0,TXNAME,dataset,GENEID,GENE_NAME,gene.log2FC,gene.padj,transcript.log2FC,transcript.padj,gene_biotype,iso_biotype,dtu.dIF,dtu.gene.padj,dtu.isoform.padj,dtu.ofdr.gene,dtu.ofdr.transcript,threshDT,threshG
0,ENST00000000233.9,MAYO,ENSG00000004059.10,ARF5,-0.281214,0.000813,-0.283385,0.011291,protein_coding,protein_coding,0.00437,,,,,Not Significant,Non-Significant
1,ENST00000000233.9,MSBB BM10,ENSG00000004059.10,ARF5,-0.263954,0.031766,-0.2437,0.126583,protein_coding,protein_coding,0.004464,,,,,Not Significant,Non-Significant
2,ENST00000000233.9,MSBB BM22,ENSG00000004059.10,ARF5,-0.260638,0.050646,-0.291972,0.096546,protein_coding,protein_coding,-0.001508,,,,,Not Significant,Non-Significant
3,ENST00000000233.9,MSBB BM36,ENSG00000004059.10,ARF5,-0.435165,0.000253,-0.472171,0.007999,protein_coding,protein_coding,-0.053296,,,,,,DEG
4,ENST00000000233.9,MSBB BM44,ENSG00000004059.10,ARF5,-0.25296,0.041865,-0.304051,0.075831,protein_coding,protein_coding,-0.003478,,,,,Not Significant,Non-Significant
