In [1]:
from os import name
import pandas as pd
import mygene
import numpy as np

# Raw gist URL of gencode.v24.primary_assembly.refFlat.txt, pinned to a specific gist revision.
flat_file_url = "https://gist.github.com/kdaily/2ed85e0dd3048fea8424b40243ddfa1c/raw/420086bd941962df66992667972c13462e504cc6/gencode.v24.primary_assembly.refFlat.txt"

Agora uses one particular dataset that historically has been packed as part of an RData file.  After finding the original
code used to generate gene_info.RData, I realized that it can no longer be run.  It relies on the presence of a couple of
columns that come from the mygene package in BioConductor.  This notebook is the most faithful reproduction of that data
workflow in order to generate an interoperable file corresponding to the one we have been using in Agora for a long time.

This is the provenance of gene_info.feather.

The next cell contains the set-up required to run the notebook:

The first step is to read the raw data into a pandas dataframe and make sure the names are standardized.  The result we
get is a pandas Series that needs to be converted into a dataframe.

In [6]:
# Read the refined gene list from the local CSV and keep only the Ensembl gene ID column,
# so `gene_table` is a Series named 'ensembl_gene_id'.
#
# Disabled alternative kept for provenance: read column 0 of `flat_file_url` (tab-separated,
# no header) as 'ensembl_gene_id', strip everything from the first "." onwards with the
# regex "\\..*", drop duplicates, and wrap the result in a one-column DataFrame.
gene_table = pd.read_csv('../../gene_info_refined.csv')['ensembl_gene_id']

gene_table.shape  # should be the same as the R counterpart

(20011,)

Next, we must fetch the data from the BioConductor package in order to retrieve a few key fields.  Interestingly, the
field X_Score (a measurement of how well the search algorithm did in finding this gene) is not present anymore.  Feel
free to modify the query to include that field and verify it for yourself.

*"query" is the name of the index and needs to be named "ensembl_gene_id".

In [8]:
# Ask the mygene client for the annotation fields of every ID in `gene_table`,
# returned as a DataFrame indexed by the queried ID.
mg = mygene.MyGeneInfo()
bioconductor_gene_info = mg.getgenes(
    gene_table,
    fields=["symbol", "name", "summary", "type_of_gene", "alias"],
    as_dataframe=True,
)
# Label the index "ensembl_gene_id" so it can later be joined against the gene list.
bioconductor_gene_info.index.name = "ensembl_gene_id"
bioconductor_gene_info.head()

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-20011...done.


Unnamed: 0_level_0,notfound,_id,_version
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000069712,True,,
ENSG00000083622,,ENSG00000083622,1.0
ENSG00000093100,,ENSG00000093100,1.0
ENSG00000100101,,ENSG00000100101,1.0
ENSG00000103200,,ENSG00000103200,1.0


In [13]:
# Per-column non-null counts for the rows whose 'notfound' flag is True.
bioconductor_gene_info.loc[bioconductor_gene_info['notfound'].eq(True)].count()

notfound    4015
_id            0
_version       0
dtype: int64

In [14]:
# Per-column non-null counts for the rows where 'notfound' is missing (i.e. the lookup returned a record).
bioconductor_gene_info.loc[bioconductor_gene_info['notfound'].isnull()].count()

notfound        0
_id         15996
_version    15996
dtype: int64

In [16]:
# List the columns that came back from the lookup, to see which requested fields are present.
bioconductor_gene_info.columns

Index(['notfound', '_id', '_version'], dtype='object')

We join, and then standardize our datasets:

In [12]:
# Left-join the looked-up annotations onto the gene list. "ensembl_gene_id" is the name of the
# `gene_table` Series on the left and the index name of `bioconductor_gene_info` on the right.
gene_table_merged = pd.merge(left=gene_table, right=bioconductor_gene_info, how='left', on="ensembl_gene_id")

# Standardize the column names in three steps:
#   1. strip punctuation,
#   2. turn spaces, hyphens and dots into underscores,
#   3. lower-case everything.
# `regex=True` is explicit because both patterns are character classes; without it, newer pandas
# treats them as literal strings and the replacements silently do nothing.
# The classes are also written without the list-style quotes and commas of the earlier version:
# inside [...] those are literal members, and "'-'" parsed as a range that never matched "-".
# Commas are still stripped in step 1, as they effectively were before.
gene_table_merged.columns = (
    gene_table_merged.columns
    .str.replace(r"[#@&*^?()%$!/,]", "", regex=True)
    .str.replace(r"[ \-.]", "_", regex=True)
    .str.lower()
)


# the next two lines would be relevant if we wanted to bring in the go.MF field.  Since we do not, they're commented out.  Older datasets should still contain them, so I'm providing the logic in case you see those.
# gene_table_merged["go_mf"] = gene_table_merged["go_mf"].fillna('').astype(str)
# gene_table_merged["go_mf_pubmed"] = gene_table_merged["go_mf_pubmed"].fillna(np.nan).apply(lambda x: x if type(x) is None or type(x) is list else [x])

gene_table_merged.shape

  gene_table_merged.columns = gene_table_merged.columns.str.replace("[#,@,&,*,^,?,(,),%,$,#,!,/]", "")
  gene_table_merged.columns = gene_table_merged.columns.str.replace("[' ', '-', '.']", "_")


(60727, 9)

It's important that we check the values here.  We expect the index to be populated for every row (in other words, it should match the row count of the previous cell), while missing values in the other columns are expected.  The 'notfound' column should be an indicator that querying for that particular gene yielded no result.  Therefore, the columns used for internal purposes (the ones starting with an underscore) should all contain the same number of missing values.

In [13]:
# Report how many values are missing in each column of the merged table.
for col in gene_table_merged.columns:
    n_missing = gene_table_merged[col].isna().sum()
    print(f"Missing values from {col}: {n_missing}")

# Keep the rows whose 'notfound' flag is populated, i.e. the lookups that returned nothing.
not_found = gene_table_merged.loc[~gene_table_merged['notfound'].isna()]
not_found.shape

Missing values from ensembl_gene_id: 0
Missing values from _id: 4015
Missing values from _version: 4015
Missing values from name: 20011
Missing values from symbol: 20011
Missing values from type_of_gene: 35892
Missing values from alias: 41492
Missing values from summary: 46418
Missing values from notfound: 56712


(4015, 9)

Most importantly, we would like to make sure that there's no information in the other columns every time 'notfound' is True.  That will ensure the cleanliness of the dataset.

In [14]:
# Annotation columns only: skip names whose first character is "_" and the two bookkeeping columns.
interesting_columns = [column for column in not_found.columns if column[0] != '_']
for bookkeeping_column in ('ensembl_gene_id', 'notfound'):
    interesting_columns.remove(bookkeeping_column)

# For each annotation column, print how many not-found rows still carry a value (expected: 0).
for col in interesting_columns:
    has_value = not_found[col].notna()
    print(len(not_found[has_value]))

0
0
0
0
0


In [15]:
# Show the standardized column names of the merged table before writing it out.
gene_table_merged.columns

Index(['ensembl_gene_id', '_id', '_version', 'name', 'symbol', 'type_of_gene',
       'alias', 'summary', 'notfound'],
      dtype='object')

Lastly, we can confidently remove the values where notfound is true, and write our feather file:

In [7]:
# gene_table_merged_py = gene_table_merged.copy() # this copy gets used for analysis in the ./comparisson.ipynb file
# NOTE(review): the narrative says rows flagged 'notfound' are removed at this point, but the
# filter below is disabled, so those rows are still written to the feather file. Confirm which
# behaviour is intended before relying on the output.
# gene_table_merged = gene_table_merged[gene_table_merged['notfound'].isnull()].reset_index()


def _alias_as_list(cell):
    """Return an 'alias' cell as a list.

    Lists are passed through unchanged, a missing scalar (NaN/None) becomes an empty list,
    and any other value is wrapped in a one-element list.
    """
    if isinstance(cell, list):
        return cell
    # Only scalars are tested for missingness; pd.isna on a container returns an array.
    if pd.api.types.is_scalar(cell) and pd.isna(cell):
        return []
    return [cell]


# Normalize 'alias' in a single pass so every row holds a list. This replaces a row-by-row
# `.at[...] = []` loop that depended on unique index labels and on list assignment through `.at`.
gene_table_merged['alias'] = gene_table_merged['alias'].apply(_alias_as_list)


# gene_table_merged_py.to_feather('../output/gene_table_merged_py.feather')
gene_table_merged.to_feather('../output/gene_table_merged.feather')