In [1]:
from os import name
import pandas as pd
import mygene
import numpy as np
from pybiomart import Server

Query Ensembl for a list of all Ensembl IDs in the database of human genes. 

In [2]:
# Connect to the public Ensembl BioMart and pick the human gene dataset.
server = Server(host='http://www.ensembl.org')
dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl']

# Fetch the gene ID attribute and rename the returned "Gene stable ID" column
# to "ensembl_gene_id".
ensemblIds = (
    dataset
    .query(attributes=['ensembl_gene_id'])
    .rename(columns={"Gene stable ID": "ensembl_gene_id"})
)

# Write the bare ID list to disk: one ID per line, no index and no header row.
ensemblIds.to_csv("../output/agora_ensg_list.txt", index=False, header=False)

ensemblIds.shape

(68324, 1)

Get info on each gene from mygene

In [3]:
mg = mygene.MyGeneInfo()

# Request the annotation fields below for every Ensembl gene ID and get the
# result back as a DataFrame.
bioconductor_gene_info = mg.getgenes(
    ensemblIds['ensembl_gene_id'],
    fields=["symbol", "name", "summary", "type_of_gene", "alias"],
    as_dataframe=True,
)

# Name the index "ensembl_gene_id".
bioconductor_gene_info.index.name = "ensembl_gene_id"
bioconductor_gene_info.head()

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-25000...done.
querying 25001-26000...done.
querying 26001-27000...done.
querying 27001-28000...done.
querying 28001-29000...done.
querying 29001-30000...done.
querying 30001-31000...done.
querying 31001-32000...done.
querying 32001-33000...done.
querying 33001-34000...done.
querying 34001-35000...done.
queryin

Unnamed: 0_level_0,_id,_version,alias,name,summary,symbol,type_of_gene,notfound
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000000003,7105,2.0,"[T245, TM4SF6, TSPAN-6]",tetraspanin 6,The protein encoded by this gene is a member o...,TSPAN6,protein-coding,
ENSG00000000005,64102,2.0,"[BRICD4, CHM1L, TEM]",tenomodulin,This gene encodes a protein that is related to...,TNMD,protein-coding,
ENSG00000000419,8813,2.0,"[CDGIE, MPDS]",dolichyl-phosphate mannosyltransferase subunit...,Dolichol-phosphate mannose (Dol-P-Man) serves ...,DPM1,protein-coding,
ENSG00000000457,57147,1.0,"[PACE-1, PACE1]",SCY1 like pseudokinase 3,This gene encodes a protein with a kinase doma...,SCYL3,protein-coding,
ENSG00000000460,55732,1.0,,chromosome 1 open reading frame 112,,C1orf112,protein-coding,


In [4]:
# Per-column count of non-null values among the rows whose 'notfound' flag is True.
bioconductor_gene_info.loc[bioconductor_gene_info['notfound'].eq(True)].count()

_id             0
_version        0
alias           0
name            0
summary         0
symbol          0
type_of_gene    0
notfound        9
dtype: int64

In [5]:
# Per-column count of non-null values among the rows where 'notfound' is missing.
bioconductor_gene_info.loc[bioconductor_gene_info['notfound'].isnull()].count()

_id             68317
_version        68317
alias           25980
name            49231
summary         24434
symbol          49231
type_of_gene    39583
notfound            0
dtype: int64

In [6]:
# Show the column labels of the MyGene.info result.
bioconductor_gene_info.columns

Index(['_id', '_version', 'alias', 'name', 'summary', 'symbol', 'type_of_gene',
       'notfound'],
      dtype='object')

We join the two tables, and then standardize the column names of the result:

In [7]:
# Left-join the MyGene.info annotations onto the Ensembl ID list, so every ID from
# ensemblIds keeps at least one row even when nothing matched.
# NOTE(review): the recorded outputs show ensemblIds with 68324 rows and this frame with
# 68326, so the join adds rows (some IDs appear to match more than one annotation row).
# Confirm that the duplicated IDs are intended downstream.
gene_table_merged = pd.merge(left=ensemblIds, right=bioconductor_gene_info, how='left', on="ensembl_gene_id")

# Standardize the column names in three steps: drop punctuation, turn separators into
# underscores, lowercase.
# Every character inside a regex character class is a literal member, so no commas or
# quotes are needed as separators. The previous separator pattern "[' ', '-', '.']" parsed
# '-' as a one-character range between two quotes and therefore never matched a hyphen.
# The classes below keep every character the old patterns effectively matched
# (including ',' and "'") and add the hyphen that was intended.
gene_table_merged.columns = (
    gene_table_merged.columns
    .str.replace(r"[#@&*^?()%$!/,]", "", regex=True)
    .str.replace(r"[ '.\-]", "_", regex=True)
    .str.lower()
)

# The next two lines would be relevant if we wanted to bring in the go.MF field.
# Since we do not, they are commented out. Older datasets should still contain the
# field, so the logic is kept here in case you come across those.
# gene_table_merged["go_mf"] = gene_table_merged["go_mf"].fillna('').astype(str)
# gene_table_merged["go_mf_pubmed"] = gene_table_merged["go_mf_pubmed"].fillna(np.nan).apply(lambda x: x if type(x) is None or type(x) is list else [x])

gene_table_merged.shape

(68326, 9)

In [8]:
# Show the merged frame's column labels after the renaming step.
gene_table_merged.columns

Index(['ensembl_gene_id', '_id', '_version', 'alias', 'name', 'summary',
       'symbol', 'type_of_gene', 'notfound'],
      dtype='object')

It's important that we check the values here.  We expect the 'ensembl_gene_id' key to be populated for every row (in other words, it should have no missing values across the full row count shown above), while missing values in the other columns are expected.  The 'notfound' column is an indicator that querying for that particular gene yielded no result.  Therefore, the columns used for internal purposes (the ones starting with an underscore) should each have exactly as many missing values as there are rows with 'notfound' set.

In [9]:
# Report, for each column, how many rows have no value.
missing_per_column = gene_table_merged.isna().sum()
for col, n_missing in missing_per_column.items():
    print(f"Missing values from {col}: {n_missing}")

# Keep only the rows whose 'notfound' flag is populated.
not_found = gene_table_merged.loc[gene_table_merged['notfound'].notnull()]
not_found.shape

Missing values from ensembl_gene_id: 0
Missing values from _id: 9
Missing values from _version: 9
Missing values from alias: 42346
Missing values from name: 19095
Missing values from summary: 43892
Missing values from symbol: 19095
Missing values from type_of_gene: 28743
Missing values from notfound: 68317


(9, 9)

Most importantly, we would like to make sure that there's no information in the other columns every time 'notfound' is True.  That will ensure the cleanliness of the dataset.

In [10]:
# Annotation columns only: skip the columns whose name begins with an underscore,
# then drop the join key and the 'notfound' flag itself.
interesting_columns = [col for col in not_found.columns if col[0] != '_']
for excluded in ('ensembl_gene_id', 'notfound'):
    interesting_columns.remove(excluded)

# For each remaining column, print how many not-found rows still carry a value.
for col in interesting_columns:
    print(not_found[col].notna().sum())

0
0
0
0
0


In [11]:
# Display the merged frame's column labels again before previewing its rows.
gene_table_merged.columns

Index(['ensembl_gene_id', '_id', '_version', 'alias', 'name', 'summary',
       'symbol', 'type_of_gene', 'notfound'],
      dtype='object')

In [12]:
# Preview the first rows of the merged table.
gene_table_merged.head()

Unnamed: 0,ensembl_gene_id,_id,_version,alias,name,summary,symbol,type_of_gene,notfound
0,ENSG00000000003,7105,2.0,"[T245, TM4SF6, TSPAN-6]",tetraspanin 6,The protein encoded by this gene is a member o...,TSPAN6,protein-coding,
1,ENSG00000000005,64102,2.0,"[BRICD4, CHM1L, TEM]",tenomodulin,This gene encodes a protein that is related to...,TNMD,protein-coding,
2,ENSG00000000419,8813,2.0,"[CDGIE, MPDS]",dolichyl-phosphate mannosyltransferase subunit...,Dolichol-phosphate mannose (Dol-P-Man) serves ...,DPM1,protein-coding,
3,ENSG00000000457,57147,1.0,"[PACE-1, PACE1]",SCY1 like pseudokinase 3,This gene encodes a protein with a kinase doma...,SCYL3,protein-coding,
4,ENSG00000000460,55732,1.0,,chromosome 1 open reading frame 112,,C1orf112,protein-coding,


Lastly, we fix NULL values in the "alias" field and write our feather file:

In [13]:
def _as_alias_list(cell):
    """Return ``cell`` as a list.

    Lists pass through unchanged, a missing scalar (None/NaN) becomes an empty
    list, and any other value is wrapped in a one-element list.
    """
    if isinstance(cell, list):
        return cell
    # Only scalars are tested for missingness; pd.isna on a non-scalar would
    # return an array instead of a single bool.
    if pd.api.types.is_scalar(cell) and pd.isna(cell):
        return []
    return [cell]


# Normalize every 'alias' entry to a list in a single pass. This replaces the earlier
# row-by-row `.at[...] = []` loop over the null rows followed by a second wrapping pass.
gene_table_merged['alias'] = gene_table_merged['alias'].apply(_as_alias_list)

print(gene_table_merged.shape)

# Write the finished gene table as a feather file.
gene_table_merged.to_feather('../output/gene_table_merged_GRCh38.p13.feather')

(68326, 9)
