# Setup

- Import libraries
- Get project directory and set working directory as project directory

In [5]:
import os, importlib
import pandas as pd
import numpy as np
from IPython.core.display import display, HTML
from scripts import processData, files
importlib.reload(processData)
importlib.reload(files)

paths = files.getPaths()
os.chdir(paths['dirProject'])
print('Current working directory: ' + os.getcwd())

Current working directory: /home/groups/kornberg/bentyeh/projects/disprot


In [2]:
display(HTML("<style>.container { width:80% !important; }</style>"))

Directory structure
- data/
  - ref_raw/: raw reference data - should never be used except in data pre-processing step
  - ref_proc/: processed reference data - format of reference data to be used in the program
  - ml_datasets/: datasets for training ML models for CS 229 project
  - data_aux/: auxiliary data

# Download Data

In [3]:
%%bash
# wget parameters
# -nc: no clobber - only download if file is not already present
# -O: output file
# -P: directory prefix

# UniProtKB
# - UP000005640_9606.fasta.gz: reference human proteome
# - UP000005640_9606.idmapping.gz: mappings from UniProtKB to other databases
# - See ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/README
wget -nc -P data/ref_raw/uniprot \
  ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606.fasta.gz
wget -nc -P data/ref_raw/uniprot \
  ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606.idmapping.gz
wget -nc -P data/ref_raw/uniprot \
  ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz

# UCSC
# - knownCanonical_GRCh38.txt.gz: canonical splice variants of each gene, GRCh38/hg38 genome build
# - knownCanonical_GRCh37.txt.gz: canonical splice variants of each gene, GRCh37/hg19 genome build
# - See http://genome.ucsc.edu/cgi-bin/hgTables?hgta_doSchemaDb=hg38&hgta_doSchemaTable=knownCanonical
wget -nc -O data/ref_raw/ucsc/knownCanonical_GRCh38.txt.gz \
  http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/knownCanonical.txt.gz
wget -nc -O data/ref_raw/ucsc/knownCanonical_GRCh37.txt.gz \
  http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/knownCanonical.txt.gz

# Ensembl
# - Homo_sapiens.GRCh37.63.pep.all.fa.gz: human proteome
# - See http://ftp.ensembl.org/pub/release-63/fasta/homo_sapiens/pep/README
wget -nc -P data/ref_raw/ensembl \
  http://ftp.ensembl.org/pub/release-63/fasta/homo_sapiens/pep/Homo_sapiens.GRCh37.63.pep.all.fa.gz
wget -nc -P data/ref_raw/ensembl \
  http://ftp.ensembl.org/pub/release-94/fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz

# HGNC
# - gene_groups.tsv: HGNC gene groups dataset
# - See https://www.genenames.org/data/genegroup/ and https://www.genenames.org/download/statistics-and-files/
if [ ! -f data/ref_raw/hgnc/gene_groups.tsv.gz ]; then
  wget -nc -O data/ref_raw/hgnc/gene_groups.tsv \
    https://www.genenames.org/cgi-bin/genegroup/download-all
  gzip data/ref_raw/hgnc/gene_groups.tsv
fi

# D2P2
# - all.disrange.gz: disorder predictions from all predictors for all proteins
# - vlxt.disrange.gz: disorder predictions from VLXT predictor for all proteins
# - genomes.protein.gz: map D2P2 protein ID to external protein accession number (e.g., for humans: Ensembl ENSP IDs)
# - d2p2_protein_to_uniprot.tsv.bz2: map D2P2 protein ID to UniProtKB accession number
# - See http://d2p2.pro/download
wget -nc -P data/ref_raw/d2p2/disorder \
  http://d2p2.pro/downloads/all.disrange.gz
wget -nc -P data/ref_raw/d2p2/disorder \
  http://d2p2.pro/downloads/vlxt.disrange.gz
wget -nc -P data/ref_raw/d2p2/disorder \
  http://d2p2.pro/downloads/vsl2b.disrange.gz
wget -nc -P data/ref_raw/d2p2/protein \
  http://d2p2.pro/downloads/genomes.protein.gz
wget -nc -P data/ref_raw/d2p2 \
  http://d2p2.pro/database/d2p2_protein_to_uniprot.tsv.bz2

# HIPPIE
# - hippie_current.txt: human PPI database
if [ ! -f data/ref_raw/hippie_current.txt.gz ]; then
  wget -nc -P data/ref_raw \
    http://cbdm-01.zdv.uni-mainz.de/~mschaefer/hippie/hippie_current.txt
  gzip data/ref_raw/hippie_current.txt
fi

# Disprot
# 

File ‘data/ref_raw/uniprot/UP000005640_9606.fasta.gz’ already there; not retrieving.
File ‘data/ref_raw/uniprot/UP000005640_9606.idmapping.gz’ already there; not retrieving.
File ‘data/ref_raw/uniprot/HUMAN_9606_idmapping.dat.gz’ already there; not retrieving.
File `data/ref_raw/ucsc/knownCanonical_GRCh38.txt.gz' already there; not retrieving.
File `data/ref_raw/ucsc/knownCanonical_GRCh37.txt.gz' already there; not retrieving.
File ‘data/ref_raw/ensembl/Homo_sapiens.GRCh37.63.pep.all.fa.gz’ already there; not retrieving.

File ‘data/ref_raw/ensembl/Homo_sapiens.GRCh38.pep.all.fa.gz’ already there; not retrieving.

File ‘data/ref_raw/d2p2/disorder/all.disrange.gz’ already there; not retrieving.

File ‘data/ref_raw/d2p2/disorder/vlxt.disrange.gz’ already there; not retrieving.

File ‘data/ref_raw/d2p2/disorder/vsl2b.disrange.gz’ already there; not retrieving.

File ‘data/ref_raw/d2p2/protein/genomes.protein.gz’ already there; not retrieving.

File ‘data/ref_raw/d2p2/d2p2_protein_to_unipr

GO terms
- GO:0003700 DNA-binding transcription factor activity
- GO:0051090 regulation of DNA-binding transcription factor activity
- GO:0006357 regulation of transcription by RNA polymerase II
  - has over 10000 annotations, slow to download
- GO:2000142 regulation of DNA-templated transcription, initiation
- GO:0032784 regulation of DNA-templated transcription, elongation
- GO:0031554 regulation of DNA-templated transcription, termination
- GO:0001047 core promoter binding
- GO:0003712 transcription coregulator activity
- GO:0001012 RNA polymerase II regulatory region DNA binding
- GO:0016592 mediator complex
- GO:0097718 disordered domain specific binding

In [4]:
if not os.path.exists(paths['gl_QuickGO_raw']):
    goIds = ['GO:0003700', 'GO:0051090', 'GO:0006357', 'GO:2000142', 'GO:0032784', 'GO:0031554',
             'GO:0001047', 'GO:0003712', 'GO:0001012', 'GO:0016592', 'GO:0097718']
    gl_QuickGO_raw = processData.getQuickGO(','.join(goIds))
    gl_QuickGO_raw.to_csv(paths['gl_QuickGO_raw'], **files.save_kwargs)

# Process Data

## Reference Data
Process raw data from data/ref_raw to data/ref_proc

In [2]:
# Script parameters
# - reprocess: bool
#     (Re)process and save data, or read processed data
# - show_df: bool
#     Display DataFrames in the notebook after reading and processing data

reprocess = False
show_df = True

### Ensembl
Parse Ensembl proteome FASTA file into TSV table

In [6]:
# Ensembl human proteome
if reprocess or not os.path.exists(paths['ensemblProteome']):
    ensemblProteome = processData.fastaToDF(paths['ensemblProteome_raw'],
                                            headerParser=processData.parseEnsemblPepHeader)
    
    # drop versioning from transcript and gene IDs
    ensemblProteome['id'] = ensemblProteome['id'].str.split('.').str[0]
    ensemblProteome['transcript'] = ensemblProteome['transcript'].str.split('.').str[0]
    ensemblProteome['gene'] = ensemblProteome['gene'].str.split('.').str[0]
    ensemblProteome.to_csv(paths['ensemblProteome'], **files.save_kwargs)
else:
    ensemblProteome = pd.read_table(paths['ensemblProteome'])

if show_df:
    display(ensemblProteome)

Unnamed: 0,accession,coord_system,description,end,gene,gene_biotype,gene_symbol,id,name,seq,seqtype,source,start,status,strand,transcript,transcript_biotype,version
0,HGNC:12158,chromosome,T cell receptor beta diversity 1,142786224,ENSG00000282431,TR_D_gene,TRBD1,ENSP00000487941,7,GTGG,pep,HGNC Symbol,142786213,,1,ENST00000632684,TR_D_gene,GRCh38
1,HGNC:12255,chromosome,T cell receptor delta diversity 2,22439015,ENSG00000237235,TR_D_gene,TRDD2,ENSP00000451515,14,PSY,pep,HGNC Symbol,22439007,,1,ENST00000434970,TR_D_gene,GRCh38
2,HGNC:12256,chromosome,T cell receptor delta diversity 3,22449125,ENSG00000228985,TR_D_gene,TRDD3,ENSP00000452494,14,TGGY,pep,HGNC Symbol,22449113,,1,ENST00000448914,TR_D_gene,GRCh38
3,HGNC:12254,chromosome,T cell receptor delta diversity 1,22438554,ENSG00000223997,TR_D_gene,TRDD1,ENSP00000451042,14,EI,pep,HGNC Symbol,22438547,,1,ENST00000415118,TR_D_gene,GRCh38
4,HGNC:12158,chromosome,T cell receptor beta diversity 1,142847317,ENSG00000282253,TR_D_gene,TRBD1,ENSP00000488240,CHR_HSCHR7_2_CTG6,GTGG,pep,HGNC Symbol,142847306,,1,ENST00000631435,TR_D_gene,GRCh38
5,HGNC:5495,chromosome,immunoglobulin heavy diversity 3-10,105904527,ENSG00000211923,IG_D_gene,IGHD3-10,ENSP00000419773,14,VLLWFGELL,pep,HGNC Symbol,105904497,,-1,ENST00000390583,IG_D_gene,GRCh38
6,HGNC:5496,chromosome,immunoglobulin heavy diversity 3-16,105895670,ENSG00000211917,IG_D_gene,IGHD3-16,ENSP00000428366,14,VL*LRLGELSLY,pep,HGNC Symbol,105895634,,-1,ENST00000390577,IG_D_gene,GRCh38
7,HGNC:5483,chromosome,immunoglobulin heavy diversity 1-14 (non-funct...,105900654,ENSG00000227108,IG_D_gene,IGHD1-14,ENSP00000418765,14,GITGT,pep,HGNC Symbol,105900638,,-1,ENST00000451044,IG_D_gene,GRCh38
8,HGNC:5489,chromosome,immunoglobulin heavy diversity 2-15,105897987,ENSG00000211918,IG_D_gene,IGHD2-15,ENSP00000427969,14,RIL*WW*LLL,pep,HGNC Symbol,105897957,,-1,ENST00000390578,IG_D_gene,GRCh38
9,HGNC:5491,chromosome,immunoglobulin heavy diversity 2-21,105888578,ENSG00000211912,IG_D_gene,IGHD2-21,ENSP00000429324,14,SILWW*LLF,pep,HGNC Symbol,105888551,,-1,ENST00000390572,IG_D_gene,GRCh38


In [7]:
# Ensembl human proteome
if reprocess or not os.path.exists(paths['ensemblProteome63']):
    ensemblProteome63 = processData.fastaToDF(paths['ensemblProteome63_raw'],
                                              save=paths['ensemblProteome63'],
                                              headerParser=processData.parseEnsemblPepHeader,
                                              **files.save_kwargs)
else:
    ensemblProteome63 = pd.read_table(paths['ensemblProteome63'])

if show_df:
    display(ensemblProteome63)

Unnamed: 0,coord_system,end,gene,id,name,seq,seqtype,start,status,strand,transcript,version
0,chromosome,120741861,ENSG00000089163,ENSP00000444838,12,MTGAGISTESGIPDYRSEKVGLYARTDRRPIQHGDFVRSAPIRQRY...,pep,120740119,novel,1,ENST00000536460,GRCh37
1,chromosome,120751052,ENSG00000089163,ENSP00000202967,12,MKMSFALTFRSAKGRWIANPSQPCSKASIGLFVPASPPLDPEKVKE...,pep,120740124,known,1,ENST00000202967,GRCh37
2,chromosome,169746955,ENSG00000152253,ENSP00000282074,2,MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,pep,169727404,known,-1,ENST00000282074,GRCh37
3,chromosome,169769787,ENSG00000152253,ENSP00000393322,2,MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,pep,169733748,known,-1,ENST00000451987,GRCh37
4,chromosome,50689690,ENSG00000100429,ENSP00000397517,22,MGTALVYHEDMTATRLLWDDPECEIERPERLTAALDRLRQRGLEQR...,pep,50683612,known,-1,ENST00000415993,GRCh37
5,chromosome,50689814,ENSG00000100429,ENSP00000216271,22,MGTALVYHEDMTATRLLWDDPECEIERPERLTAALDRLRQRGLEQR...,pep,50683612,known,-1,ENST00000216271,GRCh37
6,chromosome,50689508,ENSG00000100429,ENSP00000407640,22,MGTALVYHEDMTATRLLWDDPECEIERPERLTAALDRLRQRGLEQR...,pep,50683834,known,-1,ENST00000429374,GRCh37
7,chromosome,50689461,ENSG00000100429,ENSP00000406150,22,MGTALVYHEDMTATRLLWDDPECEIERPERLTAALDRLRQRGLEQR...,pep,50683871,known,-1,ENST00000454936,GRCh37
8,chromosome,50689669,ENSG00000100429,ENSP00000397542,22,MGTALVYHEDMTATRLLWDDPECEIERPERLTAALDRLRQRGLEQR...,pep,50683871,novel,-1,ENST00000448072,GRCh37
9,chromosome,50689461,ENSG00000100429,ENSP00000343540,22,MGTALVYHEDMTATRLLWDDPECEIERPERLTAALDRLRQRGLEQR...,pep,50683936,known,-1,ENST00000349505,GRCh37


### UniProt
Parse UniProt proteome FASTA file into TSV table

In [8]:
# UniProt human proteome
if reprocess or not os.path.exists(paths['uniprotProteome']):
    uniprotProteome = processData.fastaToDF(paths['uniprotProteome_raw'],
                                            save=paths['uniprotProteome'],
                                            headerParser=processData.parseUniProtHeader,
                                            **files.save_kwargs)
else:
    uniprotProteome = pd.read_table(paths['uniprotProteome'])

if show_df:
    display(uniprotProteome)

Unnamed: 0,db,gn,id,os,ox,pe,proteinName,seq,sv,uniprotName
0,tr,DNAJC25-GNG10,A0A024R161,Homo sapiens,9606,3,Guanine nucleotide-binding protein subunit gamma,MGAPLLSPGWGAGAAGRRWWMLLAPLLPALLLVRPAGALVEGLYCG...,1,A0A024R161_HUMAN
1,tr,hCG_2014768,A0A024R1R8,Homo sapiens,9606,4,"HCG2014768, isoform CRA_a",MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAK...,1,A0A024R1R8_HUMAN
2,sp,NUDT4B,A0A024RBG1,Homo sapiens,9606,3,Diphosphoinositol polyphosphate phosphohydrola...,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...,1,NUD4B_HUMAN
3,tr,TRBV20OR9-2,A0A075B6H5,Homo sapiens,9606,4,T cell receptor beta variable 20/OR9-2 (non-fu...,METVVTTLPREGGVGPSRKMLLLLLLLGPGSGLSAVVSQHPSRVIC...,1,A0A075B6H5_HUMAN
4,tr,IGKV3-7,A0A075B6H7,Homo sapiens,9606,1,Immunoglobulin kappa variable 3-7 (non-functio...,MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRAS...,1,A0A075B6H7_HUMAN
5,tr,IGKV1D-42,A0A075B6H8,Homo sapiens,9606,4,Immunoglobulin kappa variable 1D-42 (non-funct...,MDMRVPAQLLGLLLLWLPGVRFDIQMTQSPSFLSASVGDRVSIICW...,6,A0A075B6H8_HUMAN
6,sp,IGLV4-69,A0A075B6H9,Homo sapiens,9606,1,Immunoglobulin lambda variable 4-69,MAWTPLLFLTLLLHCTGSLSQLVLTQSPSASASLGASVKLTCTLSS...,1,LV469_HUMAN
7,sp,IGLV8-61,A0A075B6I0,Homo sapiens,9606,3,Immunoglobulin lambda variable 8-61,MSVPTMAWMMLLLGLLAYGSGVDSQTVVTQEPSFSVSPGGTVTLTC...,7,LV861_HUMAN
8,sp,IGLV4-60,A0A075B6I1,Homo sapiens,9606,3,Immunoglobulin lambda variable 4-60,MAWTPLLLLFPLLLHCTGSLSQPVLTQSSSASASLGSSVKLTCTLS...,1,LV460_HUMAN
9,tr,IGLV11-55,A0A075B6I3,Homo sapiens,9606,4,Immunoglobulin lambda variable 11-55 (non-func...,MALTPLLLLLLSHCTGSLSRPVLTQPPSLSASPGATARLPCTLSSD...,3,A0A075B6I3_HUMAN


### UCSC
UCSC knownCanonical database

In [9]:
if reprocess or not os.path.exists(paths['ucscKnownCanonical38']):
    knownCanonical = pd.read_table(paths['ucscKnownCanonical38_raw'], header=None,
                                   names=['chrom', 'chromStart', 'chromEnd', 'clusterId', 'transcript', 'gene'])
    # only keep genes from standard chromosomes
    with open(paths['standardChromosomes'], mode='r') as f:
        stdchr = [s.strip() for s in f.readlines()]
    knownCanonical = knownCanonical[knownCanonical['chrom'].isin(stdchr)]
    
    # drop versioning from transcript and gene IDs
    knownCanonical['gene'] = knownCanonical['gene'].str.split('.').str[0]
    knownCanonical['transcript'] = knownCanonical['transcript'].str.split('.').str[0]
    knownCanonical.to_csv(paths['ucscKnownCanonical38'], **files.save_kwargs)
else:
    knownCanonical = pd.read_table(paths['ucscKnownCanonical38'])

if show_df:
    display(knownCanonical)

Unnamed: 0,chrom,chromStart,chromEnd,clusterId,transcript,gene
0,chrX,100628669,100636806,1,ENST00000373020,ENSG00000000003
1,chrX,100584801,100599885,2,ENST00000373031,ENSG00000000005
2,chr20,50934866,50958550,3,ENST00000371588,ENSG00000000419
3,chr1,169853073,169893959,4,ENST00000367772,ENSG00000000457
4,chr1,169795048,169854080,5,ENST00000359326,ENSG00000000460
5,chr1,27612063,27635277,6,ENST00000374005,ENSG00000000938
6,chr1,196651877,196747504,7,ENST00000367429,ENSG00000000971
7,chr6,143494810,143511690,8,ENST00000002165,ENSG00000001036
8,chr6,53497340,53545129,9,ENST00000229416,ENSG00000001084
9,chr6,41072944,41099976,10,ENST00000341376,ENSG00000001167


### D2P2
Proteome and disorder data.

In [10]:
if reprocess or not os.path.exists(paths['d2p2ProteomeHuman']):
    !rm data/ref_proc/d2p2/human.protein.tsv.gz
    !zcat data/ref_raw/d2p2/protein/genomes.protein.gz | grep "^hs" > data/ref_proc/d2p2/human.protein.tsv
    !gzip data/ref_proc/d2p2/human.protein.tsv
    d2p2ProteomeHuman = pd.read_table(
        paths['d2p2ProteomeHuman'], sep='\s|:', header=None,
        usecols=['species', 'ensembl_peptide_id', 'd2p2_id', 'seqtype', 'status',
                 'coord_system', 'version', 'name', 'start', 'end', 'strand',
                 'gene', 'transcript'],
        names = ['species', 'ensembl_peptide_id', 'd2p2_id', 'seqtype', 'status',
                 'coord_system', 'version', 'name', 'start', 'end', 'strand',
                 'drop1', 'gene', 'drop2', 'transcript'],
        engine='python')
    d2p2ProteomeHuman.to_csv(paths['d2p2ProteomeHuman'], **files.save_kwargs)
else:
    d2p2ProteomeHuman = pd.read_table(paths['d2p2ProteomeHuman'])

if show_df:
    display(d2p2ProteomeHuman)

Unnamed: 0,species,ensembl_peptide_id,d2p2_id,seqtype,status,coord_system,version,name,start,end,strand,gene,transcript
0,hs,ENSP00000000233,3185902,pep,known,chromosome,GRCh37,7,127228399,127231759,1,ENSG00000004059,ENST00000000233
1,hs,ENSP00000000412,3147428,pep,known,chromosome,GRCh37,12,9092961,9102551,-1,ENSG00000003056,ENST00000000412
2,hs,ENSP00000000442,3163426,pep,known,chromosome,GRCh37,11,64073050,64084210,1,ENSG00000173153,ENST00000000442
3,hs,ENSP00000001008,3145028,pep,known,chromosome,GRCh37,12,2904119,2913124,1,ENSG00000004478,ENST00000001008
4,hs,ENSP00000001146,3204608,pep,known,chromosome,GRCh37,2,72356367,72375167,-1,ENSG00000003137,ENST00000001146
5,hs,ENSP00000002125,3203443,pep,known,chromosome,GRCh37,2,37458809,37476303,1,ENSG00000003509,ENST00000002125
6,hs,ENSP00000002165,3193587,pep,known,chromosome,GRCh37,6,143816614,143832827,-1,ENSG00000001036,ENST00000002165
7,hs,ENSP00000002501,3173989,pep,novel,chromosome,GRCh37,16,90071273,90085881,-1,ENSG00000003249,ENST00000002501
8,hs,ENSP00000002596,3159414,pep,known,chromosome,GRCh37,4,11394774,11431389,-1,ENSG00000002587,ENST00000002596
9,hs,ENSP00000002829,3186592,pep,known,chromosome,GRCh37,3,50192562,50226508,1,ENSG00000001617,ENST00000002829


In [11]:
if reprocess or not os.path.exists(paths['d2p2DisorderHuman']):
    # create temporary "proteome" with added width information
    tmpProteome = ensemblProteome63[['id']].rename({'id': 'ensembl_peptide_id'}, axis='columns')
    tmpProteome['width'] = list(map(len, ensemblProteome63['seq']))
    assert(sum(tmpProteome['ensembl_peptide_id'].duplicated() == 0))
    tmpProteome = d2p2ProteomeHuman.merge(tmpProteome, how='left', on='ensembl_peptide_id')
    assert(sum(tmpProteome.drop_duplicates(['width', 'd2p2_id']).duplicated('d2p2_id')) == 0)

    human_d2p2_ids = set(d2p2ProteomeHuman['d2p2_id'])
    d2p2DisorderAll = pd.read_table(paths['d2p2Disorder_raw'], header=None,
                                    names=['d2p2_id', 'start', 'end', 'predictor_id', 'gi'])
    d2p2DisorderHuman = d2p2DisorderAll.loc[d2p2DisorderAll['d2p2_id'].isin(human_d2p2_ids), :]
    d2p2DisorderHuman = processData.keepValidDisorder(d2p2DisorderHuman, tmpProteome, makeCopy=True)
    d2p2DisorderHuman.to_csv(paths['d2p2DisorderHuman'], **files.save_kwargs)
else:
    d2p2DisorderHuman = pd.read_table(paths['d2p2DisorderHuman'])

if show_df:
    display(d2p2DisorderHuman)

Unnamed: 0,d2p2_id,start,end,predictor_id,gi
0,3143913,110,111,6,237203001
1,3143913,111,120,1,40583388
2,3143913,1,11,3,116090459
3,3143913,115,120,2,92522826
4,3143913,115,120,6,237203002
5,3143913,1,17,6,237202997
6,3143913,1,5,2,92522824
7,3143913,37,41,2,92522825
8,3143913,37,43,3,116090460
9,3143913,82,82,6,237202998


In [12]:
if reprocess or not os.path.exists(paths['d2p2DisorderHuman_vlxt']):
    d2p2DisorderHuman_vlxt = d2p2DisorderHuman[d2p2DisorderHuman['predictor_id'] == processData.predictorNameToId['VLXT']]
    d2p2DisorderHuman_vlxt.to_csv(paths['d2p2DisorderHuman_vlxt'], **files.save_kwargs)
else:
    d2p2DisorderHuman_vlxt = pd.read_table(paths['d2p2DisorderHuman_vlxt'])

if show_df:
    display(d2p2DisorderHuman_vlxt)

Unnamed: 0,d2p2_id,start,end,predictor_id,gi
0,3143913,111,120,1,40583388
1,3143916,10,26,1,42810253
2,3143916,104,109,1,42810255
3,3143916,116,123,1,42810256
4,3143916,1,2,1,42810252
5,3143916,86,94,1,42810254
6,3143917,105,144,1,45037189
7,3143917,1,18,1,45037187
8,3143917,28,38,1,45037188
9,3143918,1,2,1,47264003


In [13]:
if reprocess or not os.path.exists(paths['d2p2DisorderHuman_vsl2b']):
    d2p2DisorderHuman_vsl2b = d2p2DisorderHuman[d2p2DisorderHuman['predictor_id'] == processData.predictorNameToId['VSL2b']]
    d2p2DisorderHuman_vsl2b.to_csv(paths['d2p2DisorderHuman_vsl2b'], **files.save_kwargs)
else:
    d2p2DisorderHuman_vsl2b = pd.read_table(paths['d2p2DisorderHuman_vsl2b'])

if show_df:
    display(d2p2DisorderHuman_vsl2b)

Unnamed: 0,d2p2_id,start,end,predictor_id,gi
0,3143913,115,120,2,92522826
1,3143913,1,5,2,92522824
2,3143913,37,41,2,92522825
3,3143916,1,36,2,94675034
4,3143916,82,129,2,94675035
5,3143917,1,52,2,96824850
6,3143917,99,168,2,96824851
7,3143918,146,149,2,98974477
8,3143918,1,7,2,98974475
9,3143918,67,73,2,98974476


### Create mapping between UniProt Accession IDs, and Ensembl Peptide IDs, HGNC Gene Symbols, and D2P2 IDs

#### Create 1:1 mapping between canonical UniProt Accession IDs, canonical Ensembl Peptide IDs, and HGNC Gene Symbols

Sources
- BioMart: download Ensembl (gene, transcript, protein) <-> UniProt <-> HGNC symbol + APPRIS annotation
  - See processData.Rmd
- UCSC knownCanonical
- UniProt reference proteome

Filtering
1. Subset BioMart by knownCanonical transcripts
2. Inner join between UniProt reference proteome and BioMart
3. Heuristics
   - Keep only manually curated SwissProt (as opposed to TrEMBL) entries
   - Drop any entry lacking any (UniProt or HGNC) gene symbol/name
   - Among entries whose UniProt sequence and Ensembl sequences are the same, arbitrarily keep one entry
   - Fill missing (null) HGNC symbols with UniProt gene names
   - Among entries with 3 of the 4 following entries matching, arbitrarily keep one entry:
     - UniProt ID
     - Ensembl IDs
     - HGNC symbol
     - UniProt sequence --- may need to first do outer join with Ensembl proteome?
4. Manual curation

In [14]:
biomart_raw = pd.read_table(paths['ensemblBiomart_raw'])
biomart = biomart_raw[biomart_raw['ensembl_transcript_id'].isin(set(knownCanonical['transcript']))]
if show_df:
    display(biomart)

Unnamed: 0,ensembl_gene_id,ensembl_transcript_id,ensembl_peptide_id,uniprotswissprot,hgnc_symbol,transcript_appris
0,ENSG00000173598,ENST00000337179,ENSP00000338352,A0A024RBG1,NUDT4,principal3
1,ENSG00000177144,ENST00000322209,ENSP00000492425,A0A024RBG1,NUDT4B,principal1
2,ENSG00000211637,ENST00000390282,ENSP00000374817,A0A075B6H9,IGLV4-69,principal1
3,ENSG00000211638,ENST00000390283,ENSP00000374818,A0A075B6I0,IGLV8-61,principal1
4,ENSG00000211639,ENST00000390284,ENSP00000374819,A0A075B6I1,IGLV4-60,principal1
5,ENSG00000211642,ENST00000390287,ENSP00000374822,A0A075B6I4,IGLV10-54,principal5
6,ENSG00000211649,ENST00000390295,ENSP00000374830,A0A075B6I9,IGLV7-46,principal1
7,ENSG00000211654,ENST00000390300,ENSP00000374835,A0A075B6J1,IGLV5-37,principal1
9,ENSG00000211661,ENST00000390307,ENSP00000374842,A0A075B6J6,IGLV3-22,principal1
10,ENSG00000211664,ENST00000390310,ENSP00000374845,A0A075B6J9,IGLV2-18,principal1


In [15]:
idMap = uniprotProteome.merge(biomart, how='inner', left_on='id', right_on='uniprotswissprot')
idMap.rename({'id': 'uniprot_id'}, axis='columns', inplace=True)
idMap = processData.removeDups(idMap, ensemblProteome)
idMap = processData.manualMap(idMap, uniprotProteome=uniprotProteome, ensemblProteome=ensemblProteome)

Number of remaining duplicates:
UniProt IDs: 0
Ensembl Peptide IDs: 0
HGNC symbols: 0
Sequences: 76


In [16]:
idMap.reset_index(drop=True, inplace=True)
idMap = idMap[['uniprot_id', 'ensembl_gene_id', 'ensembl_transcript_id', 'ensembl_peptide_id', 'hgnc_symbol', 'proteinName', 'seq']]
if show_df:
    display(idMap)

Unnamed: 0,uniprot_id,ensembl_gene_id,ensembl_transcript_id,ensembl_peptide_id,hgnc_symbol,proteinName,seq
0,A0A024RBG1,ENSG00000177144,ENST00000322209,ENSP00000492425,NUDT4B,Diphosphoinositol polyphosphate phosphohydrola...,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...
1,A0A075B6H9,ENSG00000211637,ENST00000390282,ENSP00000374817,IGLV4-69,Immunoglobulin lambda variable 4-69,MAWTPLLFLTLLLHCTGSLSQLVLTQSPSASASLGASVKLTCTLSS...
2,A0A075B6I0,ENSG00000211638,ENST00000390283,ENSP00000374818,IGLV8-61,Immunoglobulin lambda variable 8-61,MSVPTMAWMMLLLGLLAYGSGVDSQTVVTQEPSFSVSPGGTVTLTC...
3,A0A075B6I1,ENSG00000211639,ENST00000390284,ENSP00000374819,IGLV4-60,Immunoglobulin lambda variable 4-60,MAWTPLLLLFPLLLHCTGSLSQPVLTQSSSASASLGSSVKLTCTLS...
4,A0A075B6I4,ENSG00000211642,ENST00000390287,ENSP00000374822,IGLV10-54,Immunoglobulin lambda variable 10-54,MPWALLLLTLLTHSAVSVVQAGLTQPPSVSKGLRQTATLTCTGNSN...
5,A0A075B6I9,ENSG00000211649,ENST00000390295,ENSP00000374830,IGLV7-46,Immunoglobulin lambda variable 7-46,MAWTPLFLFLLTCCPGSNSQAVVTQEPSLTVSPGGTVTLTCGSSTG...
6,A0A075B6J1,ENSG00000211654,ENST00000390300,ENSP00000374835,IGLV5-37,Immunoglobulin lambda variable 5-37,MAWTPLLLLLLSHCTGSLSQPVLTQPPSSSASPGESARLTCTLPSD...
7,A0A075B6J6,ENSG00000211661,ENST00000390307,ENSP00000374842,IGLV3-22,Immunoglobulin lambda variable 3-22,MAWATLLLPLLNLYTGSVASYELTQLPSVSVSPGQTARITCSGDVL...
8,A0A075B6J9,ENSG00000211664,ENST00000390310,ENSP00000374845,IGLV2-18,Immunoglobulin lambda variable 2-18,MAWALLLLTLLTQGTGSWAQSALTQPPSVSGSPGQSVTISCTGTSS...
9,A0A075B6K0,ENSG00000211665,ENST00000390311,ENSP00000374846,IGLV3-16,Immunoglobulin lambda variable 3-16,MAWIPLLLPLLTLCTGSEASYELTQPPSVSVSLGQMARITCSGEAL...


#### Merge in D2P2 IDs

2 ways to merge in D2P2 IDs:
- by UniProt ID via `d2p2IdMap`
  - The D2P2 ID Map is a 1:many mapping D2P2 IDs to UniProt IDs.
  ```
  d2p2IdMap = pd.read_table(paths['d2p2IdMap_raw'], header=None, names=['d2p2_id', 'uniprot_id'])
  idMap = idMap.merge(d2p2IdMap, how='left', on='uniprot_id')
  ```
- by Ensembl Peptide ID via `d2p2ProteomeHuman`
  - The D2P2 proteome is a 1:many mapping for D2P2 IDs to Ensembl Peptide IDs.
  ```
  idMap = idMap.merge(d2p2ProteomeHuman[['ensembl_peptide_id', 'd2p2_id']], how='left', on='ensembl_peptide_id')
  ```
- Neither method is complete, so I combine both.
  - Order matters. The more reliable mapping should be merged first.

Notes
- Possibly 1:many mapping between D2P2 IDs and other genome/proteome database IDs
  - Should not affect downstream usage
  - May be due to different genome versions: D2P2 uses GRCh37.63, wherease the above mapping was generated using GRCh38
  - D2P2 lacks IDs for several proteins, resulting in null (NaN) D2P2 IDs
  - Handful of duplicate D2P2 ID mappings to proteins with identical sequences
  ```
  idMap[(~idMap['d2p2_id'].isnull()) & (idMap['d2p2_id'].duplicated(keep=False))]
  ```
  - Easily trimmed to 1:1 mapping by dropping duplicates:
  ```
  idMap.drop(idMap.index[idMap['d2p2_id'].isnull()], axis='index', inplace=True)
  idMap.drop_duplicates(subset='d2p2_id', keep='first', inplace=True)
  ```
- Because the merge above allows NaNs to be introduced (i.e., where a D2P2 ID does not exist for a Uniprot ID or Ensembl Peptide ID), the `'d2p2_id'` column will have `dtype(np.float64)` instead of the original `dtype(np.int64)` in either `d2p2IdMap` or `d2p2ProteomeHuman`. If NaNs are removed (e.g., if `idMap` is trimmed to a 1:1 mapping), the D2P2 ID column can be converted to `dtype(np.int64)`:
  ```
  idMap = idMap.astype({'d2p2_id': np.int64})
  ```

In [17]:
d2p2IdMap = pd.read_table(paths['d2p2IdMap_raw'], header=None, names=['d2p2_id', 'uniprot_id'])

In [18]:
# merge in D2P2 IDs from the D2P2 proteome by matching Ensembl Peptide IDs (should be unique in both DataFrames)
idMap = idMap.merge(d2p2ProteomeHuman[['ensembl_peptide_id', 'd2p2_id']], how='left', on='ensembl_peptide_id')

# for remaining unknown (null) D2P2 IDs, merge in D2P2 IDs from the UniProt ID <-> D2P2 ID map provided by D2P2 by matching UniProt IDs
tmp = pd.merge(idMap.loc[idMap['d2p2_id'].isnull(), ['uniprot_id']], d2p2IdMap, how='left', on='uniprot_id')
tmp = tmp.set_index('uniprot_id').squeeze()
idMap.loc[idMap['d2p2_id'].isnull(), 'd2p2_id'] = tmp.loc[idMap.loc[idMap['d2p2_id'].isnull(), 'uniprot_id']].values
idMap = idMap[['uniprot_id', 'ensembl_gene_id', 'ensembl_transcript_id', 'ensembl_peptide_id', 'hgnc_symbol', 'd2p2_id', 'proteinName', 'seq']]

In [23]:
idMap = idMap.sort_values(by='hgnc_symbol', axis='index').reset_index(drop=True)
idMap.to_csv(paths['idMap'], **files.save_kwargs)
if show_df:
    display(idMap)

Unnamed: 0,uniprot_id,ensembl_gene_id,ensembl_transcript_id,ensembl_peptide_id,hgnc_symbol,d2p2_id,proteinName,seq
0,P04217,ENSG00000121410,ENST00000263100,ENSP00000263100,A1BG,3192101.0,Alpha-1B-glycoprotein,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...
1,Q9NQ94,ENSG00000148584,ENST00000373993,ENSP00000363105,A1CF,3173601.0,APOBEC1 complementation factor,MESNHKSGDGLSGTQKEAALRALVQRTGYSLVQENGQRKYGGPPPG...
2,P01023,ENSG00000175899,ENST00000318602,ENSP00000323929,A2M,3147452.0,Alpha-2-macroglobulin,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...
3,A8K2U0,ENSG00000166535,ENST00000299698,ENSP00000299698,A2ML1,3147398.0,Alpha-2-macroglobulin-like protein 1,MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...
4,U3KPV4,ENSG00000184389,ENST00000442999,ENSP00000475261,A3GALT2,,"Alpha-1,3-galactosyltransferase 2",MALKEGLRAWKRIFWRQILLTLGLLGLFLYGLPKFRHLEALIPMGV...
5,Q9NPC4,ENSG00000128274,ENST00000642412,ENSP00000494127,A4GALT,3185412.0,Lactosylceramide 4-alpha-galactosyltransferase,MSKPPDLLLRLLRGAPRQRVCTLFIIGFKFTFFVSIMIYWHVVGEP...
6,Q9UNA3,ENSG00000118017,ENST00000236709,ENSP00000236709,A4GNT,3194070.0,"Alpha-1,4-N-acetylglucosaminyltransferase",MRKELQLSLSVTLLLVCGFLYQFTLKSSCLFCLPSFKSHQGLEALL...
7,Q9NRG9,ENSG00000094914,ENST00000209873,ENSP00000209873,AAAS,3158085.0,Aladin,MCSLGLFPPPPPRGQVTLYEHNNELVTGSSYESPPPDFRGQWINLP...
8,Q86V21,ENSG00000081760,ENST00000316519,ENSP00000324842,AACS,3183097.0,Acetoacetyl-CoA synthetase,MSKEERPGREEILECQVMWEPDSKKNTQMDRFRAAVGAACGLALES...
9,P22760,ENSG00000114771,ENST00000232892,ENSP00000232892,AADAC,3194835.0,Arylacetamide deacetylase,MGRKSLYLLIVGILIAYYIYTPLPDNVEEPWRMMWINAHLKTIQNL...


## Gene Lists

Reference data

In [3]:
idMap = pd.read_table(paths['idMap'])
idMap.drop(idMap.index[idMap['d2p2_id'].isnull()], axis='index', inplace=True)
idMap.drop_duplicates(subset='d2p2_id', keep='first', inplace=True)
idMap = idMap.astype({'d2p2_id': np.int64})
if show_df:
    display(idMap)

Unnamed: 0,uniprot_id,ensembl_gene_id,ensembl_transcript_id,ensembl_peptide_id,hgnc_symbol,d2p2_id,proteinName,seq
0,P04217,ENSG00000121410,ENST00000263100,ENSP00000263100,A1BG,3192101,Alpha-1B-glycoprotein,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...
1,Q9NQ94,ENSG00000148584,ENST00000373993,ENSP00000363105,A1CF,3173601,APOBEC1 complementation factor,MESNHKSGDGLSGTQKEAALRALVQRTGYSLVQENGQRKYGGPPPG...
2,P01023,ENSG00000175899,ENST00000318602,ENSP00000323929,A2M,3147452,Alpha-2-macroglobulin,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...
3,A8K2U0,ENSG00000166535,ENST00000299698,ENSP00000299698,A2ML1,3147398,Alpha-2-macroglobulin-like protein 1,MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...
5,Q9NPC4,ENSG00000128274,ENST00000642412,ENSP00000494127,A4GALT,3185412,Lactosylceramide 4-alpha-galactosyltransferase,MSKPPDLLLRLLRGAPRQRVCTLFIIGFKFTFFVSIMIYWHVVGEP...
6,Q9UNA3,ENSG00000118017,ENST00000236709,ENSP00000236709,A4GNT,3194070,"Alpha-1,4-N-acetylglucosaminyltransferase",MRKELQLSLSVTLLLVCGFLYQFTLKSSCLFCLPSFKSHQGLEALL...
7,Q9NRG9,ENSG00000094914,ENST00000209873,ENSP00000209873,AAAS,3158085,Aladin,MCSLGLFPPPPPRGQVTLYEHNNELVTGSSYESPPPDFRGQWINLP...
8,Q86V21,ENSG00000081760,ENST00000316519,ENSP00000324842,AACS,3183097,Acetoacetyl-CoA synthetase,MSKEERPGREEILECQVMWEPDSKKNTQMDRFRAAVGAACGLALES...
9,P22760,ENSG00000114771,ENST00000232892,ENSP00000232892,AADAC,3194835,Arylacetamide deacetylase,MGRKSLYLLIVGILIAYYIYTPLPDNVEEPWRMMWINAHLKTIQNL...
10,Q6P093,ENSG00000197953,ENST00000356517,ENSP00000348911,AADACL2,3194828,Arylacetamide deacetylase-like 2,MGLKALCLGLLCVLFVSHFYTPMPDNIEESWKIMALDAIAKTCTFT...


In [25]:
gl_QuickGO_raw = pd.read_table(paths['gl_QuickGO_raw']) \
    .rename({'GENE PRODUCT ID': 'uniprot_id', 'SYMBOL': 'hgnc_symbol', 'GO TERM': 'goId'}, axis='columns')

Gene lists based on HGNC gene groups

In [26]:
hgncGeneGroups = pd.read_table(paths['hgncGeneGroups'])
if show_df:
    display(hgncGeneGroups)

Unnamed: 0,HGNC ID,Approved symbol,Approved name,Status,Locus type,Previous symbols,Synonyms,Chromosome,NCBI Gene ID,Ensembl gene ID,Vega gene ID,Group ID,Group name
0,HGNC:24500,VIRMA,vir like m6A methyltransferase associated,Approved,gene with protein product,KIAA1429,"DKFZP434I116, fSAP121",8q22.1,25962.0,ENSG00000164944,OTTHUMG00000164426,1419,m6A methyltransferase complex
1,HGNC:29330,METTL14,methyltransferase like 14,Approved,gene with protein product,,KIAA1627,4q26,57721.0,ENSG00000145388,OTTHUMG00000161167,1419,m6A methyltransferase complex
2,HGNC:17563,METTL3,methyltransferase like 3,Approved,gene with protein product,,"Spo8, M6A, MT-A70",14q11.2,56339.0,ENSG00000165819,OTTHUMG00000168825,1419,m6A methyltransferase complex
3,HGNC:16846,WTAP,WT1 associated protein,Approved,gene with protein product,,"KIAA0105, MGC3925, Mum2",6q25.3,9589.0,ENSG00000146457,OTTHUMG00000015933,1419,m6A methyltransferase complex
4,HGNC:324,AGPAT1,1-acylglycerol-3-phosphate O-acyltransferase 1,Approved,gene with protein product,,LPAAT-alpha,6p21.32,10554.0,ENSG00000204310,OTTHUMG00000031210,46,1-acylglycerol-3-phosphate O-acyltransferases
5,HGNC:25718,LPCAT1,lysophosphatidylcholine acyltransferase 1,Approved,gene with protein product,AYTL2,"FLJ12443, AGPAT9, AGPAT10",5p15.33,79888.0,ENSG00000153395,OTTHUMG00000131017,46,1-acylglycerol-3-phosphate O-acyltransferases
6,HGNC:26032,LPCAT2,lysophosphatidylcholine acyltransferase 2,Approved,gene with protein product,AYTL1,"FLJ20481, AGPAT11, LysoPAFAT",16q12.2,54947.0,ENSG00000087253,OTTHUMG00000133238,46,1-acylglycerol-3-phosphate O-acyltransferases
7,HGNC:325,AGPAT2,1-acylglycerol-3-phosphate O-acyltransferase 2,Approved,gene with protein product,BSCL,LPAAT-beta,9q34.3,10555.0,ENSG00000169692,OTTHUMG00000020936,46,1-acylglycerol-3-phosphate O-acyltransferases
8,HGNC:326,AGPAT3,1-acylglycerol-3-phosphate O-acyltransferase 3,Approved,gene with protein product,,LPAAT-gamma,21q22.3,56894.0,ENSG00000160216,OTTHUMG00000086892,46,1-acylglycerol-3-phosphate O-acyltransferases
9,HGNC:20885,AGPAT4,1-acylglycerol-3-phosphate O-acyltransferase 4,Approved,gene with protein product,,"LPAAT-delta, dJ473J16.2",6q26,56895.0,ENSG00000026652,OTTHUMG00000015966,46,1-acylglycerol-3-phosphate O-acyltransferases


In [27]:
gl_GTFs = hgncGeneGroups[hgncGeneGroups['Group name'].str.contains('General transcription factors', case=False, regex=False)]
gl_GTFs = gl_GTFs.merge(idMap[['uniprot_id', 'hgnc_symbol', 'd2p2_id']], how='left', left_on='Approved symbol', right_on='hgnc_symbol')
gl_GTFs = gl_GTFs[['uniprot_id', 'hgnc_symbol', 'd2p2_id']].dropna(axis='index', how='any')
gl_GTFs.to_csv(paths['gl_GTFs'], **files.save_kwargs)
if show_df:
    display(gl_GTFs)

Unnamed: 0,uniprot_id,hgnc_symbol,d2p2_id
0,P52655,GTF2A1,3203415.0
1,P52657,GTF2A2,3153355.0
2,Q00403,GTF2B,3197832.0
3,P20226,TBP,26712063.0
4,P29083,GTF2E1,3192485.0
5,P29084,GTF2E2,3147479.0
6,P35269,GTF2F1,3151317.0
7,P13984,GTF2F2,3148596.0
8,P32780,GTF2H1,3151013.0
9,Q13888,GTF2H2,3164916.0


In [28]:
gl_medComplex = hgncGeneGroups[hgncGeneGroups['Group name'].str.contains('mediator complex', case=False, regex=False)]
gl_medComplex = gl_medComplex.merge(idMap[['uniprot_id', 'hgnc_symbol', 'd2p2_id']], how='left', left_on='Approved symbol', right_on='hgnc_symbol')
gl_medComplex = gl_medComplex[['uniprot_id', 'hgnc_symbol', 'd2p2_id']].dropna(axis='index', how='any')
gl_medComplex.to_csv(paths['gl_medComplex'], **files.save_kwargs)
if show_df:
    display(gl_medComplex)

Unnamed: 0,uniprot_id,hgnc_symbol,d2p2_id
0,P24863,CCNC,3191319
1,Q9BWU1,CDK19,3191972
2,P49336,CDK8,3146826
3,Q15648,MED1,3166161
4,Q9BTT4,MED10,3156671
5,Q9P086,MED11,3152668
6,Q93074,MED12,3161888
7,Q86YW9,MED12L,3194805
8,Q9UHV7,MED13,3181091
9,Q71F56,MED13L,3178720


In [29]:
gl_POLR = hgncGeneGroups[hgncGeneGroups['Group name'].str.contains('RNA polymerase subunits', case=False, regex=False)]
gl_POLR = gl_POLR.merge(idMap[['uniprot_id', 'hgnc_symbol', 'd2p2_id']], how='left', left_on='Approved symbol', right_on='hgnc_symbol')
gl_POLR = gl_POLR[['uniprot_id', 'hgnc_symbol', 'd2p2_id']].dropna(axis='index', how='any')
gl_POLR.to_csv(paths['gl_POLR'], **files.save_kwargs)
if show_df:
    display(gl_POLR)

Unnamed: 0,uniprot_id,hgnc_symbol,d2p2_id
0,O95602,POLR1A,3204936.0
1,Q9H9Y6,POLR1B,3205507.0
2,O15160,POLR1C,3186068.0
4,Q9GZS1,POLR1E,3156674.0
5,P24928,POLR2A,3154207.0
6,P30876,POLR2B,3165120.0
7,P19387,POLR2C,3164383.0
8,O15514,POLR2D,3205750.0
9,P19388,POLR2E,3148124.0
10,P61218,POLR2F,3182641.0


Gene lists based on QuickGO annotations

In [30]:
gl_QuickGO = gl_QuickGO_raw.drop_duplicates(subset=['uniprot_id', 'hgnc_symbol'])    
gl_QuickGO = gl_QuickGO[['goId', 'uniprot_id', 'hgnc_symbol']]
gl_QuickGO.to_csv(paths['gl_QuickGO'], **files.save_kwargs)
if show_df:
    display(gl_QuickGO)

Unnamed: 0,goId,uniprot_id,hgnc_symbol
0,GO:0006357,A0A087WX78,ELOA3C
1,GO:0000981,A0A0U1RQI7,KLF18
3,GO:0000978,A0A1B0GTS1,HSFX4
7,GO:0000122,A0A1B0GVZ6,MBD3L2B
8,GO:0000978,A0A1B0GWH4,HSFX3
13,GO:0000978,A0AVK6,E2F8
32,GO:0003712,A0JLT2,MED19
37,GO:0000978,A0PJY2,FEZF1
41,GO:0000981,A1YPR0,ZBTB7C
43,GO:0000981,A2RRD8,ZNF320


In [31]:
gl_TFs = gl_QuickGO_raw[gl_QuickGO_raw['goId'] == 'GO:0003700'].drop_duplicates(subset='uniprot_id')
gl_TFs = gl_TFs.merge(idMap[['uniprot_id', 'hgnc_symbol', 'd2p2_id']], how='left', on=['uniprot_id', 'hgnc_symbol'])
gl_TFs = gl_TFs[['uniprot_id', 'hgnc_symbol', 'd2p2_id']].dropna(axis='index', how='any').reset_index(drop=True)
gl_TFs.to_csv(paths['gl_TFs'], **files.save_kwargs)
if show_df:
    display(gl_TFs)

Unnamed: 0,uniprot_id,hgnc_symbol,d2p2_id
0,A0AVK6,E2F8,3151408.0
1,A6NCS4,NKX2-6,3146819.0
2,A6NFD8,HELT,3183278.0
3,A6NGD5,ZSCAN5C,3191139.0
4,A6NI15,MSGN1,3202570.0
5,A6NJL1,ZSCAN5B,3191112.0
6,A6NK53,ZNF233,3179611.0
7,O00268,TAF4,3166818.0
8,O00287,RFXAP,3147777.0
9,O00321,ETV2,3168864.0


In [12]:
if not os.path.exists(paths['gl_activator']):
    gl_activator = processData.getQuickGO('GO:0001228') \
      .rename({'GENE PRODUCT ID': 'uniprot_id', 'SYMBOL': 'hgnc_symbol', 'GO TERM': 'goId'}, axis='columns') \
      .drop_duplicates(subset='uniprot_id')[['uniprot_id', 'hgnc_symbol']] \
      .merge(idMap[['uniprot_id', 'hgnc_symbol', 'd2p2_id']], how='left', on=['uniprot_id', 'hgnc_symbol']) \
      .dropna(axis='index', how='any') \
      .reset_index(drop=True) \
      .astype({'d2p2_id': 'int32'})
    gl_activator.to_csv(paths['gl_activator'], **files.save_kwargs)
if show_df:
    display(gl_activator)

numberOfHits: 565, {'resultsPerPage': 100, 'current': 1, 'total': 6}
Switching to 'download' API...


Unnamed: 0,uniprot_id,hgnc_symbol,d2p2_id
0,A6NCS4,NKX2-6,3146819
1,A6NI15,MSGN1,3202570
2,A6NJ46,NKX6-3,3149032
3,A6NKF2,ARID3C,3155161
4,O00321,ETV2,3168864
5,O00327,ARNTL,3150119
6,O00470,MEIS1,3204317
7,O00482,NR5A2,3200274
8,O00570,SOX1,3151707
9,O00712,NFIB,3153278


In [13]:
if not os.path.exists(paths['gl_repressor']):
    gl_repressor = processData.getQuickGO('GO:0001227') \
      .rename({'GENE PRODUCT ID': 'uniprot_id', 'SYMBOL': 'hgnc_symbol', 'GO TERM': 'goId'}, axis='columns') \
      .drop_duplicates(subset='uniprot_id')[['uniprot_id', 'hgnc_symbol']] \
      .merge(idMap[['uniprot_id', 'hgnc_symbol', 'd2p2_id']], how='left', on=['uniprot_id', 'hgnc_symbol']) \
      .dropna(axis='index', how='any') \
      .reset_index(drop=True) \
      .astype({'d2p2_id': 'int32'})
    gl_repressor.to_csv(paths['gl_repressor'], **files.save_kwargs)
if show_df:
    display(gl_repressor)

numberOfHits: 298, {'resultsPerPage': 100, 'current': 1, 'total': 3}
Switching to 'download' API...


Unnamed: 0,uniprot_id,hgnc_symbol,d2p2_id
0,A0AVK6,E2F8,3151408
1,A0PJY2,FEZF1,3185428
2,A6NFD8,HELT,3183278
3,O14503,BHLHE40,3169926
4,O14529,CUX2,3176904
5,O14753,OVOL1,3166539
6,O14978,ZNF263,3152517
7,O15062,ZBTB5,3156668
8,O15090,ZNF536,3166372
9,O15119,TBX3,3178662


Other gene lists

In [32]:
gl_interest = pd.read_table(paths['gl_interest_raw'], header=None, names=['hgnc_symbol'])
gl_interest = gl_interest.merge(idMap[['uniprot_id', 'hgnc_symbol', 'd2p2_id']], how='left', on='hgnc_symbol')
gl_interest = gl_interest[['uniprot_id', 'hgnc_symbol', 'd2p2_id']].dropna(axis='index', how='any')
gl_interest.to_csv(paths['gl_interest'], **files.save_kwargs)
if show_df:
    display(gl_interest)

Unnamed: 0,uniprot_id,hgnc_symbol,d2p2_id
0,P35637,FUS,3161139


In [33]:
gl_random = idMap[['uniprot_id', 'hgnc_symbol', 'd2p2_id']] \
  .drop(idMap.index[idMap['d2p2_id'].isnull()], axis='index') \
  .drop_duplicates(subset='d2p2_id', keep='first') \
  .sample(n=1000, axis='index', random_state=1)
gl_random.to_csv(paths['gl_random'], **files.save_kwargs)
if show_df:
    display(gl_random)

Unnamed: 0,uniprot_id,hgnc_symbol,d2p2_id
6172,Q08378,GOLGA3,3184221
8787,P61626,LYZ,3166561
8534,Q9C0E8,LNPK,3206549
11920,O00469,PLOD2,3194497
4873,Q4G0M1,ERFE,3208424
11847,Q9NQ66,PLCB1,3152752
7931,Q96L93,KIF16B,3153366
4509,Q9BY08,EBPL,3149165
10905,Q9Y5P1,OR51B2,3148433
6890,P17509,HOXB6,3175463


Process `gl_mediatorTFs` in R - see processDataR.ipynb