In [1]:
# Enzymes: MFO 'Catalytic activity', GO:0003824 
#         OR
#         any EC term

# Metabolic Enzymes: BPO 'primary metabolic process', GO:0044238 
#                   AND 
#           MFO 'Catalytic activity', GO:0003824 OR any EC term 

# Kinases: MFO 'protein kinase activity', GO:0004672 
#         OR 
#         EC: 2.7.10/11/12/13/99.-
#         OR 
#         text search "protein kinase" in Uniprot protein name field

# TFs: MFO GO:0044212 transcription regulatory region DNA binding 
#     OR 
#     BPO GO:0006355 regulation of transcription, DNA-templated
#     OR
#     text search "transcription factor" in Uniprot protein name field

# Nucleic-Acid Binding Proteins: MFO GO:0003676 nucleic-acid binding

# Signaling Proteins: BPO GO:0023052 signaling

In [2]:
import pandas as pd
import numpy as np

In [3]:
# load genelist of genes for matrix to be created 

Genelist = pd.read_csv('Genelist.csv', names = ['Genes'])

Genelist

Unnamed: 0,Genes
0,AADAC
1,AARS
2,AASDHPPT
3,AASS
4,ABCB5
...,...
500,YRDC
501,YWHAG
502,ZIC1
503,ZIC4


In [4]:
# load uniprot data of all genes and all GO terms 
# use the uniprot frame to add on gene conditions and use the uniprot_start frame as a dummy blank frame

Uniprot = pd.read_csv('uniprot_20200225_key.csv', names = ['Gene_Names', 'GO'])
Uniprot_start = pd.read_csv('uniprot_20200225_key.csv', names = ['Gene_Names', 'GO'])

Uniprot_start

Unnamed: 0,Gene_Names,GO
0,Gene names (primary ),Gene ontology (GO)
1,PLAA,cell [GO:0005623]; cell junction [GO:0030054];...
2,NACC1,cell junction [GO:0030054]; cytoplasm [GO:0005...
3,MTSS1,actin cytoskeleton [GO:0015629]; cytoplasm [GO...
4,PNPLA8,endoplasmic reticulum membrane [GO:0005789]; G...
...,...,...
20345,GPRC5B,cell surface [GO:0009986]; cytoplasmic vesicle...
20346,DNAJB12,endoplasmic reticulum [GO:0005783]; endoplasmi...
20347,GCNT3,extracellular exosome [GO:0070062]; Golgi memb...
20348,DLEC1,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...


In [5]:
Uniprot = pd.read_csv('uniprot_20200225_key.csv', names = ['Gene_Names', 'GO'])

In [6]:
# Enzymes: MFO 'Catalytic activity', GO:0003824 
#         OR
#         any EC term

# first add in MFO Catalytic acivity


Uniprot_Enzyme = Uniprot_start.copy()

infile = open('MFO_catalytic_activity_GO-0003824_descendants.tsv', 'r')

for line in infile: 
    GO_test = str(line[:-1])
    a = Uniprot_Enzyme['GO'].str.contains(GO_test)
    Uniprot_Enzyme[GO_test] = a

###Uniprot_Enzyme

In [7]:
# add in the any EC term 

Any_EC = pd.read_csv('EC_Genes_All.csv', names = ['EC Genes'])

###Any_EC

In [8]:
EC_genes = (Uniprot['Gene_Names']).isin(Any_EC['EC Genes'])

###EC_genes

In [9]:
Uniprot_Enzyme['EC'] = EC_genes

###Uniprot_Enzyme

In [10]:
# filter the matrix down so that it only contains a column of whether any row contains a true value
# note you have to drop Gene_Names and GO otherwise they get counted as true....

Uniprot_Enzyme = Uniprot_Enzyme.drop(['Gene_Names', 'GO'], axis=1)

Enzyme_Cond = Uniprot_Enzyme.any(axis=1, bool_only = True)

In [11]:
Uniprot['Enzyme'] = Enzyme_Cond

In [12]:
Uniprot

Unnamed: 0,Gene_Names,GO,Enzyme
0,Gene names (primary ),Gene ontology (GO),False
1,PLAA,cell [GO:0005623]; cell junction [GO:0030054];...,False
2,NACC1,cell junction [GO:0030054]; cytoplasm [GO:0005...,False
3,MTSS1,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,False
4,PNPLA8,endoplasmic reticulum membrane [GO:0005789]; G...,True
...,...,...,...
20345,GPRC5B,cell surface [GO:0009986]; cytoplasmic vesicle...,False
20346,DNAJB12,endoplasmic reticulum [GO:0005783]; endoplasmi...,False
20347,GCNT3,extracellular exosome [GO:0070062]; Golgi memb...,True
20348,DLEC1,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,False


In [13]:
Uniprot['Enzyme'].sum()

5589

In [14]:
# Metabolic Enzymes: BPO 'primary metabolic process', GO:0044238 
#                   AND 
#           MFO 'Catalytic activity', GO:0003824 OR any EC term 
# Note we split out these two terms into two matrixes then combine them in the end to check both conidtions are true

Uniprot_Metabolic_1 = Uniprot_start.copy()

infile = open('BPO_primary_metabolic_process_GO-0044238_descendants.tsv', 'r')

for line in infile: 
    GO_test = str(line[:-1])
    a = Uniprot_Metabolic_1['GO'].str.contains(GO_test)
    Uniprot_Metabolic_1[GO_test] = a

Uniprot_Metabolic_1

Unnamed: 0,Gene_Names,GO,GO:0000012,GO:0000018,GO:0000019,GO:0045950,GO:0000023,GO:0000024,GO:0000025,GO:0000038,...,GO:2001301,GO:2001302,GO:2001303,GO:2001304,GO:2001306,GO:2001311,GO:2001312,GO:2001313,GO:2001314,GO:2001315
0,Gene names (primary ),Gene ontology (GO),False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,PLAA,cell [GO:0005623]; cell junction [GO:0030054];...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,NACC1,cell junction [GO:0030054]; cytoplasm [GO:0005...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,MTSS1,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,PNPLA8,endoplasmic reticulum membrane [GO:0005789]; G...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20345,GPRC5B,cell surface [GO:0009986]; cytoplasmic vesicle...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
20346,DNAJB12,endoplasmic reticulum [GO:0005783]; endoplasmi...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
20347,GCNT3,extracellular exosome [GO:0070062]; Golgi memb...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
20348,DLEC1,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
Uniprot_Metabolic_1 = Uniprot_Metabolic_1.drop(['Gene_Names', 'GO'], axis=1)

Metabolic_1_Cond = Uniprot_Metabolic_1.any(axis=1, bool_only = True)

In [16]:
Metabolic_1_Cond

0        False
1         True
2         True
3        False
4         True
         ...  
20345     True
20346     True
20347     True
20348    False
20349    False
Length: 20350, dtype: bool

In [17]:
Metabolic_1_Cond.sum()

9809

In [18]:
Uniprot_Metabolic_2 = Uniprot_start.copy()

infile = open('MFO_catalytic_activity_GO-0003824_descendants.tsv', 'r')

for line in infile: 
    GO_test = str(line[:-1])
    a = Uniprot_Metabolic_2['GO'].str.contains(GO_test)
    Uniprot_Metabolic_2[GO_test] = a

Uniprot_Metabolic_2

Unnamed: 0,Gene_Names,GO,GO:0003756,GO:0015036,GO:0000009,GO:0000010,GO:0000014,GO:0000016,GO:0000026,GO:0000030,...,GO:1990883,GO:1990886,GO:1990887,GO:1990888,GO:1990929,GO:1990930,GO:1990931,GO:1990939,GO:1990965,GO:1990984
0,Gene names (primary ),Gene ontology (GO),False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,PLAA,cell [GO:0005623]; cell junction [GO:0030054];...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,NACC1,cell junction [GO:0030054]; cytoplasm [GO:0005...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,MTSS1,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,PNPLA8,endoplasmic reticulum membrane [GO:0005789]; G...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20345,GPRC5B,cell surface [GO:0009986]; cytoplasmic vesicle...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
20346,DNAJB12,endoplasmic reticulum [GO:0005783]; endoplasmi...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
20347,GCNT3,extracellular exosome [GO:0070062]; Golgi memb...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
20348,DLEC1,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [19]:
# add in the any EC term 

Any_EC = pd.read_csv('EC_Genes_All.csv', names = ['EC Genes'])
EC_genes = (Uniprot['Gene_Names']).isin(Any_EC['EC Genes'])
Uniprot_Metabolic_2['EC'] = EC_genes

Uniprot_Metabolic_2

Unnamed: 0,Gene_Names,GO,GO:0003756,GO:0015036,GO:0000009,GO:0000010,GO:0000014,GO:0000016,GO:0000026,GO:0000030,...,GO:1990886,GO:1990887,GO:1990888,GO:1990929,GO:1990930,GO:1990931,GO:1990939,GO:1990965,GO:1990984,EC
0,Gene names (primary ),Gene ontology (GO),False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,PLAA,cell [GO:0005623]; cell junction [GO:0030054];...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,NACC1,cell junction [GO:0030054]; cytoplasm [GO:0005...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,MTSS1,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,PNPLA8,endoplasmic reticulum membrane [GO:0005789]; G...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20345,GPRC5B,cell surface [GO:0009986]; cytoplasmic vesicle...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
20346,DNAJB12,endoplasmic reticulum [GO:0005783]; endoplasmi...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
20347,GCNT3,extracellular exosome [GO:0070062]; Golgi memb...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
20348,DLEC1,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [20]:
Uniprot_Metabolic_2 = Uniprot_Metabolic_2.drop(['Gene_Names', 'GO'], axis=1)

Metabolic_2_Cond = Uniprot_Metabolic_2.any(axis=1, bool_only = True)

Metabolic_2_Cond

0        False
1        False
2        False
3        False
4         True
         ...  
20345    False
20346    False
20347     True
20348    False
20349    False
Length: 20350, dtype: bool

In [21]:
Metabolic_Cross_Ref = pd.DataFrame()

Metabolic_Cross_Ref['Cond_1'] = Metabolic_1_Cond
Metabolic_Cross_Ref['Cond_2'] = Metabolic_2_Cond

Metabolic_Cross_Ref['Final'] = Metabolic_Cross_Ref['Cond_1'] & Metabolic_Cross_Ref['Cond_2']

Metabolic_Cross_Ref

Unnamed: 0,Cond_1,Cond_2,Final
0,False,False,False
1,True,False,False
2,True,False,False
3,False,False,False
4,True,True,True
...,...,...,...
20345,True,False,False
20346,True,False,False
20347,True,True,True
20348,False,False,False


In [22]:
Uniprot['Metabolic'] = Metabolic_Cross_Ref['Final']

In [23]:
Uniprot

Unnamed: 0,Gene_Names,GO,Enzyme,Metabolic
0,Gene names (primary ),Gene ontology (GO),False,False
1,PLAA,cell [GO:0005623]; cell junction [GO:0030054];...,False,False
2,NACC1,cell junction [GO:0030054]; cytoplasm [GO:0005...,False,False
3,MTSS1,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,False,False
4,PNPLA8,endoplasmic reticulum membrane [GO:0005789]; G...,True,True
...,...,...,...,...
20345,GPRC5B,cell surface [GO:0009986]; cytoplasmic vesicle...,False,False
20346,DNAJB12,endoplasmic reticulum [GO:0005783]; endoplasmi...,False,False
20347,GCNT3,extracellular exosome [GO:0070062]; Golgi memb...,True,True
20348,DLEC1,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,False,False


In [24]:
Uniprot['Metabolic'].sum()

4316

In [25]:
# Kinases: MFO 'protein kinase activity', GO:0004672 
#         OR 
#         EC: 2.7.10/11/12/13/99.-
#         OR 
#         text search "protein kinase" in Uniprot protein name field

Uniprot_Kinase = Uniprot_start.copy()

infile = open('MFO_protein_kinase_activity_GO-0004672_descendants.csv', 'r')

for line in infile: 
    GO_test = str(line[:-1])
    a = Uniprot_Kinase['GO'].str.contains(GO_test)
    Uniprot_Kinase[GO_test] = a

###Uniprot_Kinase

In [26]:
# add in the kinase EC terms 

Kinase_EC = pd.read_csv('EC_Genes_Kinase.csv', names = ['EC Genes'])

###Kinase_EC

In [27]:
EC_genes = (Uniprot['Gene_Names']).isin(Kinase_EC['EC Genes'])

###EC_genes

In [28]:
Uniprot_Kinase['EC'] = EC_genes

###Uniprot_Kinase

In [29]:
# add in the kinase text search terms

Kinase_Name = pd.read_csv('text_search_protein_kinase.csv', names = ['Name Genes'])

###Kinase_Name

In [30]:
Name_genes = (Uniprot['Gene_Names']).isin(Kinase_Name['Name Genes'])

###Name_genes

In [31]:
Uniprot_Kinase['Name'] = Name_genes

###Uniprot_Kinase

In [32]:
# filter the matrix down so that it only contains a column of whether any row contains a true value
# note you have to drop Gene_Names and GO otherwise they get counted as true....

Uniprot_Kinase = Uniprot_Kinase.drop(['Gene_Names', 'GO'], axis=1)

Kinase_Cond = Uniprot_Kinase.any(axis=1, bool_only = True)

In [33]:
Uniprot['Kinase'] = Kinase_Cond

In [34]:
Uniprot

Unnamed: 0,Gene_Names,GO,Enzyme,Metabolic,Kinase
0,Gene names (primary ),Gene ontology (GO),False,False,False
1,PLAA,cell [GO:0005623]; cell junction [GO:0030054];...,False,False,False
2,NACC1,cell junction [GO:0030054]; cytoplasm [GO:0005...,False,False,False
3,MTSS1,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,False,False,False
4,PNPLA8,endoplasmic reticulum membrane [GO:0005789]; G...,True,True,False
...,...,...,...,...,...
20345,GPRC5B,cell surface [GO:0009986]; cytoplasmic vesicle...,False,False,False
20346,DNAJB12,endoplasmic reticulum [GO:0005783]; endoplasmi...,False,False,False
20347,GCNT3,extracellular exosome [GO:0070062]; Golgi memb...,True,True,False
20348,DLEC1,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,False,False,False


In [35]:
Uniprot['Kinase'].sum()

624

In [36]:
# TFs: MFO GO:0044212 transcription regulatory region DNA binding 
#     OR 
#     BPO GO:0006355 regulation of transcription, DNA-templated
#     OR
#     text search "transcription factor" in Uniprot protein name field

Uniprot_TF = Uniprot_start.copy()

infile = open('MFO_TF_regulatory_binding_GO-0044212_descendants.csv', 'r')
infile2  = open('BPO_TF_regulation_GO-0006355_descendants.tsv', 'r')

for line in infile: 
    GO_test = str(line[:-1])
    a = Uniprot_TF['GO'].str.contains(GO_test)
    Uniprot_TF[GO_test] = a

for line in infile2: 
    GO_test = str(line[:-1])
    a = Uniprot_TF['GO'].str.contains(GO_test)
    Uniprot_TF[GO_test] = a
    

###Uniprot_TF

In [37]:
# add in the TF text search terms

TF_Name = pd.read_csv('text_search_transcription_factor.csv', names = ['Name Genes'])

###TF_Name

In [38]:
Name_genes = (Uniprot['Gene_Names']).isin(TF_Name['Name Genes'])

###Name_genes

In [39]:
Uniprot_TF['Name'] = Name_genes

###Uniprot_TF

In [40]:
# filter the matrix down so that it only contains a column of whether any row contains a true value
# note you have to drop Gene_Names and GO otherwise they get counted as true....

Uniprot_TF = Uniprot_TF.drop(['Gene_Names', 'GO'], axis=1)

TF_Cond = Uniprot_TF.any(axis=1, bool_only = True)

In [41]:
Uniprot['Transcription Factor'] = TF_Cond

In [42]:
Uniprot

Unnamed: 0,Gene_Names,GO,Enzyme,Metabolic,Kinase,Transcription Factor
0,Gene names (primary ),Gene ontology (GO),False,False,False,False
1,PLAA,cell [GO:0005623]; cell junction [GO:0030054];...,False,False,False,False
2,NACC1,cell junction [GO:0030054]; cytoplasm [GO:0005...,False,False,False,True
3,MTSS1,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,False,False,False,False
4,PNPLA8,endoplasmic reticulum membrane [GO:0005789]; G...,True,True,False,False
...,...,...,...,...,...,...
20345,GPRC5B,cell surface [GO:0009986]; cytoplasmic vesicle...,False,False,False,False
20346,DNAJB12,endoplasmic reticulum [GO:0005783]; endoplasmi...,False,False,False,False
20347,GCNT3,extracellular exosome [GO:0070062]; Golgi memb...,True,True,False,False
20348,DLEC1,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,False,False,False,False


In [43]:
Uniprot['Transcription Factor'].sum()

2803

In [44]:
# Nucleic-Acid Binding Proteins: MFO GO:0003676 nucleic-acid binding
# create a matrix to test whether each gene has contains any of the nucleic-acid binding GO terms

Uniprot_Nucleic = Uniprot_start.copy()

infile = open('MFO_nucleicacid_binding_GO-0003676_descendants.tsv', 'r')

for line in infile: 
    GO_test = str(line[:-1])
    a = Uniprot_Nucleic['GO'].str.contains(GO_test)
    Uniprot_Nucleic[GO_test] = a

###Uniprot_Nucleic

In [45]:
# filter the matrix down so that it only contains a column of whether any row contains a true value
# note you have to drop Gene_Names and GO otherwise they get counted as true....

Uniprot_Nucleic = Uniprot_Nucleic.drop(['Gene_Names', 'GO'], axis=1)

Nucleic_Cond = Uniprot_Nucleic.any(axis=1, bool_only = True)

In [46]:
Uniprot['Nucleic-Acid Binding'] = Nucleic_Cond

In [47]:
Uniprot

Unnamed: 0,Gene_Names,GO,Enzyme,Metabolic,Kinase,Transcription Factor,Nucleic-Acid Binding
0,Gene names (primary ),Gene ontology (GO),False,False,False,False,False
1,PLAA,cell [GO:0005623]; cell junction [GO:0030054];...,False,False,False,False,False
2,NACC1,cell junction [GO:0030054]; cytoplasm [GO:0005...,False,False,False,True,False
3,MTSS1,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,False,False,False,False,False
4,PNPLA8,endoplasmic reticulum membrane [GO:0005789]; G...,True,True,False,False,False
...,...,...,...,...,...,...,...
20345,GPRC5B,cell surface [GO:0009986]; cytoplasmic vesicle...,False,False,False,False,False
20346,DNAJB12,endoplasmic reticulum [GO:0005783]; endoplasmi...,False,False,False,False,False
20347,GCNT3,extracellular exosome [GO:0070062]; Golgi memb...,True,True,False,False,False
20348,DLEC1,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,False,False,False,False,False


In [48]:
Uniprot['Nucleic-Acid Binding'].sum()

3849

In [49]:
# Signaling Proteins: BPO GO:0023052 signaling
# create a matrix to test whether each gene has contains any of the signalling GO terms

Uniprot_Signalling = Uniprot_start.copy()

infile = open('BPO_signaling_GO-0023052_descendants.tsv', 'r')

for line in infile: 
    GO_test = str(line[:-1])
    a = Uniprot_Signalling['GO'].str.contains(GO_test)
    Uniprot_Signalling[GO_test] = a

###Uniprot_Signalling

In [50]:
# filter the matrix down so that it only contains a column of whether any row contains a true value
# note you have to drop Gene_Names and GO otherwise they get counted as true....

Uniprot_Signalling = Uniprot_Signalling.drop(['Gene_Names', 'GO'], axis=1)

Signalling_Cond = Uniprot_Signalling.any(axis=1, bool_only = True)

In [51]:
Uniprot['Signalling'] = Signalling_Cond

In [52]:
Uniprot

Unnamed: 0,Gene_Names,GO,Enzyme,Metabolic,Kinase,Transcription Factor,Nucleic-Acid Binding,Signalling
0,Gene names (primary ),Gene ontology (GO),False,False,False,False,False,False
1,PLAA,cell [GO:0005623]; cell junction [GO:0030054];...,False,False,False,False,False,True
2,NACC1,cell junction [GO:0030054]; cytoplasm [GO:0005...,False,False,False,True,False,False
3,MTSS1,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,False,False,False,False,False,True
4,PNPLA8,endoplasmic reticulum membrane [GO:0005789]; G...,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...
20345,GPRC5B,cell surface [GO:0009986]; cytoplasmic vesicle...,False,False,False,False,False,True
20346,DNAJB12,endoplasmic reticulum [GO:0005783]; endoplasmi...,False,False,False,False,False,False
20347,GCNT3,extracellular exosome [GO:0070062]; Golgi memb...,True,True,False,False,False,False
20348,DLEC1,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,False,False,False,False,False,False


In [53]:
Uniprot['Signalling'].sum()

6222

In [54]:
print('\n', 'Enzyme Genes: ', Uniprot['Enzyme'].sum(),  "{0:.2f}".format(round(Uniprot['Enzyme'].sum()/Uniprot['Gene_Names'].count(),2)*100), '%', '\n',
      'Metabolic Enzyme Genes: ', Uniprot['Metabolic'].sum(), "{0:.2f}".format(round(Uniprot['Metabolic'].sum()/Uniprot['Gene_Names'].count(),2)*100), '%', '\n',
      'Kinase Genes: ', Uniprot['Kinase'].sum(), "{0:.2f}".format(round(Uniprot['Kinase'].sum()/Uniprot['Gene_Names'].count(),2)*100), '%', '\n',
      'Transcription Factor Genes: ', Uniprot['Transcription Factor'].sum(), "{0:.2f}".format(round(Uniprot['Transcription Factor'].sum()/Uniprot['Gene_Names'].count(),2)*100), '%', '\n',
      'Nucleic Acid Binding Genes: ', Uniprot['Nucleic-Acid Binding'].sum(), "{0:.2f}".format(round(Uniprot['Nucleic-Acid Binding'].sum()/Uniprot['Gene_Names'].count(),2)*100), '%', '\n',
     'Signalling Genes: ', Uniprot['Signalling'].sum(), "{0:.2f}".format(round(Uniprot['Enzyme'].sum()/Uniprot['Signalling'].count(),2)*100), '%', '\n',
     )


 Enzyme Genes:  5589 28.00 % 
 Metabolic Enzyme Genes:  4316 21.00 % 
 Kinase Genes:  624 3.00 % 
 Transcription Factor Genes:  2803 14.00 % 
 Nucleic Acid Binding Genes:  3849 19.00 % 
 Signalling Genes:  6222 27.00 % 



In [55]:
Uniprot.to_csv(r'Uniprot_Classification.csv')

In [56]:
# load genelist of genes for matrix to be created 

Genelist = pd.read_csv('Genelist.csv', names = ['Genes'])

Genelist

Unnamed: 0,Genes
0,AADAC
1,AARS
2,AASDHPPT
3,AASS
4,ABCB5
...,...
500,YRDC
501,YWHAG
502,ZIC1
503,ZIC4


In [57]:
factors = ["Enzyme", "Metabolic", "Kinase", "Transcription Factor", "Nucleic-Acid Binding", "Signalling"]

for item in factors: 
    count_dict = Uniprot.set_index('Gene_Names')[item].to_dict()
    Genelist[item] = Genelist['Genes'].map(count_dict)

Genelist

Unnamed: 0,Genes,Enzyme,Metabolic,Kinase,Transcription Factor,Nucleic-Acid Binding,Signalling
0,AADAC,True,True,False,False,False,False
1,AARS,True,True,False,False,True,False
2,AASDHPPT,True,True,False,False,False,False
3,AASS,True,True,False,False,False,False
4,ABCB5,True,False,False,False,False,False
...,...,...,...,...,...,...,...
500,YRDC,True,True,False,False,True,False
501,YWHAG,False,False,True,False,True,True
502,ZIC1,False,False,False,True,True,True
503,ZIC4,False,False,False,True,True,False


In [58]:
print('\n', 'Enzyme Genes: ', Genelist['Enzyme'].sum(), "{0:.2f}".format(round(Genelist['Enzyme'].sum()/Genelist['Genes'].count(),2)*100), '%', '\n',
      'Metabolic Enzyme Genes: ', Genelist['Metabolic'].sum(), round(Genelist['Metabolic'].sum()/Genelist['Genes'].count(),2)*100, '%', '\n',
      'Kinase Genes: ', Genelist['Kinase'].sum(), "{0:.2f}".format(round(Genelist['Kinase'].sum()/Genelist['Genes'].count(),2)*100), '%', '\n',
      'Transcription Factor Genes: ', Genelist['Transcription Factor'].sum(), round(Genelist['Transcription Factor'].sum()/Genelist['Genes'].count(),2)*100, '%', '\n',
      'Nucleic Acid Binding Genes: ', Genelist['Nucleic-Acid Binding'].sum(), round(Genelist['Nucleic-Acid Binding'].sum()/Genelist['Genes'].count(),2)*100, '%','\n',
     'Signalling Genes: ', Genelist['Signalling'].sum(), round(Genelist['Signalling'].sum()/Genelist['Genes'].count(),2)*100, '%', '\n',
     )


 Enzyme Genes:  279 55.00 % 
 Metabolic Enzyme Genes:  241 48.0 % 
 Kinase Genes:  52 10.00 % 
 Transcription Factor Genes:  101 20.0 % 
 Nucleic Acid Binding Genes:  131 26.0 % 
 Signalling Genes:  221 44.0 % 

