# Hybrid rule-based statistical language model

## add gazetteers to all the categories

This notebook loads the scispacy model and adds look-up lists to the model for cells/proteins, to increase accuracy. The lists come from WikiData.

- cell lines 
- enzymes 
- wiki garbage 
    - wiki lab
    - download csvs for proteins?

In [2]:
import spacy 

nlp = spacy.load('models/scipy_imba')

In [4]:
from spacy.pipeline import EntityRuler

ruler = EntityRuler(nlp)


## Cellosaurus

In [6]:

cell_patterns = []



with open('listar/cells/wikidata.csv') as cello:
    for l in cello.readlines():
        cell_patterns.append(
         {"label": "cell_line", "pattern": l.strip()}
        )
        
        
with open('listar/cells/synonyms.txt') as cello:
    for l in cello.readlines():
        cell_patterns.append(
         {"label": "cell_line", "pattern": l.strip()}
        )

In [7]:
cell_patterns

[{'label': 'cell_line', 'pattern': 'itemLabel'},
 {'label': 'cell_line', 'pattern': 'CHO'},
 {'label': 'cell_line', 'pattern': 'HEK293'},
 {'label': 'cell_line', 'pattern': 'Jurkat'},
 {'label': 'cell_line', 'pattern': 'MCF-7'},
 {'label': 'cell_line', 'pattern': 'Peer'},
 {'label': 'cell_line', 'pattern': 'Sf21'},
 {'label': 'cell_line', 'pattern': 'Vero'},
 {'label': 'cell_line', 'pattern': 'MRC-5'},
 {'label': 'cell_line', 'pattern': 'Raji'},
 {'label': 'cell_line', 'pattern': '3T3-L1'},
 {'label': 'cell_line', 'pattern': 'NIH 3T3'},
 {'label': 'cell_line', 'pattern': 'A-431'},
 {'label': 'cell_line', 'pattern': 'A-549'},
 {'label': 'cell_line', 'pattern': 'AB.9'},
 {'label': 'cell_line', 'pattern': 'BCP-1'},
 {'label': 'cell_line', 'pattern': 'BOSC-23'},
 {'label': 'cell_line', 'pattern': 'BHK-21'},
 {'label': 'cell_line', 'pattern': 'C2C12'},
 {'label': 'cell_line', 'pattern': 'COS cell lines'},
 {'label': 'cell_line', 'pattern': 'Caco-2'},
 {'label': 'cell_line', 'pattern': 'DU14

In [7]:
ruler.add_patterns(cell_patterns)
nlp.add_pipe(ruler)

In [92]:
doc = nlp('HEK293 is my favourite cell line. Although, HSC-2 is not far behind')

In [93]:
print([(ent.text, ent.label_) for ent in doc.ents])

[('HEK293', 'cell_line'), ('cell', 'protein'), ('HSC-2', 'cell_line'), ('not', 'protein'), ('far', 'protein')]


# Worked!    SAVE so don't need to add the patterns again

In [16]:
ruler.to_disk("models/ruler")    # saves patterns and config

# Wiki Labelled

In [10]:
import json 

with open('wiki_labelled.json') as dico:
    js = json.load(dico)

In [11]:
js

[['ABT', 'compound'],
 ['ACP', 'protein'],
 ['ACP', 'gene'],
 ['ADR', 'gene'],
 ['ALS', 'protein'],
 ['ALS', 'gene'],
 ['ALS', 'compound'],
 ['AML3', 'gene'],
 ['APC', 'protein'],
 ['APC', 'gene'],
 ['APC', 'compound'],
 ['APO', 'gene'],
 ['APS', 'gene'],
 ['APS', 'compound'],
 ['ATG', 'compound'],
 ['ATT', 'compound'],
 ['ATTR', 'protein'],
 ['ATTR', 'gene'],
 ['Ang', 'protein'],
 ['Ang', 'gene'],
 ['Axin1', 'protein'],
 ['Axin1', 'gene'],
 ['Axl', 'protein'],
 ['Axl', 'gene'],
 ['BAPN', 'compound'],
 ['BF', 'compound'],
 ['BMP', 'protein'],
 ['BMP', 'gene'],
 ['BMP', 'compound'],
 ['BPS', 'compound'],
 ['BTP2', 'gene'],
 ['BTP2', 'compound'],
 ['Blimp1', 'gene'],
 ['CAT2', 'protein'],
 ['CAT2', 'gene'],
 ['CD102', 'gene'],
 ['CD14', 'protein'],
 ['CD14', 'gene'],
 ['CD4', 'protein'],
 ['CD4', 'gene'],
 ['CD4', 'compound'],
 ['CD63', 'protein'],
 ['CD63', 'gene'],
 ['CDK1', 'protein'],
 ['CDK1', 'gene'],
 ['CDK2', 'protein'],
 ['CDK2', 'gene'],
 ['CFI1', 'gene'],
 ['CKD', 'gene'],
 ['

# Wiki Labelled, fetch lists

In [1]:

    
def is_wikidata(candidate, wikiQx, prop):
    """
    wikiQx: category ('object' in wikidata triple )
    prop: property, like P31 or P279
    """
    url = 'https://query.wikidata.org/sparql'

    query = """
    SELECT 
      ?item ?itemLabel
    WHERE {{
      ?item wdt:{} wd:{}.
      ?item ?label "{}"@en .
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """.format(prop,wikiQx,candidate)


    #Pick a random user agent
    user_agent = random.choice(user_agent_list)
    #Set the headers 
    headers = {'User-Agent': user_agent}
    #Make the request
  

    r = requests.get(url, params = {'format': 'json', 'query': query},headers=headers)
    try:
        data = r.json()
    except:
        print(r.text)

    if data['results']['bindings'] != []:
        return True
    else:
        return False
    
    
    
def is_wikidata_alt(candidate, wikiQx, prop):
    """
    wikiQx: category ('object' in wikidata triple )
    """
    url = 'https://query.wikidata.org/sparql'

    query = """
    SELECT 
      ?item ?itemLabel
    WHERE {{
      ?item wdt:{} wd:{}.
      ?item skos:altLabel "{}"@en .
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """.format(prop,wikiQx,candidate)

    #Pick a random user agent
    user_agent = random.choice(user_agent_list)
    #Set the headers 
    headers = {'User-Agent': user_agent}
    #Make the request
    
    r = requests.get(url, params = {'format': 'json', 'query': query}, headers=headers)
    try:
        data = r.json()
    except:
        print(r.text)

    if data['results']['bindings'] != []:
        return True
    else:
        return False

# Proteins

fetched through the query interface on wikidata

In [17]:
import pandas as pd 

protein_dir = '/Users/valdimareggertsson/Documents/Valdi/Vetrarönn 2019/NER/Listar/proteins/'


protein_df = pd.read_csv(protein_dir + 'proteins_wiki.csv')


In [22]:
protein_alts = pd.read_csv(protein_dir + 'proteins_wiki_alts.csv')


In [27]:
protein_names =  list(protein_df['itemLabel'])
protein_names.extend( list( protein_alts['altLabel']   )  )

In [29]:
protein_patterns = []

for p in protein_names:
    protein_patterns.append( {'label': 'protein', 'pattern': p}  )

# Add the protein patterns in batches to the ruler

In [34]:
batch_size = 10000
ruler.add_patterns(protein_patterns[0 : batch_size])
ruler.add_patterns(protein_patterns[batch_size : 2*batch_size])



In [35]:
ruler.add_patterns(protein_patterns[2*batch_size : 5*batch_size])

In [36]:
ruler.add_patterns(protein_patterns[5*batch_size : 10*batch_size])

In [37]:
ruler.add_patterns(protein_patterns[10*batch_size : 20*batch_size])

In [38]:
ruler.add_patterns(protein_patterns[20*batch_size : 40*batch_size])

In [39]:
ruler.add_patterns(protein_patterns[40*batch_size : 80*batch_size])

In [57]:
ruler.add_patterns(protein_patterns[80*batch_size : 120*batch_size])

In [58]:
ruler.add_patterns(protein_patterns[120*batch_size : 150*batch_size])

In [60]:
ruler.add_patterns(protein_patterns[150*batch_size : 180*batch_size])

In [63]:
ruler.add_patterns(protein_patterns[180*batch_size : 200*batch_size])

In [66]:
ruler.add_patterns(protein_patterns[200*batch_size : 220*batch_size])

In [69]:
ruler.add_patterns(protein_patterns[220*batch_size : ])

In [70]:
ruler.to_disk("models/ruler")    # saves patterns and config

In [67]:
from math import isnan

for i,p in enumerate(protein_patterns[200*batch_size : ]):

    # Remove nan: 
    if not isinstance(p['pattern'], str):
        protein_patterns.remove(p)
        
    if len(p['pattern']) > 20:
        print(p['pattern'])

NADPH:adrenodoxin oxidoreductase, mitochondrial
PI-PLC X domain-containing protein 1-like
uncharacterized protein LOC100003714
MAM domain-containing protein 2
low-density lipoprotein receptor-related protein 12
arylamine N-acetyltransferase-like
arylamine N-acetyltransferase, pineal gland isozyme NAT-3
7 transmembrane receptor
histamine H1 receptor
ras homolog gene family, member H
rho-related GTP-binding protein RhoH
ras-like protein Rhoh
serine/threonine-protein kinase LMTK2
serine/threonine-protein kinase LMTK2-like
uncharacterized protein LOC566185
zinc finger FYVE domain-containing protein 16
solute carrier family 13 member 4
solute carrier family 13 (sodium/sulfate symporter), member 4
coiled-coil domain-containing protein 15
opioid receptor, delta 1b
RAC-beta serine/threonine-protein kinase
beta-3-galactosyltransferase (ssp2)
four and a half LIM domains 5
gastrula zinc finger protein XlCGF57.1-like
gastrula zinc finger protein XlCGF8.2DB-like
somatostatin receptor type 5
inward 

PH domain containing protein
variant surface glycoprotein (VSG
palmitoyl acyltransferase 5
variant surface glycoprotein (VSG)
variant surface glycoprotein (VSG
replication factor c subunit 1
variant surface glycoprotein (VSG)
variant surface glycoprotein (VSG)
expression site-associated gene 1 (ESAG1) protein
Methyltransferase TYW3
expression site-associated gene 11 (ESAG11) protein
expression site-associated gene 2 (ESAG2) protein
leucine-rich repeat protein (LRRP)
39s ribosomal protein l4 mitochondrial
adenosine transporter
cleavage and polyadenylation specificity factor 30 kDa subunit
acyl-CoA binding protein
N-acetyltransferase complex ARD1 subunit
ATP-dependent DEAD/H RNA helicase
RNA binding protein EIF1AD
Domain of unknown function (DUF4464)
kinetoplast-associated protein-like protein
CMGC/RCK protein kinase
DEAD/DEAH box helicase-like protein
Uncharacterised protein family (UPF0160)
RING-variant domain containing protein
amino acid permease 24
CorA-like Mg2+ transporter protein

2OG-Fe(II) oxygenase superfamily
Sas10/Utp3/C1D family/Sas10 C-terminal domain containing protein
Staphylococcal nuclease homologue
26S proteasome non ATPase regulatory subunit
kinetoplast poly(A) polymerase 1
kinetoplastid-specific dual specificity phosphatase
calmodulin-like protein
RNA polymerase B subunit RPB8
RNA binding protein rbp16
C-1-tetrahydrofolate synthase
6-phosphofructo-2-kinase/fructose-2
Cleavage factor I 25 kDa subunit
DNA replication licensing factor MCM2
ras-like small GTPase
ribonucleoside-diphosphate reductase large chain
endonuclease/exonuclease/phosphatase-like protein
ribosomal RNA processing protein 4
ribosomal RNA adenine dimethylase family protein
mitochondrial carrier protein
mitochondrial RNA binding protein 1
AUT2/APG4/ATG4 cysteine peptidase
Mitochondrial ribosomal protein L46
Sad1 / UNC-like C-terminal
Rab-GTPase-TBC domain containing protein
pumilio/PUF RNA binding protein 5
endosomal integral membrane protein
mitogen-activated protein kinase 12
Alpha/

tubulin polymerization promoting protein family
E3 ubiquitin protein ligase TRIM9
ubiquitin-conjugating enzyme variant Kua homologue
vacuolar protein sorting-associated protein 35
tripartite motif protein trim
chaperone protein DnaJ
SNARE domain containing protein
pyruvate dehydrogenase complex E3 binding protein
Lupus La protein homolog
Tetratricopeptide repeat
receptor-type adenylate cyclase GRESAG 4
protein translation factor sui1
DnaJ domain containing protein
predicted zinc finger protein
cAMP-dependent protein kinase catalytic subunit 1
U5 snRNA-associated splicing factor
PRP8 protein homologue
tegument-associated antigen
Chromosome passenger complex (CPC) protein INCENP N terminal
unc-50 related protein homolog
tRNA import complex component
Mitochondrial import inner membrane translocase subunit TIM42
E3 ubiquitin protein ligase HECTD1
tegument-allergen-like protein
calmodulin-like protein
EF hand containing protein
predicted WD40 repeat protein
nucleoside diphosphate kinase
teg

Cytosolic carboxypeptidase-like protein 5
Zinc finger RING type
FHA domain containing protein
Domain of unknown function (DUF4586)
catalytic subunit of the vacuolar transporter chaperone 4
ATP-dependent protease ATPase subunit HslU2
Variant Surface Glycoprotein
variant surface glycoprotein (VSG)
Variant Surface Glycoprotein
Variant Surface Glycoprotein
Variant Surface Glycoprotein
Variant Surface Glycoprotein
5 hydroxytryptamine receptor 1B
variant surface glycoprotein (VSG
variant surface glycoprotein (VSG
variant surface glycoprotein (VSG
variant surface glycoprotein (VSG)
variant surface glycoprotein (VSG)
structural maintenance of chromosomes protein 3
variant surface glycoprotein (VSG
BAC from homologous region on chr8
variant surface glycoprotein (VSG
BAC from homologous region on chr8
variant surface glycoprotein (VSG)
BAC from homologous region on chr8
variant surface glycoprotein (VSG
BAC from homologous region on chr8
variant surface glycoprotein (VSG
BAC from homologous regi

phosphatidylinositol transfer protein
early growth response protein 1
Tetraspanin 18 (Tspan 18)
integrator complex subunit
hox class homeodomain protein djabd bb
dna replication complex gins protein psf2
membrane associated guanylate kinase
retinal specific ATP binding cassette
receptor type adenylate cyclase GRESAG 4
receptor type adenylate cyclase GRESAG 4
tetratricopeptide repeat protein 8
lysine specific demethylase NO66
WD repeat containing protein 48
orexin receptor type 2
complement C1q tumor necrosis factor
endonuclease:exonuclease:phosphatase family
tumor susceptibility gene 101 protein
cercarial elastase (S01 family)
phosphatidylcholine transfer protein
Serine:threonine protein kinase par 1
non capsid protein NS 1
enoyl coenzyme A hydratase
phosphatidylcholine transfer protein
1 phosphatidylinositol 3 phosphate 5 kinase
synaptotagmin protein 5 like
cytochrome c oxidase subunit I
cytochrome c oxidase subunit II
NADH dehydrogenase subunit 6
RAB6 interacting protein 2 (ERC prote

membrane transporter protein
phosphatidylinositol 3- and 4-kinase
Centrosomal protein of 164 kDa
5'-3' exoribonuclease B
NLI interacting factor-like phosphatase
Vitamin K epoxide reductase family
dual specificity protein phosphatase
ribosomal protein L11
RNA cytidine acetyltransferase
Organic solute transport protein 1
EMG1/NEP1 methyltransferase
eukaryotic translation initiation factor 3 subunit b
Meiotic cell cortex C-terminal pleckstrin homology
Macro domain containing protein
methyltransferase domain containing protein
transcription elongation regulator-like protein
protein phosphatase-like protein
L-galactonolactone oxidase
NUC130/3NT domain/SDA1
inositol-1(or 4)-monophosphatase 1
otubain cysteine peptidase
sucrose hydrolase-like protein
Vesicle-associated membrane protein 7
zinc-finger protein ZPR1
T. brucei spp.-specific protein
coatomer beta subunit
exosome-associated protein 1
pumillio RNA binding protein
paralyzed flagella protein 16
T. brucei spp.-specific protein
pteridine 

calcium/calmodulin-dependent protein kinase
phosphatidyltransferase
DnaJ domain containing protein
GRIP domain containing protein
Alpha/beta hydrolase family
Lys-63-specific deubiquitinase BRCC36
Protein of unknown function (DUF1620)
btb/poz domain containing protein
peroxisomal membrane protein 4
Primase zinc finger/Mcm10 replication factor
Protein of unknown function (DUF1193)
Fibronectin type III domain containing protein
Cytokinesis initiation factor 2 isoform 2
Cytokinesis initiation factor 2 isoform 1 (fragment)
mismatch repair protein PMS1
60S ribosomal protein L26
leucine-rich repeat protein (LRRP)
RNA re-capping enzyme
Peptidyl-prolyl cis-trans isomerase C
T. brucei spp.-specific protein
ubiquitin carboxyl-terminal hydrolase
dynein-associated protein
T. brucei spp.-specific protein
T. brucei spp.-specific protein
leucine-rich repeat protein (LRRP)
SLACS retrotransposable element (part)
SLACS reverse transcriptase
proteasome complex subunit Rpn13 ubiquitin receptor
DNA polymera

pumillio RNA binding protein
RNA- binding regulatory protein (pumilio family)
60S ribosomal protein L34
pyruvate dehydrogenase E1 component alpha subunit
heat shock protein 110
Zinc finger CCCH domain-containing protein 35
zinc finger protein family member
Zinc finger CCCH domain-containing protein 38 (fragment)
Zinc finger CCCH domain-containing protein 38
cholinephosphate cytidylyltransferase A
mitochondrial 2-oxoglutarate/malate carrier protein
mitochondrial carrier protein
ttaggg binding factor
pre-RNA processing PIH1/Nop17
SET domain containing protein
prolyl-tRNA synthetase
bifunctional aminoacyl-tRNA synthetase
Complex 1 protein (LYR family)
Flagellum attachment zone protein 18
predicted zinc finger protein
Ras-related protein Rab5A
Multisite-specific tRNA:(cytosine-C(5))-methyltransferase
predicted ankyrin repeat family protein
protein kinase A catalytic subunit 3
Vacuolar protein sorting-associated protein 16 homolog
Exocyst complex component Sec3
Thiopurine S-methyltransferas

hypothetical protein, conserved
hypothetical protein, conserved
reverse transcriptase (RNA-dependent DNA polymerase)
PRP38 family, putative
hypothetical protein, conserved
GCN5-like protein, putative
DNA topoisomerase type IB small subunit
RNA polymerase subunit RPB10, putative
MutL C terminal dimerisation domain containing protein
Serine/threonine-protein phosphatase 2A, putative
Transferrin receptor-like
GIY-YIG catalytic domain containing protein, putative
hypothetical protein, conserved
GPI-GlcNAc transferase complex
hypothetical protein, conserved
Pumilio-family RNA binding repeat
peptidyl-prolyl cis-trans isomerase
phosphatidylethanolamine-binding protein
hypothetical protein, conserved
proteasome regulatory ATPase subunit 1, putative
endoplasmatic reticulum retrieval protein, putative
Uncharacterised protein family UPF0066, putative
hypothetical protein, conserved
ubiquitin-conjugating enzyme E2, putative
proteasome regulatory ATPase subunit 5, putative
proteasome regulatory ATP

hypothetical protein, conserved
hypothetical protein, conserved
hypothetical protein, conserved
conserved protein (fragment)
hypothetical protein, conserved
retrotransposon hot spot protein 2 (RHS2)
hypothetical protein, conserved
hypothetical protein, conserved
ribulose-5-phosphate 3-epimerase, putative
trypanosome RHS family
hypothetical protein, conserved
hypothetical protein, conserved
retrotransposon hot spot protein 2 (RHS2)
3-ketoacyl-CoA reductase, putative
hypothetical protein (fragment)
hypothetical protein, conserved
cytosolic nonspecific dipeptidase, putative
autophagocytosis associated protein, putative
hypothetical protein (fragment)
mitochondrial RNA binding complex 1 subunit, putative
Tetratricopeptide repeat, putative
CAMK/CAMKL family protein kinase, putative
kinetoplastid-specific dual specificity phosphatase
DNA polymerase delta catalytic subunit, putative
variant surface glycoprotein (fragment)
hypothetical protein, conserved
variant surface glycoprotein
GlcNAc-PI 

2OG-Fe(II) oxygenase superfamily
MORN repeat-containing protein
ubiquitin-conjugating enzyme E2
double-strand-break repair protein rad21 homolog, putative
methyltransferase domain containing protein, putative
Mucin-associated surface protein (MASP), subgroup S097
Iron-sulfur assembly protein 2
surface protease GP63, putative
Mucin-associated surface protein (MASP), subgroup S132
mucin TcMUCII, putative
mitochondrial processing peptidase
Mucin-associated surface protein (MASP), subgroup S095
OTU-like cysteine protease
mucin TcMUCII, putative
threonyl-tRNA synthetase
Mucin-associated surface protein (MASP), subgroup S097
peroxisome targeting signal 1 receptor
60S ribosomal protein L8
60S ribosomal protein L2
dispersed gene family protein 1 (DGF-1), putative
Methyltransferase domain/Methyltransferase involved in Williams-Beuren syndrome
hypothetical protein, conserved
Cytochrome c oxidase biogenesis protein Cmc1 like, putative
hypothetical protein, conserved
hypothetical protein, conserve

Mucin-associated surface protein (MASP), subgroup S008
hypothetical protein, conserved
Mucin-associated surface protein (MASP), subgroup S008
Mucin-associated surface protein (MASP), subgroup S008
Ankyrin repeats (3 copies)
hypothetical protein, conserved
hypothetical protein, conserved
26S protease regulatory subunit
BSD domain containing protein, putative
hypothetical protein, conserved
60S ribosomal protein L12
Kinesin-13 1, putative
Endonuclease/Exonuclease/phosphatase family, putative
short-chain dehydrogenase
hypothetical protein, conserved
trans-sialidase, putative
short-chain dehydrogenase
tyrosine aminotransferase
vesicle-associated membrane protein
fatty acid desaturase, putative
NLI interacting factor-like phosphatase
cytochrome c oxidase subunit V, putative
hypothetical protein, conserved
rieske iron-sulfur protein
STE/STE11 serine/threonine-protein kinase, putative
chaperone DNAJ-like protein
hypothetical protein, conserved
methyltransferase domain containing protein
hypot

cysteine peptidase, Clan CA, family C19, putative
ribosomal protein S29
cysteine peptidase, Clan CA, family C19, putative
enoyl-CoA hydratase, mitochondrial precursor, putative
hypothetical protein, conserved (fragment)
nucleosome assembly protein (NAP), putative
hd phosphohydrolase family protein
ubiquinone biosynthesis protein COQ7 homolog, putative
glycosomal membrane protein
metallo-peptidase, Clan MH, Family M20
histone acetyltransferase
NLI interacting factor-like phosphatase, putative
Translation initiation factor eIF-2B subunit epsilon
hypothetical protein, conserved
Eukaryotic translation initiation factor 3 subunit E
hypothetical protein, conserved
hypothetical protein, conserved
Cactus-binding C-terminus of cactin protein
hypothetical protein, conserved
heat shock protein 20, putative
60S ribosomal protein L13, putative
minichromosome maintenance (MCM) complex subunit
Flagellum attachment zone protein 19, putative
Centrosomal protein of 164 kDa
protein kinase, putative
tRNA 

mucin TcMUCII, putative
Mucin-associated surface protein (MASP), subgroup S009
90 kDa surface protein, putative
Mucin-associated surface protein (MASP), subgroup S001
Mucin-associated surface protein (MASP), subgroup S080
Mucin-associated surface protein (MASP), subgroup S004
mucin TcMUCII, putative
mucin TcMUCII, putative
surface protease GP63, putative
90 kDa surface protein, putative
Mucin-associated surface protein (MASP), subgroup S002
Mucin-associated surface protein (MASP), subgroup S073
trans-sialidase, Group V, putative
Mucin-associated surface protein (MASP), subgroup S008
Mucin-associated surface protein (MASP), subgroup S047
UEV domain/Zinc finger, C3HC4 type (RING finger), putative
hypothetical protein, conserved
trans-sialidase, putative
Mucin-associated surface protein (MASP), subgroup S025
Mucin-associated surface protein (MASP), subgroup S065
Mucin-associated surface protein (MASP), subgroup S061
mucin TcMUCII, putative
Mucin-associated surface protein (MASP), subgroup

e3 SUMO protein ligase PIAS2
Peptidase aspartic active site
ATP dependent RNA helicase abstrakt
ATP dependent RNA helicase abstrakt
UDP N acetylglucosamine dolichyl phosphate
bruno-like rna binding protein
bruno-like rna binding protein
bruno-like rna binding protein
bruno-like rna binding protein
G patch domain containing 1
expressed conserved protein
glutamyl tRNA amidotransferase subunit B
monocarboxylate transporter
expressed conserved protein
C2 calcium dependent membrane targeting
tyrosine protein phosphatase non receptor type
divalent metal transporter DMT1B
Mediator of RNA polymerase II transcription
DNA directed RNA polymerase II subunit rpb11 a
arginase 2 mitochondrial
fn3 domain containing protein
protein of unknown function DUF974
heat shock protein 70
tho complex subunit 1
heat shock protein 70
RRM_6 domain containing protein
RRM_6 domain containing protein
SET and MYND domain containing protein 4
coatomer subunit zeta 1
n acyl phosphatidylethanolamine hydrolyzing
echinode

zinc finger C2H2 type
zinc finger C2H2 type
expressed conserved protein
Glutamate receptor ionotropic kainate 2
poly adp ribose polymerase 2
fucosidase alpha L 1 tissue
E3 ubiquitin protein ligase MARCH6
expressed conserved protein
collagen alpha 1VII chain
eukaryotic translation initiation factor 3
cAMP dependent protein kinase type I beta
polypeptide N acetylgalactosaminyltransferase
D glucuronyl C5 epimerase
pre mRNA processing factor 6
forkhead box protein D1
universal stress protein in QAH:OAS
gem associated protein 5
SUN domain containing protein 1
cop9 signalosome complex subunit 3
Vacuolar import degradation protein Vid24
gamma tubulin complex component 2
expressed conserved protein
phospholipid translocating ATPase
cytoplasmic antigen 1
dedicator of cytokinesis protein 11
senescence associated protein
Metal transporter Nramp1
Dedicator of cytokinesis protein 9
transcript antisense to ribosomal RNA protein
eukaryotic translation initiation factor 3
bromodomain adjacent to zinc 

protein regulator of cytokinesis 1
insulin growth factor 1 receptor beta
Golgi associated plant pathogenesis
cancer susceptibility candidate protein 1
Golgi associated plant pathogenesis
Nucleic acid binding OB fold tRNA helicase type
long chain fatty acid coenzyme A ligase 4
SWI:SNF chromatin binding protein
g protein coupled receptor
e3 SUMO protein ligase PIAS2
expressed conserved protein
SPONdin extracellular matrix glycoprotein
Peptidase aspartic active site
ATP dependent RNA helicase abstrakt
uro adherence factor A
ATP dependent RNA helicase abstrakt
Trafficking protein particle complex subunit 3
UDP N acetylglucosamine dolichyl phosphate
tetratricopeptide repeat protein 35 B
coproporphyrinogen III oxidase
serine:threonine protein kinase NIM1
expressed conserved protein
expressed conserved protein
krr1 small subunit processome component
bruno-like rna binding protein
CUGBP Elav family like
squamous cell carcinoma antigen
CUG BP and ETR3 factor
expressed conserved protein
bruno-li

zinc finger BED domain containing protein
protein farnesyltransferase subunit beta
ATP-dependent protease PIM1
diagnostic antigen gp50
ATP dependent RNA helicase DDX52
protein l isoaspartate o methyltransferase
soss complex subunit b1
conserved hypothetical protein
homocysteine responsive endoplasmic
serine:threonine protein phosphatase 6
DNA replication licensing factor MCM7
heat shock protein 70
expressed conserved protein
protein preY mitochondrial
26S proteasome non ATPase regulatory subunit 7
neuropeptide Y receptor
Zinc finger MYND domain containing protein 12
ribosomal protein s14
conserved hypothetical protein
somatostatin receptor
casein kinase I delta
F box:WD repeat containing protein 9
expressed conserved protein
hypothetical transcript
glycerol 3 phosphate dehydrogenase NAD
ras gtpase activating protein
glutathione S transferase
exocyst complex component 7
tRNA dihydrouridine synthase 4
Cytochrome c oxidase subunit VIc
cation chloride cotransporter
nuclear receptor 2C2 ass

DNA methylase N 6 adenine specific
Minor histocompatibility antigen H13
heterolocus tagous nuclear ribonucleoprotein
hypothetical transcript
Clathrin coatomer adaptor adaptin N terminal
DNA replication licensing factor MCM4
rna binding protein musashi
Sterile alpha motif SAM
hypothetical transcript
conserved hypothetical protein
hypothetical transcript
Gamma aminobutyric acid receptor associated
hypothetical transcript
recQ mediated genome instability protein 1
expressed conserved protein
hypothetical transcript
mds1 and evi1 complex locus protein evi1
mitogen activated protein kinase kinase kinase
pre mRNA splicing factor ISY1
maternal embryonic leucine zipper kinase
hypothetical transcript
protein of unknown function DUF2414
expressed conserved protein
conserved hypothetical protein
Zinc finger, C3HC4 type (RING finger)
membrane bound acyltransferase:hhat
TNF receptor associated factor 4
serine:threonine protein kinase PLK1
expressed conserved protein
transitional endoplasmic reticul

Formin proteiny 2 domain containing protein
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
NADP dependent malic enzyme
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
hypothetical transcript
protocadherin beta 14
E3 ubiquitin protein ligase synoviolin B
hypothetical transcript
hypothetical transcript
acetyl coenzyme A synthetase cytoplasmic
interleukin enhancer binding factor 2
hypothetical transcript
hypothetical transcript
Muscarinic acetylcholine receptor M2
hypothetical transcript
Vacuolar protein sorting associated protein 13A
synaptic ras gtpase activating protein syngap
hypothe

Mitochondrial import receptor subunit TOM20
Chloride channel CLIC 1
protein of unknown function DUF2419
nucleic acid binding protein
hypothetical transcript
TWiK family of potassium channels protein 7
hypothetical transcript
zinc finger protein 236
hypothetical transcript
protein cysteine N palmitoyltransferase
fatty acid binding protein adipocyte
hypothetical transcript
hypothetical transcript
brain type fatty acid binding protein
Smad nuclear interacting protein 1
coatomer protein complex subunit zeta 1
Williams Beuren syndrome chromosomal region 16
tyrosine protein phosphatase non receptor type
ankyrin repeat domain containing protein 50
solute carrier family 22
phospholipid scramblase 2
ATP dependent RNA helicase DHX8
hypothetical transcript
small conductance calcium activated potassium
hypothetical transcript
hypothetical transcript
diagnostic antigen gp50
dipeptidyl peptidase 3
mitochondrial ribosomal protein L4
ATP dependent RNA helicase DHX37
mitochondrial import inner membrane

KRAB A domain containing protein
KRAB A domain containing protein
transcription factor 25
transcription factor 25
DNA excision repair protein ERCC 6
DNA excision repair protein ERCC 6
hypothetical transcript
nuclear pore complex protein Nup107
hypothetical transcript
KRAB A domain containing protein
KRAB A domain containing protein
hypothetical transcript
hypothetical transcript
DNA repair protein RAD51
Protein phosphatase 1 regulatory subunit 3B
conserved hypothetical protein
hypothetical transcript
sterile alpha motif domain containing protein
carboxylesterase type B
ELKS:Rab6 interacting:CAST family
14-3-3 protein beta:alpha
hypothetical transcript
cytoplasmic tRNA 2 thiolation protein 1
conserved hypothetical protein
hypothetical transcript
hypothetical transcript
leucine rich repeat neuronal protein
hypothetical transcript
protein tyrosine sulfotransferase A
protein tyrosine sulfotransferase
active breakpoint cluster region
hypothetical transcript
cell cycle regulator mat89bb
euka

conserved hypothetical protein
tyrosyl tRNA synthetase
STE20 kinase adapter protein beta
phenylalanyl tRNA synthetase alpha chain B
26S proteasome regulatory subunit N1
lipoma HMGIC fusion partner
lipoma HMGIC fusion partner
diagnostic antigen gp50
conserved hypothetical protein
UHRF1 binding protein 1
rac guanyl nucleotide exchange factor
endoplasmic reticulum Golgi intermediate
Basic leucine zipper (bZIP) transcription factor
C2 domain containing protein 3
dna polymerase subunit delta
mitochondrial ribosomal protein L4
diagnostic antigen gp50
glutamyl tRNA synthetase cytoplasmic
retinal specific ATP binding cassette
metacestode specific membrane protein 1
ATP dependent RNA helicase DDX55
conserved hypothetical protein
conserved hypothetical protein
RhoGEF domain containing protein
85 kDa calcium independent phospholipase A2
NADH dehydrogenase (ubiquinone) 1 alpha
U3 small nucleolar RNA associated protein 14
dyslexia susceptibility 1 candidate gene 1
conserved hypothetical protein
con

zinc finger RNA binding protein 2
oxysterol binding protein 6
neurogenic differentiation factor 1
KRAB A domain containing protein
male specific lethal 3 1
sodium:chloride dependent transporter
ryanodine receptor 44f
protocadherin gamma b17
splicing factor 3A subunit 1
u4:u6.u5 tri snrnp associated protein 2
achaete scute transcription factor
conserved hypothetical protein
Phosphatidylinositol 4 phosphate 5 kinase type 1
purine nucleoside phosphorylase
purine nucleoside phosphorylase
purine nucleoside phosphorylase
TPR repeat containing protein YDR161W
mitochondrial import receptor subunit tom20 homolog
protein of unknown function DUF2419
iron-sulfur cluster assembly accessory protein
RNA-directed DNA polymerase (reverse transcriptase)
KRAB A domain containing protein
sodium bile acid cotransporter
inositol polyphosphate multikinase
sodium bile acid cotransporter
Monocarboxylate transporter 14
coiled coil domain containing protein 130
nadh dehydrogenase fe-s protein 6
osteopetrosis ass

eukaryotic peptide chain release factor subunit 1
Silent information regulator 2 related protein 1
Uncharacterised conserved protein (DUF2305)
Eukaryotic translation initiation factor 4E type 6
ras-like small GTPases
6-phosphofructo-2-kinase/fructose-2
C-1-tetrahydrofolate synthase
Regulator of Vps4 activity in the MVB pathway
RNA editing 3' terminal uridylyl transferase 2
2OG-Fe(II) oxygenase superfamily
pseudouridylate synthase-like protein
nucleotide binding protein-like protein
DNA polymerase sigma-like protein
galactofuranosyltransferase lpg1-like protein
Protein of unknown function (DUF525)
spliced leader RNA PSE-promoter transcription factor
10 kDa heat shock protein
10 kDa heat shock protein
protein disulfide isomerase
Protein of unknown function (DUF2012)
Pentapeptide repeats (9 copies)
Regulator of chromosome condensation (RCC1) repeat
Regulator of chromosome condensation (RCC1) repeat
RNA recognition motif. (a.k.a. RRM
RNA recognition motif (a.k.a. RRM
vacuolar type h+ ATPas

Domain of unknown function (DUF1935)
3-oxo-5-alpha-steroid 4-dehydrogenase
ATP-grasp domain containing protein
cytochrome c oxidase assembly protein
Histone RNA hairpin-binding protein RNA-binding domain containing protein
General negative regulator of transcription subunit 5
Vacuolar protein sorting 55
phosphopantothenate--cysteine ligase
dullard-like phosphatase domain containing protein
helicase-like protein
Stage II sporulation protein E (SpoIIE)
Rieske [2Fe-2S] domain containing protein
elongation factor 2-like protein
Capping enzyme RNA triphosphatase 1
KIF-1 binding protein C terminal
eukaryotic translation initiation factor 3 subunit j
succinyl-CoA synthetase alpha subunit
succinyl-CoA synthetase alpha subunit
BRO1-like domain containing protein
vacuolar protein sorting-like protein
Domain of unknown function (DUF4201)
SNF2 family helicase-like protein
32 kDa ER-associated protein
calpain-like cysteine peptidase
Nin one binding (NOB1) Zn-ribbon like
calpain-like cysteine peptid

hypothetical protein, conserved
3'-nucleotidase/nuclease
Eukaryotic protein of unknown function (DUF866), putative
Iron-containing alcohol dehydrogenase, putative
hypothetical protein, conserved
hypothetical protein, unknown function
WW domain containing protein, putative
vacuolar ATP synthase subunit
Present in the outer mitochondrial membrane proteome 18
hypothetical protein, conserved
Fusaric acid resistance protein-like, putative
Fusaric acid resistance protein-like, putative
alanine aminotransferase
ser/thr protein phosphatase
hypothetical protein, conserved
Protein of unknown function (DUF962), putative
Protein of unknown function (DUF962)
surface antigen protein 2, putative
surface antigen protein 2
surface antigen protein 2
surface antigen protein 2
surface antigen protein 2
hypothetical protein, conserved
surface antigen protein 2, putative
surface antigen protein
promastigote surface antigen 38S
hypothetical protein, conserved
DnaJ domain containing protein
WD repeat and HMG-

conserved hypothetical protein
NADPH--cytochrome P450 reductase
conserved hypothetical protein
conserved hypothetical protein
glycosyltransferase family 62
conserved hypothetical protein
response regulator receiver
conserved hypothetical protein
coproporphyrinogen III oxidase
glycoside hydrolase family 13
glycoside hydrolase family 13
conserved hypothetical protein
fungal specific transcription factor domain-containing protein
Pyruvate decarboxylase
Adenylosuccinate lyase
conserved hypothetical protein
conserved hypothetical protein
conserved hypothetical protein
conserved hypothetical protein
conserved hypothetical protein
ER membrane chaperone-like protein
pre-mRNA splicing factor ATP-dependent RNA helicase PRP43
conserved hypothetical protein
conserved hypothetical protein
conserved hypothetical protein
Arp2/3 complex subunit
conserved hypothetical protein
conserved hypothetical protein
serine/threonine-protein kinase nrc-2
conserved hypothetical protein
conserved hypothetical prote

conserved hypothetical protein
GDP-mannose pyrophosphorylase A, GDP-mannose pyrophosphorylase A, variant
BYS1 domain-containing protein
orotidine 5'-phosphate decarboxylase
Putative major facilitator superfamily (MFS) transporter
NTF2-like superfamily protein, putative
fungal Zn(2)-Cys(6) binuclear cluster domain-containing protein, putative
CAIB/BAIF family enzyme
RING finger domain-containing protein
conserved hypothetical protein
essential cytoplasmic protein Ctr86
DHHC zinc finger domain-containing protein
Telomeric repeat-binding factor 2-interacting protein 1, putative
heterochromatin protein HP1
T-complex protein 1 subunit epsilon
conserved hypothetical protein
ankyrin repeat domain-containing protein
DUF1752 domain-containing protein
cyclohexanone monooxygenase
C3HC zinc finger domain-containing protein
mRNA cap methyltransferase
chromosomal organization and DNA repair protein Mms21
C6 zinc finger domain-containing protein
copper fist DNA-binding domain-containing protein
conse

UBA/TS-N domain-containing protein
Ortholog of A. nidulans FGSC A4 : AN1305, A. niger CBS 513.88 : An11g07290, An07g03270, Aspergillus wentii : Aspwe1_0033496 and Aspergillus niger ATCC 1015 : 178166-mRNA, 40057-mRNA
Ortholog of A. nidulans FGSC A4 : AN9474, A. fumigatus Af293 : Afu4g11260, A. niger CBS 513.88 : An15g04210, An04g05380 and A. oryzae RIB40 : AO090003001095, AO090120000002
protein of unknown function
DUF1688 domain-containing protein
Has domain(s) with predicted ATP binding, protein kinase activity and role in protein phosphorylation
small GTP-binding protein
Ortholog of A. nidulans FGSC A4 : AN12089, A. niger CBS 513.88 : An11g02070, A. oryzae RIB40 : AO090124000044 and Aspergillus versicolor : Aspve1_0045579, Aspve1_0073154, Aspve1_0081145
protein of unknown function
Has domain(s) with predicted role in transmembrane transport and integral component of membrane localization
RNA-dependent RNA polymerase, putative
Ortholog(s) have ferrous iron binding, histone binding, ox

Has domain(s) with predicted zinc ion binding activity
Has domain(s) with predicted DNA binding, zinc ion binding activity, role in transcription, DNA-templated and nucleus localization
Ortholog(s) have glycerol-3-phosphate transmembrane transporter activity, glycerophosphodiester transmembrane transporter activity and role in glycerol-3-phosphate transport, glycerophosphodiester transport, transmembrane transport
Clavaminate synthase-like
Ortholog of A. niger CBS 513.88 : An02g08170, Aspergillus wentii : Aspwe1_0111532, Aspergillus versicolor : Aspve1_0153551 and Aspergillus niger ATCC 1015 : 174147-mRNA
Ortholog(s) have protein serine/threonine phosphatase activity, protein tyrosine/serine/threonine phosphatase activity
DUF3638 domain-containing protein
Ortholog of A. nidulans FGSC A4 : AN2844, A. fumigatus Af293 : Afu3g12255, A. niger CBS 513.88 : An02g08130, Neosartorya fischeri NRRL 181 : NFIA_065040 and Aspergillus versicolor : Aspve1_0080524
Thioesterase/thiol ester dehydrase-is

Glycogen debranching enzyme
Ortholog of A. nidulans FGSC A4 : AN6296, A. fumigatus Af293 : Afu2g12300, A. niger CBS 513.88 : An02g04340, Aspergillus wentii : Aspwe1_0118064 and Aspergillus sydowii : Aspsy1_0152200
Has domain(s) with predicted protein dimerization activity
Ribosome biogenesis protein SLX9, putative
Ortholog(s) have RNA polymerase II core promoter proximal region sequence-specific DNA binding, protein heterodimerization activity and transcriptional activator activity, more
glycerol-3-phosphate dehydrogenase [NAD( )]
protein of unknown function
molybdenum cofactor biosynthetic protein
Ortholog of Aspergillus flavus NRRL 3357 : AFL2T_06852
riboflavin aldehyde-forming enzyme
Has domain(s) with predicted role in transmembrane transport and integral component of membrane localization
Ortholog(s) have sequence-specific DNA binding activity
Protein kinase-like domain-containing protein
Has domain(s) with predicted 3-hydroxyacyl-CoA dehydrogenase activity, coenzyme binding, oxid

pre-mRNA-splicing factor cwc24
Has domain(s) with predicted NAD(P)  transhydrogenase (AB-specific) activity, NADP binding activity, role in oxidation-reduction process and integral component of membrane localization
2-isopropylmalate synthase
c24-sterol methyltransferase
Ortholog of Aspergillus versicolor : Aspve1_0042194, Aspergillus zonatus : Aspzo1_0136371 and Aspergillus sydowii : Aspsy1_0030438
Acyl-CoA ligase-like protein, putative
protein of unknown function
Ser/Thr protein phosphatase
conserved hypothetical protein
Ortholog of A. niger CBS 513.88 : An03g03670, Aspergillus clavatus NRRL 1 : ACLA_097300, Aspergillus niger ATCC 1015 : 54686-mRNA and Aspergillus carbonarius ITEM 5010 : Acar5010_145720, Acar5010_205216
40S ribosomal protein S5
conserved hypothetical protein
Ortholog of A. fumigatus Af293 : Afu6g03450, A. oryzae RIB40 : AO090001000013/wykH, Neosartorya fischeri NRRL 181 : NFIA_007610 and Aspergillus versicolor : Aspve1_0401363
SDA1 domain-containing protein
conserved

conserved hypothetical protein
Has domain(s) with predicted transferase activity, transferring acyl groups other than amino-acyl groups activity
Has domain(s) with predicted transferase activity, transferring hexosyl groups activity and role in metabolic process
peptidyl-prolyl cis-trans isomerase
Has domain(s) with predicted ATP binding, ATPase activity, ATPase activity, coupled to transmembrane movement of substances, nucleoside-triphosphatase activity, nucleotide binding activity and role in transmembrane transport
protein of unknown function
conserved hypothetical protein
Has domain(s) with predicted protein kinase binding activity and role in regulation of cyclin-dependent protein serine/threonine kinase activity
lariat debranching enzyme
Has domain(s) with predicted oxidoreductase activity, transferase activity, transferring acyl groups other than amino-acyl groups, zinc ion binding activity and role in oxidation-reduction process
conserved hypothetical protein
Has domain(s) with

conserved hypothetical protein
protein of unknown function
protein of unknown function
conserved hypothetical protein
Ortholog(s) have monooxygenase activity and role in sterigmatocystin biosynthetic process
conserved hypothetical protein
Ortholog of Aspergillus flavus NRRL 3357 : AFL2T_12379 and Aspergillus zonatus : Aspzo1_0164322
conserved hypothetical protein
Has domain(s) with predicted DNA binding, zinc ion binding activity, role in transcription, DNA-templated and nucleus localization
40S ribosomal protein S9
Ortholog(s) have cytosol, nucleus localization
60S ribosomal protein L21
Ortholog(s) have holo-[acyl-carrier-protein] synthase activity and mitochondrion localization
mitochondrial carrier protein
Ortholog of A. nidulans FGSC A4 : AN7042, A. fumigatus Af293 : Afu4g04060, A. niger CBS 513.88 : An14g00810, Aspergillus wentii : Aspwe1_0043276 and Aspergillus sydowii : Aspsy1_0048943
conserved hypothetical protein
Ortholog(s) have role in cell adhesion involved in biofilm forma

conserved hypothetical protein
phosphatidylinositol 4-kinase
serine/threonine-protein kinase chk2
conserved hypothetical protein
serine/threonine-protein kinase crk1
conserved hypothetical protein
G-patch RNA maturation protein
conserved hypothetical protein
conserved hypothetical protein
glyoxalase/bleomycin resistance protein/dioxygenase
EF hand domain-containing protein
conserved hypothetical protein
conserved hypothetical protein
conserved hypothetical protein
conserved hypothetical protein
RNA polymerase II transcription factor B subunit 5
conserved hypothetical protein
1,3-beta-glucanosyltransferase gel1 precursor
conserved hypothetical protein
conserved hypothetical protein
imidazole glycerol phosphate synthase hisHF
PQ loop repeat protein
conserved hypothetical protein
conserved hypothetical protein
guanine nucleotide-binding protein alpha-2 subunit
conserved hypothetical protein
conserved hypothetical protein
glutamate-cysteine ligase
GNAT family N-acetyltransferase
conserved 

coagulation factor 5/8 type domain-containing protein
nucleotide-sugar transporter
50S ribosomal protein L14
RNP domain-containing protein
histone-lysine N-methyltransferase SET9
6-phosphogluconate dehydrogenase
serine/threonine protein phosphatase 2A
40S ribosomal protein S11
palmitoyltransferase pfa3
glutamate carboxypeptidase-1
hypothetical phospholipase C
NADH-cytochrome b5 reductase-2
hypothetical aspartyl protease
tripeptidyl-peptidase 1
kinetochore protein-17
Ornithine decarboxylase
hypothetical aspartyl protease
DNA repair protein RAD16
hypothetical aspartyl protease
aromatic ring-opening dioxygenase LigB subunit
acyl-protein thioesterase 1
MOSC domain-containing protein
multi-drug resistance-5
metalloreductase transmembrane component
FAD-dependent oxidoreductase-2
Eukaryotic translation initiation factor 3 subunit G
Short chain dehydrogenase
Sec1 family superfamily protein
CBF/NF-Y family transcription factor
Phosphatidylethanolamine N-methyltransferase
palmitoyltransferase er

metacaspase-1 precursor
2-dehydropantoate 2-reductase
alpha/beta hydrolase fold protein
glycosyl hydrolase family 31-4
ZZ type zinc finger domain-containing protein
multi-drug resistance-11
aspartic endopeptidase
mitochondrial import receptor subunit tom-70
U3 small nucleolar ribonucleoprotein IMP4
3-hydroxy-3-methylglutaryl-coenzyme A reductase
regulator-nonsense transcripts 1
FAD binding domain-containing protein
cation-transporting ATPase 4
SUMO-conjugating enzyme ubc9
asparagine synthetase
nucleotide-binding protein 2
NADH-cytochrome b5 reductase-3
multidrug transporter
ATP-dependent RNA helicase dbp-7
general stress protein 39
niemann-Pick C1 protein
ribosomal L1 domain-containing protein 1
transcription factor-15
palmitoyl-protein thioesterase
dihydroorotate reductase PyrE
palmitoyltransferase SWF1
DUF895 domain membrane protein
transporter SEC61 subunit alpha
kynurenine 3-monooxygenase
CAMK/CAMKL/AMPK protein kinase
oxysterol binding protein
transcription elongation factor spt6


Lanthionine synthetase C family protein
herpesviridae ul52/ul70 dna primase
Superkiller viralicidic activity 2 family 2
radical SAM domain-containing protein
protein phosphatase 2b regulatory subunit
iron-sulfur cluster protein ISCA
Ufm1-conjugating enzyme 1
MORN repeat-containing protein
UBA/TS-N domain-containing protein
Myb family DNA-binding domain-containing protein
regulator of chromosome condensation rcc1
3' exoribonuclease family
CHCH domain-containing protein
ribosomal RNA small subunit methyltransferase B
mitochondrial inner membrane translocase subunit TIM17
ENTH domain-containing protein
SAG-related sequence SRS46
rRNA pseudouridine synthase
bromodomain-containing protein
alpha-glucan water dikinase 1
translation initiation factor IF-2
phosphoadenosine phosphosulfate reductase family protein
eIF2 kinase IF2K-A (incomplete catalytic triad)
ribosomal protein RPS23
autophagy-related protein 7 atg7
OTU family cysteine protease
EF hand domain-containing protein
p25-alpha family 

Sterol uptake control 2
translation initiation factor IF-2
MFS multidrug transporter
translation initiation factor eIF-2B subunit epsilon
Phospholipid:diacylglycerol acyltransferase
uroporphyrinogen-III synthase
DASH complex subunit dad4
glucose-repressible alcohol dehydrogenase transcriptional effector
bZIP family transcription factor
Oligopeptide transporter
26S proteasome regulatory subunit N6
MFS SP general alpha glucoside:H+ symporter
asparaginyl-tRNA synthetase
ubiquitin-conjugating enzyme
ribosome biogenesis ATPase RIX7
cell division control 25
Sterol uptake control 2
guanine nucleotide exchange factor LTE1
vacuolar ATP synthase subunit d
D-xylose-proton symporter
endoribonuclease L-PSP
6-hydroxy-D-nicotine oxidase
Choline dehydrogenase
glutathione s-transferase
positive regulator of purine utilization
general alpha-glucoside permease
MFS SP general alpha glucoside:H+ symporter
Aromatic peroxygenase
Sterol uptake control 2
endo-1,4-beta-xylanase B
SPEG neighbor protein
uncharact

In [68]:
len(protein_patterns) / batch_size

242.4103

In [91]:
doc = nlp('Jurkat 1.56*10^6 cells per ml 75.5% viability.')

In [90]:
print([(ent.text, ent.label_) for ent in doc.ents])

[('Jurkats', 'cell_line'), ('6', 'cell_line'), ('per', 'protein'), ('viability', 'protein')]


# Enzyme regexp

In [81]:
enzyme_pattern = {
    'label': 'enzyme', 
    'pattern': [{"TEXT": {"REGEX": r'ase\b'}}]
}

In [1]:
nlp2 = spacy.load('models/scipy_imba')

NameError: name 'spacy' is not defined

In [83]:
ruler = EntityRuler(nlp2)

ruler.add_patterns([enzyme_pattern])

In [84]:
doc = nlp2('my favorite enzyme is protease along with hydrogenase')

In [85]:
print([(ent.text, ent.label_) for ent in doc.ents])

[]


In [38]:
import re
pattern = re.compile("ase$")
if pattern.match('protease'):
    print('ye')
else:
    print('ne')

ne


In [72]:
re.search(r'\bfull\b', 'I am full now ')

<re.Match object; span=(0, 4), match='full'>

In [86]:
re.findall(r'ase\b', 'my favorite enzyme is protease along with hydrogenase')

['ase', 'ase']

### Test

In [None]:
with open('/Users/valdimareggertsson/Documents/Valdi/Vetrarönn 2019/NER') as f.



In [None]:
    bbb

# Throw away fake proteins

In [None]:
ruler = EntityRuler(nlp)
ruler.from_disk("models/ruler")    # saves patterns and config