In [1]:
import io

import numpy as np
import pandas as pd

import bioservices
from bioservices import UniProt

In [7]:
# Environment check: displays the path of the pandas package this kernel imported.
pd.__file__

'/home/simon/anaconda3/envs/Slave37/lib/python3.6/site-packages/pandas/__init__.py'

In [2]:
_valid_columns = [
    # Names & Taxonomy
    "id",
    "entry name",
    "genes",
    "genes(PREFERRED)",
    "genes(ALTERNATIVE)",
    "genes(OLN)",
    "genes(ORF)",
    "organism",
    "organism-id",
    "protein names",
    "proteome",
    "lineage(ALL)",
    "lineage-id",
    "virus hosts",
    # Sequences
    "fragement",
    "sequence",
    "length",
    "mass",
    "encodedon",
    "comment(ALTERNATIVE PRODUCTS)",
    "comment(ERRONEOUS GENE MODEL PREDICTION)",
    "comment(ERRONEOUS INITIATION)",
    "comment(ERRONEOUS TERMINATION)",
    "comment(ERRONEOUS TRANSLATION)",
    "comment(FRAMESHIFT)",
    "comment(MASS SPECTROMETRY)",
    "comment(POLYMORPHISM)",
    "comment(RNA EDITING)",
    "comment(SEQUENCE CAUTION)",
    "feature(ALTERNATIVE SEQUENCE)",
    "feature(NATURAL VARIANT)",
    "feature(NON ADJACENT RESIDUES)",
    "feature(NON STANDARD RESIDUE)",
    "feature(NON TERMINAL RESIDUE)",
    "feature(SEQUENCE CONFLICT)",
    "feature(SEQUENCE UNCERTAINTY)",
    "version(sequence)",
    # Family and Domains
    "domains",
    "domain",
    "comment(DOMAIN)",
    "comment(SIMILARITY)",
    "feature(COILED COIL)",
    "feature(COMPOSITIONAL BIAS)",
    "feature(DOMAIN EXTENT)",
    "feature(MOTIF)",
    "feature(REGION)",
    "feature(REPEAT)",
    "feature(ZINC FINGER)",
    # Function
    "ec",
    "comment(ABSORPTION)",
    "comment(CATALYTIC ACTIVITY)",
    "comment(COFACTOR)",
    "comment(ENZYME REGULATION)",
    "comment(FUNCTION)",
    "comment(KINETICS)",
    "comment(PATHWAY)",
    "comment(REDOX POTENTIAL)",
    "comment(TEMPERATURE DEPENDENCE)",
    "comment(PH DEPENDENCE)",
    "feature(ACTIVE SITE)",
    "feature(BINDING SITE)",
    "feature(DNA BINDING)",
    "feature(METAL BINDING)",
    "feature(NP BIND)",
    "feature(SITE)",
    # Gene Ontologys
    "go",
    "go(biological process)",
    "go(molecular function)",
    "go(cellular component)",
    "go-id",
    # InterPro
    "interpro",
    # Interaction
    "interactor",
    "comment(SUBUNIT)",
    # Publications
    "citation",
    "citationmapping",
    # Date of
    "created",
    "last-modified",
    "sequence-modified",
    "version(entry)",
    # Structure
    "3d",
    "feature(BETA STRAND)",
    "feature(HELIX)",
    "feature(TURN)",
    # Subcellular location
    "feature(SUBCELLULAR LOCATION)",
    "feature(INTRAMEMBRANE)",
    "feature(TOPOLOGICAL DOMAIN)",
    "feature(TRANSMEMBRANE)",
    # Miscellaneous
    "annotation score",
    "score",
    "features",
    "comment(CAUTION)",
    "comment(TISSUE SPECIFICITY)",
    "comment(GENERAL)",
    "keywords",
    "context",
    "existence",
    "tools",
    "reviewed",
    "feature",
    "families",
    "subcellular locations",
    "taxonomy",
    "version",
    "clusters",
    "comments",
    "database",
    "keyword-id",
    "pathway",
    "score",
    # Pathology & Biotech
    "comment(ALLERGEN)",
    "comment(BIOTECHNOLOGY)",
    "comment(DISRUPTION PHENOTYPE)",
    "comment(DISEASE)",
    "comment(PHARMACEUTICAL)",
    "comment(TOXIC DOSE)",
    # PTM / Processsing
    "comment(PTM)",
    "feature(CHAIN)",
    "feature(CROSS LINK)",
    "feature(DISULFIDE BOND)",
    "feature(GLYCOSYLATION)",
    "feature(INITIATOR METHIONINE)",
    "feature(LIPIDATION)",
    "feature(MODIFIED RESIDUE)",
    "feature(PEPTIDE)",
    "feature(PROPEPTIDE)",
    "feature(SIGNAL)",
    "feature(TRANSIT)",
    # Taxonomic lineage
    "lineage(all)",
    "lineage(SUPERKINGDOM)",
    "lineage(KINGDOM)",
    "lineage(SUBKINGDOM)",
    "lineage(SUPERPHYLUM)",
    "lineage(PHYLUM)",
    "lineage(SUBPHYLUM)",
    "lineage(SUPERCLASS)",
    "lineage(CLASS)",
    "lineage(SUBCLASS)",
    "lineage(INFRACLASS)",
    "lineage(SUPERORDER)",
    "lineage(ORDER)",
    "lineage(SUBORDER)",
    "lineage(INFRAORDER)",
    "lineage(PARVORDER)",
    "lineage(SUPERFAMILY)",
    "lineage(FAMILY)",
    "lineage(SUBFAMILY)",
    "lineage(TRIBE)",
    "lineage(SUBTRIBE)",
    "lineage(GENUS)",
    "lineage(SUBGENUS)",
    "lineage(SPECIES GROUP)",
    "lineage(SPECIES SUBGROUP)",
    "lineage(SPECIES)",
    "lineage(SUBSPECIES)",
    "lineage(VARIETAS)",
    "lineage(FORMA)",
    # Taxonomic identifier
    "lineage-id(all)",
    "lineage-id(SUPERKINGDOM)",
    "lineage-id(KINGDOM)",
    "lineage-id(SUBKINGDOM)",
    "lineage-id(SUPERPHYLUM)",
    "lineage-id(PHYLUM)",
    "lineage-id(SUBPHYLUM)",
    "lineage-id(SUPERCLASS)",
    "lineage-id(CLASS)",
    "lineage-id(SUBCLASS)",
    "lineage-id(INFRACLASS)",
    "lineage-id(SUPERORDER)",
    "lineage-id(ORDER)",
    "lineage-id(SUBORDER)",
    "lineage-id(INFRAORDER)",
    "lineage-id(PARVORDER)",
    "lineage-id(SUPERFAMILY)",
    "lineage-id(FAMILY)",
    "lineage-id(SUBFAMILY)",
    "lineage-id(TRIBE)",
    "lineage-id(SUBTRIBE)",
    "lineage-id(GENUS)",
    "lineage-id(SUBGENUS)",
    "lineage-id(SPECIES GROUP)",
    "lineage-id(SPECIES SUBGROUP)",
    "lineage-id(SPECIES)",
    "lineage-id(SUBSPECIES)",
    "lineage-id(VARIETAS)",
    "lineage-id(FORMA)",
    # Cross-references
    "database(db_abbrev)",
    "database(EMBL)",
]

In [3]:
# Entry names to look up, kept as a whitespace-separated block (eight per
# row) and split into a plain list of strings; order matches the original.
ls = """
ZAP70_HUMAN CBLB_HUMAN DC1I1_HUMAN DAPK3_HUMAN DCTN3_HUMAN TBG1_HUMAN MK03_HUMAN NEK2_HUMAN
PRKDC_HUMAN H33_HUMAN DC1I2_HUMAN DCTN2_HUMAN DCTN1_HUMAN DYHC1_HUMAN MARE1_HUMAN ARH_HUMAN
KIF2B_HUMAN DLRB2_HUMAN DYL2_HUMAN SMUF2_HUMAN DLRB1_HUMAN APC10_HUMAN MD1L1_HUMAN BIRC5_HUMAN
DC1L2_HUMAN PP1G_HUMAN CENPF_HUMAN PLK1_HUMAN PP1A_HUMAN DYL1_HUMAN ASPM_HUMAN KRIT1_HUMAN
KIF3B_HUMAN EVI5_HUMAN RFIP3_HUMAN TACC1_HUMAN SYUG_HUMAN JTB_HUMAN KPCD3_HUMAN LATS1_HUMAN
CHK2_HUMAN CRYAB_HUMAN GNAI2_HUMAN GNAI3_HUMAN 41_HUMAN MK01_HUMAN CTNB1_HUMAN ARL3_HUMAN
MP2K2_HUMAN CAPG_HUMAN IPP2_HUMAN CTCF_HUMAN GDIB_HUMAN DYN2_HUMAN ANX11_HUMAN BRCA2_HUMAN
LIMK1_HUMAN LIMK2_HUMAN IST1_HUMAN CALM1_HUMAN CALM2_HUMAN CALM3_HUMAN GNAI1_HUMAN MP2K1_HUMAN
PTN13_HUMAN AAPK1_HUMAN STK3_HUMAN ROCK1_HUMAN SNPC2_HUMAN KCC2B_HUMAN KCC2G_HUMAN DPYL1_HUMAN
KPCD1_HUMAN SEPT7_HUMAN CSPP1_HUMAN PKHG6_HUMAN CEP55_HUMAN EFHC1_HUMAN ZFY26_HUMAN MAEA_HUMAN
RFIP4_HUMAN SIR2_HUMAN SEP12_HUMAN MPLKI_HUMAN NEK7_HUMAN PDC6I_HUMAN WASF1_HUMAN CC124_HUMAN
MAP6_HUMAN CK5P2_HUMAN PKP4_HUMAN BARD1_HUMAN TS101_HUMAN AKAP9_HUMAN LZTS2_HUMAN BBC3_HUMAN
CDCA4_HUMAN KPCD2_HUMAN PLK3_HUMAN SAV1_HUMAN CENPJ_HUMAN RIC8A_HUMAN NDE1_HUMAN DTL_HUMAN
SPAST_HUMAN AURKC_HUMAN RUVB2_HUMAN CHMP3_HUMAN CLIC4_HUMAN MAP4_HUMAN TBA4A_HUMAN GPSM2_HUMAN
NUMA1_HUMAN KIF14_HUMAN ARHG2_HUMAN MKNK1_HUMAN SEPT9_HUMAN BRCA1_HUMAN PLK4_HUMAN CENPA_HUMAN
SUMO1_HUMAN PP2AA_HUMAN KIF2C_HUMAN RCC2_HUMAN NUDC_HUMAN ORC6_HUMAN CLIP1_HUMAN CENPE_HUMAN
BOREA_HUMAN AURKB_HUMAN INCE_HUMAN HIP1_HUMAN PITM1_HUMAN CTRO_HUMAN STX16_HUMAN OGT1_HUMAN
SGTA_HUMAN TM1L1_HUMAN SVIL_HUMAN STABP_HUMAN CLD1_HUMAN LMNA_HUMAN ITB1_HUMAN MYL1_HUMAN
PTMA_HUMAN ANXA2_HUMAN RHOC_HUMAN VIME_HUMAN TPM1_HUMAN BIP_HUMAN GTR1_HUMAN RALA_HUMAN
ENPL_HUMAN DESM_HUMAN LMNB1_HUMAN COF1_HUMAN MYL9_HUMAN WEE1_HUMAN GDIA_HUMAN STX2_HUMAN
NUP62_HUMAN UBP8_HUMAN CRK_HUMAN IQGA1_HUMAN EMD_HUMAN PLCD1_HUMAN KS6A3_HUMAN TERA_HUMAN
DEST_HUMAN ARP3_HUMAN ARP2_HUMAN RHOA_HUMAN ARF6_HUMAN RB11A_HUMAN RHOB_HUMAN RACK1_HUMAN
AKA12_HUMAN LMNB2_HUMAN ZO1_HUMAN GOGA2_HUMAN SSRP1_HUMAN DLG1_HUMAN RHG05_HUMAN BIRC2_HUMAN
CUL3_HUMAN DAG1_HUMAN KEAP1_HUMAN RBP1_HUMAN KS6A2_HUMAN KS6A1_HUMAN OCLN_HUMAN CHM1B_HUMAN
TEX14_HUMAN PLCD3_HUMAN SCPDL_HUMAN SYNE1_HUMAN SEPT1_HUMAN SSH1_HUMAN CPNE1_HUMAN SH3G1_HUMAN
VAMP8_HUMAN UN45A_HUMAN TBB1_HUMAN ECT2_HUMAN NCOA5_HUMAN PLCB1_HUMAN CYLD_HUMAN KI13B_HUMAN
""".split()


In [4]:
# Query terms of the form "id:<accession>". The accessions are kept as a
# whitespace-separated block (eight per row) and the shared "id:" prefix is
# applied once here; order matches the original list.
ls2 = ["id:" + accession for accession in """
O14576 O43293 O75935 P23258 P27361 P51955 P78527 P84243
Q13409 Q13561 Q14203 Q14204 Q15691 Q5SW96 Q8N4N8 Q8TF09
Q96FJ2 Q9HAU4 Q9NP97 Q9UM13 Q9Y6D9 O15392 O43237 P36873
P49454 P53350 P62136 P63167 Q8IZT6 O00522 O15066 O60447
O75154 O75410 O76070 O76095 O94806 O95835 O96017 P02511
P04899 P08754 P11171 P28482 P35222 P36405 P36507 P40121
P41236 P49711 P50395 P50570 P50995 P51587 P53667 P53671
P53990 P0DP23 P0DP24 P0DP25 P63096 Q02750 Q12923 Q13131
Q13188 Q13464 Q13487 Q13554 Q13555 Q14194 Q15139 Q16181
Q1MSJ5 Q3KR16 Q53EZ4 Q5JVL4 Q68DK2 Q7L5Y9 Q86YS3 Q8IXJ6
Q8IYM1 Q8TAP9 Q8TDX7 Q8WUM4 Q92558 Q96CT7 Q96JE9 Q96SN8
Q99569 Q99728 Q99816 Q99996 Q9BRK4 Q9BXH1 Q9BXL8 Q9BZL6
Q9H4B4 Q9H4B6 Q9HC77 Q9NPQ8 Q9NXR1 Q9NZJ0 Q9UBP0 Q9UQB9
Q9Y230 Q9Y3E7 Q9Y696 P27816 P68366 P81274 Q14980 Q15058
Q92974 Q9BUB5 Q9UHD8 P38398 O00444 P49450 P63165 P67775
Q99661 Q9P258 Q9Y266 Q9Y5N6 P30622 Q02224 Q53HL2 Q96GD4
Q9NQS7 O00291 O00562 O14578 O14662 O15294 O43765 O75674
O95425 O95630 O95832 P02545 P05556 P05976 P06454 P07355
P08134 P08670 P09493 P11021 P11166 P11233 P14625 P17661
P20700 P23528 P24844 P30291 P31150 P32856 P37198 P40818
P46108 P46940 P50402 P51178 P51812 P55072 P60981 P61158
P61160 P61586 P62330 P62491 P62745 P63244 Q02952 Q03252
Q07157 Q08379 Q08945 Q12959 Q13017 Q13490 Q13618 Q14118
Q14145 Q15311 Q15349 Q15418 Q16625 Q7LBR1 Q8IWB6 Q8N3E9
Q8NBX0 Q8NF91 Q8WYJ6 Q8WYL5 Q99829 Q99961 Q9BV40 Q9H3U1
Q9H4B7 Q9H8V3 Q9HCD5 Q9NQ66 Q9NQC7 Q9NQT8 Q9NQW6 Q9P2Y5
Q9UDY2 Q9UFW8 Q9UHB6 Q9UIF7 Q9UKX2 Q9ULW0 Q9UM54 Q9UN37
Q9UPT5 Q9Y371 Q9Y490 Q9Y5K6 O43663 O95235 O95239 P33176
P42345
""".split()]


In [None]:
from IPython.display import clear_output
def parse_uniprot(entries):
    """Query UniProt once per entry and collect the tab-separated results.

    Parameters
    ----------
    entries : iterable of str
        Search terms, passed one at a time to ``UniProt.search``
        (for example ``"id:P42345"``).

    Returns
    -------
    pandas.DataFrame
        Parsed rows of every successful lookup, concatenated in input order.
        Empty when nothing could be retrieved.

    Notes
    -----
    A lookup that raises, or that yields no usable text, does not abort the
    run: the entry is recorded and reported after the loop, so the report is
    not wiped by the progress display's ``clear_output``.
    """
    entries = list(entries)  # accept any iterable; len() is needed for progress
    u = UniProt(verbose=False)
    columns = ",".join(_valid_columns)  # loop-invariant, build it once
    frames = []
    failed = []

    for position, entry in enumerate(entries, start=1):
        clear_output(wait=True)
        try:
            z = u.search(entry, frmt="tab", columns=columns, limit=1, maxTrials=5)
            # NOTE(review): the client presumably returns a numeric status
            # code or nothing when the request fails -- confirm against the
            # bioservices documentation.
            if z is None or isinstance(z, int):
                raise ValueError("no result returned: {!r}".format(z))
            text = str(z)
            if not text.strip():
                raise ValueError("empty result")
            frames.append(pd.read_csv(io.StringIO(text), sep="\t"))
        except Exception as err:  # keep going, but never hide Ctrl-C
            failed.append((entry, err))
        # enumerate() instead of entries.index(): O(1), correct for duplicate
        # entries, and the last iteration reports 100 %.
        print("Progress:", np.round(position / len(entries) * 100, 2), "%")

    for entry, err in failed:
        print(entry, "->", repr(err))

    if not frames:
        return pd.DataFrame()
    # One concat at the end avoids quadratic re-copying inside the loop.
    return pd.concat(frames, ignore_index=True, sort=False)

In [None]:
# Time the per-entry lookup over ls2; the result is bound to `z`.
%time z = parse_uniprot(ls2)

In [5]:
# DataFrame trial: time a bulk lookup of ls2 through the client's get_df.
u = UniProt(verbose=False)

%time u.get_df(ls2)

0
1
2
CPU times: user 869 ms, sys: 118 ms, total: 987 ms
Wall time: 36.1 s


Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,...,Taxonomic lineage IDs (GENUS),Taxonomic lineage IDs (SUBGENUS),Taxonomic lineage IDs (SPECIES GROUP),Taxonomic lineage IDs (SPECIES SUBGROUP),Taxonomic lineage IDs (SPECIES),Taxonomic lineage IDs (SUBSPECIES),Taxonomic lineage IDs (VARIETAS),Taxonomic lineage IDs (FORMA),Cross-reference (db_abbrev),Cross-reference (EMBL)
0,Q9NQW6,ANLN_HUMAN,[ANLN],ANLN,,,,Homo sapiens (Human),9606,Anillin,...,9605,,,,,,,,,AF273437;BC034692;BC070066;CR936650;AK001468;A...
1,Q02952,AKA12_HUMAN,[AKAP12 AKAP250],AKAP12,AKAP250,,,Homo sapiens (Human),9606,A-kinase anchor protein 12 (AKAP-12) (A-kinase...,...,9605,,,,,,,,,U81607;AB003476;AB210003;CR749527;AF001504;M96...
2,Q9Y3E7,CHMP3_HUMAN,[CHMP3 CGI149 NEDF VPS24 CGI-149],CHMP3,CGI149 NEDF VPS24,,CGI-149,Homo sapiens (Human),9606,Charged multivesicular body protein 3 (Chromat...,...,9605,,,,,,,,,AF219226;AY364249;AF151907;AK290725;AK294389;A...
3,P36405,ARL3_HUMAN,[ARL3 ARFL3],ARL3,ARFL3,,,Homo sapiens (Human),9606,ADP-ribosylation factor-like protein 3,...,9605,,,,,,,,,U07151;AF493889;AK312525;CR407637;BC009841;
4,Q8IZT6,ASPM_HUMAN,[ASPM MCPH5],ASPM,MCPH5,,,Homo sapiens (Human),9606,Abnormal spindle-like microcephaly-associated ...,...,9605,,,,,,,,,AF509326;AY099890;AY099891;AY099892;AY099893;A...
5,P61158,ARP3_HUMAN,[ACTR3 ARP3],ACTR3,ARP3,,,Homo sapiens (Human),9606,Actin-related protein 3 (Actin-like protein 3),...,9605,,,,,,,,,AF006083;AF127773;AK312659;AC110769;CH471103;B...
6,P30622,CLIP1_HUMAN,[CLIP1 CYLN1 RSN],CLIP1,CYLN1 RSN,,,Homo sapiens (Human),9606,CAP-Gly domain-containing linker protein 1 (Cy...,...,9605,,,,,,,,,X64838;M97501;BC114213;BC117209;BC126305;
7,P62330,ARF6_HUMAN,[ARF6],ARF6,,,,Homo sapiens (Human),9606,ADP-ribosylation factor 6,...,9605,,,,,,,,,M57763;AY296206;AF047432;AF493885;AK313790;CR5...
8,P06454,PTMA_HUMAN,[PTMA TMSA],PTMA,TMSA,,,Homo sapiens (Human),9606,Prothymosin alpha [Cleaved into: Prothymosin a...,...,9605,,,,,,,,,M14630;M14483;M67480;J04797;M67480;J04797;M267...
9,P36873,PP1G_HUMAN,[PPP1CC],PPP1CC,,,,Homo sapiens (Human),9606,Serine/threonine-protein phosphatase PP1-gamma...,...,9605,,,,,,,,,X74008;BC014073;L07395;


In [None]:
# Preview the first rows of `z`, the value returned by parse_uniprot(ls2).
z.head()

In [None]:
# Rebind `u` to a client created with verbose=True (the earlier trial used verbose=False).
u = UniProt(verbose=True)

In [None]:
# Bulk lookup for the entry-name list `ls`, with nChunk and organism spelled out.
# NOTE(review): presumably nChunk is the number of entries sent per request -- confirm.
u.get_df(ls, nChunk=100, organism=None)