# Análise da sequência e das features presentes no NCBI

In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqFeature

In [2]:
record_PNPLA3 = SeqIO.read("PNPLA3.gb", "genbank")
record_PNPLA3

SeqRecord(seq=Seq('CAGGGTCCATACATACAGCGGAATATTATTCAGCCCTAAAAAGGAAGGAAATTC...GAT'), id='NG_008631.1', name='NG_008631', description='Homo sapiens patatin like phospholipase domain containing 3 (PNPLA3), RefSeqGene on chromosome 22', dbxrefs=[])

1º verificar as anotações correspondentes aos genes de interesse

In [3]:
tam=len(record_PNPLA3)
print(" tamanho da sequencia: ", tam)
#print(record_PNPLA3.seq)

 tamanho da sequencia:  30830


In [21]:
print("ID do registo: ",record_PNPLA3.id)
print("Nome do registo: ",record_PNPLA3.name)
print("Descrição: ", record_PNPLA3.description)

ID do registo:  NG_008631.1
Nome do registo:  NG_008631
Descrição:  Homo sapiens patatin like phospholipase domain containing 3 (PNPLA3), RefSeqGene on chromosome 22


In [5]:
print("Anotações: ",record_PNPLA3.annotations)

Anotações:  {'molecule_type': 'DNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '18-MAY-2020', 'accessions': ['NG_008631'], 'sequence_version': 1, 'keywords': ['RefSeq', 'RefSeqGene'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'comment': 'REVIEWED REFSEQ: This record has been curated by NCBI staff. The\nreference sequence was derived from Z97055.1, AL023654.2 and\nAL035398.19.\nThis sequence is a reference standard in the RefSeqGene project.\nSummary: The protein encoded by this gene is a triacylglycerol\nlipase that mediates triacylglycerol hydrolysis in adipocytes. The\nencoded protein, which appears to be membrane bound, may be\ninvolved in the balance of energy usage/storage in adipocytes.\n[provided by RefSeq, Jul 2008].'}


In [6]:
print("tipo de molecula: ", record_PNPLA3.annotations["molecule_type"])
print("tpologia: ", record_PNPLA3.annotations["topology"])
print("organismo: ", record_PNPLA3.annotations["organism"])
print("taxonomia: ", record_PNPLA3.annotations["taxonomy"])

tipo de molecula:  DNA
tpologia:  linear
organismo:  Homo sapiens
taxonomia:  ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']


2º  verificar e analisar a informação complementar fornecida pela lista de features e seus qualifiers

In [7]:
print(record_PNPLA3.features)

[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(30830), strand=1), type='source'), SeqFeature(FeatureLocation(ExactPosition(5066), ExactPosition(28844), strand=1), type='gene'), SeqFeature(CompoundLocation([FeatureLocation(ExactPosition(5066), ExactPosition(5360), strand=1), FeatureLocation(ExactPosition(8196), ExactPosition(8429), strand=1), FeatureLocation(ExactPosition(10085), ExactPosition(10151), strand=1), FeatureLocation(ExactPosition(14139), ExactPosition(14349), strand=1), FeatureLocation(ExactPosition(15867), ExactPosition(15928), strand=1), FeatureLocation(ExactPosition(18312), ExactPosition(18534), strand=1), FeatureLocation(ExactPosition(21254), ExactPosition(21387), strand=1), FeatureLocation(ExactPosition(25952), ExactPosition(26057), strand=1), FeatureLocation(ExactPosition(27415), ExactPosition(28844), strand=1)], 'join'), type='mRNA', location_operator='join'), SeqFeature(FeatureLocation(ExactPosition(5066), ExactPosition(5360), strand=1), type='exon'), Se

In [8]:
for feat in record_PNPLA3.features:
    print(feat)

type: source
location: [0:30830](+)
qualifiers:
    Key: chromosome, Value: ['22']
    Key: db_xref, Value: ['taxon:9606']
    Key: map, Value: ['22q13.31']
    Key: mol_type, Value: ['genomic DNA']
    Key: organism, Value: ['Homo sapiens']

type: gene
location: [5066:28844](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:80339', 'HGNC:HGNC:18590', 'MIM:609567']
    Key: gene, Value: ['PNPLA3']
    Key: gene_synonym, Value: ['ADPN; C22orf20; iPLA(2)epsilon']
    Key: note, Value: ['patatin like phospholipase domain containing 3']

type: mRNA
location: join{[5066:5360](+), [8196:8429](+), [10085:10151](+), [14139:14349](+), [15867:15928](+), [18312:18534](+), [21254:21387](+), [25952:26057](+), [27415:28844](+)}
qualifiers:
    Key: db_xref, Value: ['GeneID:80339', 'HGNC:HGNC:18590', 'MIM:609567']
    Key: gene, Value: ['PNPLA3']
    Key: gene_synonym, Value: ['ADPN; C22orf20; iPLA(2)epsilon']
    Key: product, Value: ['patatin like phospholipase domain containing 3']
    Key: transcr

In [9]:
tam_feature=(len(record_PNPLA3.features))
print(tam_feature , "features")

19 features


In [10]:
feature=[]
for k in range(tam_feature):
    feature.append(k)
for i in feature:
    print(record_PNPLA3.features[i].location)
    print(record_PNPLA3.features[i].type)

[0:30830](+)
source
[5066:28844](+)
gene
join{[5066:5360](+), [8196:8429](+), [10085:10151](+), [14139:14349](+), [15867:15928](+), [18312:18534](+), [21254:21387](+), [25952:26057](+), [27415:28844](+)}
mRNA
[5066:5360](+)
exon
join{[5173:5360](+), [8196:8429](+), [10085:10151](+), [14139:14349](+), [15867:15928](+), [18312:18534](+), [21254:21387](+), [25952:26057](+), [27415:27644](+)}
CDS
[5212:5230](+)
misc_feature
[5296:5359](+)
misc_feature
[5305:5320](+)
misc_feature
[8273:8276](+)
misc_feature
[14148:14157](+)
misc_feature
[18392:18395](+)
misc_feature
[8196:8429](+)
exon
[10085:10151](+)
exon
[14139:14349](+)
exon
[15867:15928](+)
exon
[18312:18534](+)
exon
[21254:21387](+)
exon
[25952:26057](+)
exon
[27415:28844](+)
exon


In [11]:
featsource=[]
for i in range(tam_feature):
    if record_PNPLA3.features[i].type=="source":
        featsource.append(i)
for k in featsource:
    #print(record_PNPLA3.features[k].qualifiers)
    print("organismo: ",record_PNPLA3.features[k].qualifiers['organism'])
    print("tipo de molécula: ",record_PNPLA3.features[k].qualifiers['mol_type'])
    print("referência externa: ",record_PNPLA3.features[k].qualifiers['db_xref'])
    print("cromossoma: ", record_PNPLA3.features[k].qualifiers['chromosome'])

organismo:  ['Homo sapiens']
tipo de molécula:  ['genomic DNA']
referência externa:  ['taxon:9606']
cromossoma:  ['22']


In [12]:
s=0
genes=[]
for j in range(tam_feature):
    if record_PNPLA3.features[j].type=="gene":
        s=s+1
        genes.append({(j+1):record_PNPLA3.features[j].qualifiers["gene"]}) #j+1 pq a contagem começa em zero
print("existe ",s, " gene(s) anotado(s) no registo")
print(genes)

existe  1  gene(s) anotado(s) no registo
[{2: ['PNPLA3']}]


In [13]:
featgene=[]
for i in range(tam_feature):
    if record_PNPLA3.features[i].type=="gene":
        featgene.append(i)
print("Genes anotados: ")
for k in featgene:
    print("")
    print("gene: ", record_PNPLA3.features[k].qualifiers['gene'])
    print("gene sinónimo: ",record_PNPLA3.features[k].qualifiers['gene_synonym'])
    print("nota: ",record_PNPLA3.features[k].qualifiers['note'])
    print("referências externas: ",record_PNPLA3.features[k].qualifiers['db_xref'])

Genes anotados: 

gene:  ['PNPLA3']
gene sinónimo:  ['ADPN; C22orf20; iPLA(2)epsilon']
nota:  ['patatin like phospholipase domain containing 3']
referências externas:  ['GeneID:80339', 'HGNC:HGNC:18590', 'MIM:609567']


In [14]:
soma=0
mRNA=[]
for j in range(tam_feature):
    if record_PNPLA3.features[j].type=="mRNA":
        soma=soma+1
        mRNA.append({(j+1):record_PNPLA3.features[j].qualifiers["gene"]}) #j+1 pq a contagem começa em zero
print("existe ",s, " mRNA(s) anotado(s) no registo")
print(genes)

existe  1  mRNA(s) anotado(s) no registo
[{2: ['PNPLA3']}]


In [15]:
featmRNA=[]
for i in range(tam_feature):
    if record_PNPLA3.features[i].type=="mRNA":
        featmRNA.append(i)
print("mRNAs anotados: ")
for k in featmRNA:
    print("")
    print("gene: ", record_PNPLA3.features[k].qualifiers['gene'])
    print("gene sinónimo: ",record_PNPLA3.features[k].qualifiers['gene_synonym'])
    print("id da transcriptação: ",record_PNPLA3.features[k].qualifiers['transcript_id'])
    print("produto/ significado biológico: ",record_PNPLA3.features[k].qualifiers['product'])
    print("referências externas: ",record_PNPLA3.features[k].qualifiers['db_xref'])

mRNAs anotados: 

gene:  ['PNPLA3']
gene sinónimo:  ['ADPN; C22orf20; iPLA(2)epsilon']
id da transcriptação:  ['NM_025225.3']
produto/ significado biológico:  ['patatin like phospholipase domain containing 3']
referências externas:  ['GeneID:80339', 'HGNC:HGNC:18590', 'MIM:609567']


In [22]:
som=0
exon=[]
for j in range(tam_feature):
    if record_PNPLA3.features[j].type=="exon":
        som=som+1
        exon.append({(j+1):record_PNPLA3.features[j].location}) #j+1 pq a contagem começa em zero
print("nº de exões: ",som, )
print(exon)

nº de exões:  9
[{4: FeatureLocation(ExactPosition(5066), ExactPosition(5360), strand=1)}, {12: FeatureLocation(ExactPosition(8196), ExactPosition(8429), strand=1)}, {13: FeatureLocation(ExactPosition(10085), ExactPosition(10151), strand=1)}, {14: FeatureLocation(ExactPosition(14139), ExactPosition(14349), strand=1)}, {15: FeatureLocation(ExactPosition(15867), ExactPosition(15928), strand=1)}, {16: FeatureLocation(ExactPosition(18312), ExactPosition(18534), strand=1)}, {17: FeatureLocation(ExactPosition(21254), ExactPosition(21387), strand=1)}, {18: FeatureLocation(ExactPosition(25952), ExactPosition(26057), strand=1)}, {19: FeatureLocation(ExactPosition(27415), ExactPosition(28844), strand=1)}]


In [16]:
so=0
seq_cod=[]
for j in range(tam_feature):
    if record_PNPLA3.features[j].type=="CDS":
        so=so+1
        seq_cod.append({(j+1):record_PNPLA3.features[j].qualifiers["gene"]}) 
print("existe ",so, " sequência(s) codificante(s) anotada(s) no registo")
print(seq_cod)

existe  1  sequência(s) codificante(s) anotada(s) no registo
[{5: ['PNPLA3']}]


In [23]:
featCDS=[]
for i in range(tam_feature):
    if record_PNPLA3.features[i].type=="CDS":
        featCDS.append(i)
print("Proteínas codificadas")
for k in featCDS:
    print(" ")
    print("gene: ", record_PNPLA3.features[k].qualifiers['gene'])
    print("gene sinónimo: ",record_PNPLA3.features[k].qualifiers['gene_synonym'])
    print("id da proteína: ",record_PNPLA3.features[k].qualifiers['protein_id'])
    print("codão Start: ",record_PNPLA3.features[k].qualifiers['codon_start'])
    print("produto/ significado biológico: ",record_PNPLA3.features[k].qualifiers['product'])
    print("nota: ",record_PNPLA3.features[k].qualifiers['note'])
    print("referências externas: ",record_PNPLA3.features[k].qualifiers['db_xref'])
    print("sequência de aminoácidos transcrita: ",record_PNPLA3.features[k].qualifiers['translation'])

Proteínas codificadas
 
gene:  ['PNPLA3']
gene sinónimo:  ['ADPN; C22orf20; iPLA(2)epsilon']
id da proteína:  ['NP_079501.2']
codão Start:  ['1']
produto/ significado biológico:  ['1-acylglycerol-3-phosphate O-acyltransferase PNPLA3']
nota:  ['adiponutrin; patatin-like phospholipase domain-containing protein 3; acylglycerol O-acyltransferase; calcium-independent phospholipase A2-epsilon; iPLA2epsilon; iPLA2-epsilon; acylglycerol transacylase; lysophosphatidic acid acyltransferase']
referências externas:  ['CCDS:CCDS14054.1', 'GeneID:80339', 'HGNC:HGNC:18590', 'MIM:609567']
sequência de aminoácidos transcrita:  ['MYDAERGWSLSFAGCGFLGFYHVGATRCLSEHAPHLLRDARMLFGASAGALHCVGVLSGIPLEQTLQVLSDLVRKARSRNIGIFHPSFNLSKFLRQGLCKCLPANVHQLISGKIGISLTRVSDGENVLVSDFRSKDEVVDALVCSCFIPFYSGLIPPSFRGVRYVDGGVSDNVPFIDAKTTITVSPFYGEYDICPKVKSTNFLHVDITKLSLRLCTGNLYLLSRAFVPPDLKVLGEICLRGYLDAFRFLEEKGICNRPQPGLKSSSEGMDPEVAMPSWANMSLDSSPESAALAVRLEGDELLDHLRLSILPWDESILDTLSPRLATALSEEMKDKGGYMSKICNLLPIRIMSYVMLPCTLPVESAIAIVQRLVTWLPDMP