# Análise da sequência e das features presentes no NCBI

In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqFeature

In [2]:
record_TFAP2B = SeqIO.read("TFAP2B.gb", "genbank")
record_TFAP2B

SeqRecord(seq=Seq('CCTTGTATTTATGAGACACTTCAGCTCTGATGCTATACATTGTATGTGTTTATT...CCC'), id='NG_008438.1', name='NG_008438', description='Homo sapiens transcription factor AP-2 beta (TFAP2B), RefSeqGene on chromosome 6', dbxrefs=[])

1º verificar as anotações correspondentes aos genes de interesse

In [3]:
tam=len(record_TFAP2B)
print(" tamanho da sequencia: ", tam)
#print(record_TFAP2B.seq)

 tamanho da sequencia:  35888


In [4]:
print("ID: ",record_TFAP2B.id)
print("Nome: ",record_TFAP2B.name)
print("Descrição: ", record_TFAP2B.description)

ID:  NG_008438.1
Nome:  NG_008438
Descrição:  Homo sapiens transcription factor AP-2 beta (TFAP2B), RefSeqGene on chromosome 6


In [5]:
print("Anotações: ",record_TFAP2B.annotations)

Anotações:  {'molecule_type': 'DNA', 'topology': 'linear', 'data_file_division': 'PRI', 'date': '21-AUG-2022', 'accessions': ['NG_008438'], 'sequence_version': 1, 'keywords': ['RefSeq', 'RefSeqGene'], 'source': 'Homo sapiens (human)', 'organism': 'Homo sapiens', 'taxonomy': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo'], 'references': [Reference(title='Mutations in TFAP2B cause Char syndrome, a familial form of patent ductus arteriosus', ...), Reference(title='Char Syndrome', ...)], 'comment': 'REVIEWED REFSEQ: This record has been curated by NCBI staff. The\nreference sequence was derived from AL031224.1 and AL049693.16.\nThis sequence is a reference standard in the RefSeqGene project.\nSummary: This gene encodes a member of the AP-2 family of\ntranscription factors. AP-2 proteins form homo- or hetero-dimers\nwith other AP-2 family members and bind 

In [6]:
print("tipo de molecula: ", record_TFAP2B.annotations["molecule_type"])
print("tpologia: ", record_TFAP2B.annotations["topology"])
print("organismo: ", record_TFAP2B.annotations["organism"])
print("taxonomia: ", record_TFAP2B.annotations["taxonomy"])

tipo de molecula:  DNA
tpologia:  linear
organismo:  Homo sapiens
taxonomia:  ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']


2º  verificar e analisar a informação complementar fornecida pela lista de features e seus qualifiers

In [7]:
print(record_TFAP2B.features)

[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(35888), strand=1), type='source'), SeqFeature(FeatureLocation(ExactPosition(5145), ExactPosition(33894), strand=1), type='gene'), SeqFeature(CompoundLocation([FeatureLocation(ExactPosition(5145), ExactPosition(5247), strand=1), FeatureLocation(ExactPosition(9681), ExactPosition(10140), strand=1), FeatureLocation(ExactPosition(14893), ExactPosition(14954), strand=1), FeatureLocation(ExactPosition(22335), ExactPosition(22555), strand=1), FeatureLocation(ExactPosition(24249), ExactPosition(24368), strand=1), FeatureLocation(ExactPosition(26430), ExactPosition(26572), strand=1), FeatureLocation(ExactPosition(29366), ExactPosition(33894), strand=1)], 'join'), type='mRNA', location_operator='join'), SeqFeature(FeatureLocation(ExactPosition(5145), ExactPosition(5247), strand=1), type='exon'), SeqFeature(CompoundLocation([FeatureLocation(ExactPosition(5166), ExactPosition(5247), strand=1), FeatureLocation(ExactPosition(9681), ExactPos

In [8]:
for feat in record_TFAP2B.features:
    print(feat)

type: source
location: [0:35888](+)
qualifiers:
    Key: chromosome, Value: ['6']
    Key: db_xref, Value: ['taxon:9606']
    Key: map, Value: ['6p12.3']
    Key: mol_type, Value: ['genomic DNA']
    Key: organism, Value: ['Homo sapiens']

type: gene
location: [5145:33894](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:7021', 'HGNC:HGNC:11743', 'MIM:601601']
    Key: gene, Value: ['TFAP2B']
    Key: gene_synonym, Value: ['AP-2B; AP-2beta; AP2-B; PDA2']
    Key: note, Value: ['transcription factor AP-2 beta']

type: mRNA
location: join{[5145:5247](+), [9681:10140](+), [14893:14954](+), [22335:22555](+), [24249:24368](+), [26430:26572](+), [29366:33894](+)}
qualifiers:
    Key: db_xref, Value: ['GeneID:7021', 'HGNC:HGNC:11743', 'MIM:601601']
    Key: gene, Value: ['TFAP2B']
    Key: gene_synonym, Value: ['AP-2B; AP-2beta; AP2-B; PDA2']
    Key: product, Value: ['transcription factor AP-2 beta']
    Key: transcript_id, Value: ['NM_003221.4']

type: exon
location: [5145:5247](+)
qualifie

In [9]:
tam_feature=(len(record_TFAP2B.features))
print(tam_feature , "features")

14 features


In [10]:
feature=[]
for k in range(tam_feature):
    feature.append(k)
for i in feature:
    print(record_TFAP2B.features[i].location)
    print(record_TFAP2B.features[i].type)

[0:35888](+)
source
[5145:33894](+)
gene
join{[5145:5247](+), [9681:10140](+), [14893:14954](+), [22335:22555](+), [24249:24368](+), [26430:26572](+), [29366:33894](+)}
mRNA
[5145:5247](+)
exon
join{[5166:5247](+), [9681:10140](+), [14893:14954](+), [22335:22555](+), [24249:24368](+), [26430:26572](+), [29366:29667](+)}
CDS
[9687:10017](+)
misc_feature
[22505:22508](+)
misc_feature
[29586:29664](+)
misc_feature
[9681:10140](+)
exon
[14893:14954](+)
exon
[22335:22555](+)
exon
[24249:24368](+)
exon
[26430:26572](+)
exon
[29366:33894](+)
exon


In [11]:
featsource=[]
for i in range(tam_feature):
    if record_TFAP2B.features[i].type=="source":
        featsource.append(i)
for k in featsource:
    #print(record_TFAP2B.features[k].qualifiers)
    print("organismo: ",record_TFAP2B.features[k].qualifiers['organism'])
    print("tipo de molécula: ",record_TFAP2B.features[k].qualifiers['mol_type'])
    print("referência externa: ",record_TFAP2B.features[k].qualifiers['db_xref'])
    print("cromossoma: ", record_TFAP2B.features[k].qualifiers['chromosome'])

organismo:  ['Homo sapiens']
tipo de molécula:  ['genomic DNA']
referência externa:  ['taxon:9606']
cromossoma:  ['6']


In [12]:
s=0
genes=[]
for j in range(tam_feature):
    if record_TFAP2B.features[j].type=="gene":
        s=s+1
        genes.append({(j+1):record_TFAP2B.features[j].qualifiers["gene"]}) #j+1 pq a contagem começa em zero
print("existe ",s, " gene(s) anotado(s) no registo")
print(genes)

existe  1  gene(s) anotado(s) no registo
[{2: ['TFAP2B']}]


In [13]:
featgene=[]
for i in range(tam_feature):
    if record_TFAP2B.features[i].type=="gene":
        featgene.append(i)
print("Genes anotados: ")
for k in featgene:
    print("")
    print("gene: ", record_TFAP2B.features[k].qualifiers['gene'])
    print("gene sinónimo: ",record_TFAP2B.features[k].qualifiers['gene_synonym'])
    print("nota: ",record_TFAP2B.features[k].qualifiers['note'])
    print("referências externas: ",record_TFAP2B.features[k].qualifiers['db_xref'])

Genes anotados: 

gene:  ['TFAP2B']
gene sinónimo:  ['AP-2B; AP-2beta; AP2-B; PDA2']
nota:  ['transcription factor AP-2 beta']
referências externas:  ['GeneID:7021', 'HGNC:HGNC:11743', 'MIM:601601']


In [22]:
soma=0
mRNA=[]
for j in range(tam_feature):
    if record_TFAP2B.features[j].type=="mRNA":
        soma=soma+1
        mRNA.append({(j+1):record_TFAP2B.features[j].qualifiers["gene"]}) #j+1 pq a contagem começa em zero
print("existe ",soma, " mRNA(s) anotado(s) no registo")
print(genes)

existe  1  mRNA(s) anotado(s) no registo
[{2: ['TFAP2B']}]


In [23]:
featmRNA=[]
for i in range(tam_feature):
    if record_TFAP2B.features[i].type=="mRNA":
        featmRNA.append(i)
print("mRNAs anotados: ")
for k in featmRNA:
    print("")
    print("gene: ", record_TFAP2B.features[k].qualifiers['gene'])
    print("gene sinónimo: ",record_TFAP2B.features[k].qualifiers['gene_synonym'])
    print("id da transcriptação: ",record_TFAP2B.features[k].qualifiers['transcript_id'])
    print("produto/ significado biológico: ",record_TFAP2B.features[k].qualifiers['product'])
    print("referências externas: ",record_TFAP2B.features[k].qualifiers['db_xref'])

mRNAs anotados: 

gene:  ['TFAP2B']
gene sinónimo:  ['AP-2B; AP-2beta; AP2-B; PDA2']
id da transcriptação:  ['NM_003221.4']
produto/ significado biológico:  ['transcription factor AP-2 beta']
referências externas:  ['GeneID:7021', 'HGNC:HGNC:11743', 'MIM:601601']


In [29]:
som=0
exon=[]
for j in range(tam_feature):
    if record_TFAP2B.features[j].type=="exon":
        som=som+1
        exon.append({(j+1):record_TFAP2B.features[j].location}) #j+1 pq a contagem começa em zero
print("nº de exões: ",som)
print(exon)

nº de exões:  7
[{4: FeatureLocation(ExactPosition(5145), ExactPosition(5247), strand=1)}, {9: FeatureLocation(ExactPosition(9681), ExactPosition(10140), strand=1)}, {10: FeatureLocation(ExactPosition(14893), ExactPosition(14954), strand=1)}, {11: FeatureLocation(ExactPosition(22335), ExactPosition(22555), strand=1)}, {12: FeatureLocation(ExactPosition(24249), ExactPosition(24368), strand=1)}, {13: FeatureLocation(ExactPosition(26430), ExactPosition(26572), strand=1)}, {14: FeatureLocation(ExactPosition(29366), ExactPosition(33894), strand=1)}]


In [25]:
so=0
seq_cod=[]
for j in range(tam_feature):
    if record_TFAP2B.features[j].type=="CDS":
        so=so+1
        seq_cod.append({(j+1):record_TFAP2B.features[j].qualifiers["gene"]}) 
print("existe ",so, " sequência(s) codificante(s) anotada(s) no registo")
print(seq_cod)

existe  1  sequência(s) codificante(s) anotada(s) no registo
[{5: ['TFAP2B']}]


In [26]:
featCDS=[]
for i in range(tam_feature):
    if record_TFAP2B.features[i].type=="CDS":
        featCDS.append(i)
print("Proteínas codificadas")
for k in featCDS:
    print(" ")
    print("gene: ", record_TFAP2B.features[k].qualifiers['gene'])
    print("gene sinónimo: ",record_TFAP2B.features[k].qualifiers['gene_synonym'])
    print("id da proteína: ",record_TFAP2B.features[k].qualifiers['protein_id'])
    print("produto/ significado biológico: ",record_TFAP2B.features[k].qualifiers['product'])
    print("nota: ",record_TFAP2B.features[k].qualifiers['note'])
    print("referências externas: ",record_TFAP2B.features[k].qualifiers['db_xref'])
    print("sequência de aminoácidos transcrita: ",record_TFAP2B.features[k].qualifiers['translation'])

Proteínas codificadas
 
gene:  ['TFAP2B']
gene sinónimo:  ['AP-2B; AP-2beta; AP2-B; PDA2']
id da proteína:  ['NP_003212.2']
produto/ significado biológico:  ['transcription factor AP-2-beta']
nota:  ['activating enhancer binding protein 2 beta; AP2-beta']
referências externas:  ['CCDS:CCDS4934.2', 'GeneID:7021', 'HGNC:HGNC:11743', 'MIM:601601']
sequência de aminoácidos transcrita:  ['MHSPPRDQAAIMLWKLVENVKYEDIYEDRHDGVPSHSSRLSQLGSVSQGPYSSAPPLSHTPSSDFQPPYFPPPYQPLPYHQSQDPYSHVNDPYSLNPLHQPQQHPWGQRQRQEVGSEAGSLLPQPRAALPQLSGLDPRRDYHSVRRPDVLLHSAHHGLDAGMGDSLSLHGLGHPGMEDVQSVEDANNSGMNLLDQSVIKKVPVPPKSVTSLMMNKDGFLGGMSVNTGEVFCSVPGRLSLLSSTSKYKVTVGEVQRRLSPPECLNASLLGGVLRRAKSKNGGRSLRERLEKIGLNLPAGRRKAANVTLLTSLVEGEAVHLARDFGYICETEFPAKAVSEYLNRQHTDPSDLHSRKNMLLATKQLCKEFTDLLAQDRTPIGNSRPSPILEPGIQSCLTHFSLITHGFGAPAICAALTALQNYLTEALKGMDKMFLNNTTTNRHTSGEGPGSKTGDKEEKHRK']
