Búsqueda de secuencias codificantes de proteíanas asociadas a Prolina Deshidrogenasa en el género *Phaseolus*. Esta proteína está involucrada en la formación de capas celulares entre en parénquima externo e interno de la vaina que son más fibrosas y fuertemente lignificadas. 

0. Preparación del entorno

In [4]:
!pip install Bio
!pip install pyrodigal
!pip install pycirclize
!pip install pyrodigal

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import credentials
import matplotlib.pyplot as plt
import numpy  as np
import pandas as pd
import pyrodigal
import requests
import seaborn as sns
import subprocess
import sys
from Bio import SeqIO
from Bio import Entrez
from io                 import StringIO
from matplotlib.patches import Patch
from pycirclize         import Circos
from pycirclize.parser  import Gff
from requests.adapters  import HTTPAdapter, Retry

1. Obtención de una secuencia genómica

Se va a utilizar el ensamblaje "COL_Plunatus_1.0" y se va a buscar en el cromosoma Pl03 con número de acceso JAAFYQ010000003.1. Inicialmente se encontró el número de acceso CM023859.1 pero no funciona para este código ya en el archivo resultante no aparece la secuencia sino una referencia a ella.

In [6]:
accession = "JAAFYQ010000003.1"
genome = Entrez.efetch(db="nucleotide",
                       id=accession,
                       format="gb",
                       rettype="text")
record = SeqIO.read(genome, "genbank")
chr_length = len(record.seq)

In [7]:
record

SeqRecord(seq=Seq('AAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAACCCTAAACCC...GGG'), id='JAAFYQ010000003.1', name='JAAFYQ010000003', description='Phaseolus lunatus cultivar G27455 chromosome Pl03, whole genome shotgun sequence', dbxrefs=['BioProject:PRJNA596114', 'BioSample:SAMN13612625'])

In [8]:
orf_finder = pyrodigal.OrfFinder()
orf_finder.train(bytes(record.seq))
orf_genes  = orf_finder.find_genes(bytes(record.seq))

In [9]:
orf_genes

<pyrodigal._pyrodigal.Genes at 0x7fd132d35a50>

In [10]:
aa_file = accession + ".faa"
prefix  = "G27455" #cultivar
with open(aa_file, "w") as orf_gene:
    orf_genes.write_translations(orf_gene,sequence_id=prefix)

In [11]:
! head JAAFYQ010000003.1.faa

>G27455_1 # 201 # 488 # -1 # ID=1_1;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.569
MGLGFSHLASVFMHQGTGAAALGFRVPRLGLRHQVSGFRVQALALGCRVQGSATRVQGSA
TRVQGQALGYRVQAPGFRVQGLGAREQGSGFRQQG*
>G27455_2 # 502 # 648 # -1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.497
MTQAPGFRAHGLGFRIDSKWFRVQGLGFRHQGLGYRHQGSGFKVQAPG*
>G27455_3 # 992 # 1111 # 1 # ID=1_3;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.558
MPEPCTLCLNPEFWCLNPEPRCLNPEPRCLNPKPWYLKP*
>G27455_4 # 1329 # 1481 # -1 # ID=1_4;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.497
MLRVEGSSFRVRVYAPGFRVQAPGYRTQVPGFRAHGLGFWIDAKWFRVQG*
>G27455_5 # 1640 # 1780 # -1 # ID=1_5;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.539


In [12]:
gff_file = accession + ".gff"
prefix  = "G27455"
with open(gff_file, "w") as orf_gene:
    orf_genes.write_gff(orf_gene,sequence_id=prefix)

In [13]:
! head JAAFYQ010000003.1.gff

##gff-version  3
# Sequence Data: seqnum=1;seqlen=45046558;seqhdr="G27455"
# Model Data: version=pyrodigal.v2.1.0;run_type=Single;model="Ab initio";gc_cont=32.40;transl_table=11;uses_sd=0
G27455	pyrodigal_v2.1.0	CDS	201	488	29.1	-	0	ID=G27455_1;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.569;conf=99.88;score=29.14;cscore=21.87;sscore=7.27;rscore=0.00;uscore=4.38;tscore=1.76;
G27455	pyrodigal_v2.1.0	CDS	502	648	6.7	-	0	ID=G27455_2;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.497;conf=82.24;score=6.67;cscore=4.73;sscore=1.93;rscore=0.00;uscore=1.57;tscore=1.01;
G27455	pyrodigal_v2.1.0	CDS	992	1111	0.9	+	0	ID=G27455_3;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.558;conf=55.35;score=0.93;cscore=-1.00;sscore=1.94;rscore=0.00;uscore=1.61;tscore=0.82;
G27455	pyrodigal_v2.1.0	CDS	1329	1481	5.5	-	0	ID=G27455_4;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.497;conf=77.78;score=5.45;cscore=5.80;sscore=-0.35;r

In [14]:
uniprot_api_url  = "https://rest.uniprot.org/uniprotkb/stream"
uniprot_api_args = {"compressed" : "false",
                    "format"     : "fasta",
                    "query"      : "(lignification) AND (reviewed:true)"}
uniprot_ref_seqs = requests.get(uniprot_api_url,params=uniprot_api_args).text

In [15]:
uniprot_ref_seqs

'>sp|E3VNM4|TGA10_ARATH Transcription factor TGA10 OS=Arabidopsis thaliana OX=3702 GN=TGA10 PE=1 SV=1\nMQGHHQNHHQHLSSSSATSSHGNFMNKDGYDIGEIDPSLFLYLDGQGHHDPPSTAPSPLH\nHHHTTQNLAMRPPTSTLNIFPSQPMHIEPPPSSTHNTDNTRLVPAAQPSGSTRPASDPSM\nDLTNHSQFHQPPQGSKSIKKEGNRKGLASSDHDIPKSSDPKTLRRLAQNREAARKSRLRK\nKAYVQQLESCRIKLTQLEQEIQRARSQGVFFGGSLIGGDQQQGGLPIGPGNISSEAAVFD\nMEYARWLEEQQRLLNELRVATQEHLSENELRMFVDTCLAHYDHLINLKAMVAKTDVFHLI\nSGAWKTPAERCFLWMGGFRPSEIIKVIVNQIEPLTEQQIVGICGLQQSTQEAEEALSQGL\nEALNQSLSDSIVSDSLPPASAPLPPHLSNFMSHMSLALNKLSALEGFVLQADNLRHQTIH\nRLNQLLTTRQEARCLLAVAEYFHRLQALSSLWLARPRQDG\n>sp|O23349|AO1_ARATH Primary amine oxidase 1 OS=Arabidopsis thaliana OX=3702 GN=AO1 PE=1 SV=1\nMNTSILAILFLIQCVFTLGLHFHPLDPLTPQEINKTSFIVKKSHLGNLKDLTFHYLDLEE\nPNKSHVLQWLSPNPSKKPPPPRRRSFVVVRAGGQTYELIIDLTTSKIASSRIYTGHGFPS\nFTFIELFKASKLPLTYPPFKKSILDRSLNISEVSCIPFTVGWYGETTTRRELKASCFYRD\nGSVNVFTRPIEGITVTIDVDSMQVIKYSDRFRKPIPDKEGNDFRTKHRPFPFFCNVSDTG\nFKILGNRVKWANWKFHVGFTARAGVTISTASVLDPRTKRFRRVMYRGHVSETFVPYMDPT\nYEWYYRTFMDIGEFGF

In [16]:
uniprot_seqs_file = open("uniprot_sequences.fasta", "wt")
uniprot_seqs_file.write(uniprot_ref_seqs)
uniprot_seqs_file.close()

In [17]:
! head uniprot_sequences.fasta

>sp|E3VNM4|TGA10_ARATH Transcription factor TGA10 OS=Arabidopsis thaliana OX=3702 GN=TGA10 PE=1 SV=1
MQGHHQNHHQHLSSSSATSSHGNFMNKDGYDIGEIDPSLFLYLDGQGHHDPPSTAPSPLH
HHHTTQNLAMRPPTSTLNIFPSQPMHIEPPPSSTHNTDNTRLVPAAQPSGSTRPASDPSM
DLTNHSQFHQPPQGSKSIKKEGNRKGLASSDHDIPKSSDPKTLRRLAQNREAARKSRLRK
KAYVQQLESCRIKLTQLEQEIQRARSQGVFFGGSLIGGDQQQGGLPIGPGNISSEAAVFD
MEYARWLEEQQRLLNELRVATQEHLSENELRMFVDTCLAHYDHLINLKAMVAKTDVFHLI
SGAWKTPAERCFLWMGGFRPSEIIKVIVNQIEPLTEQQIVGICGLQQSTQEAEEALSQGL
EALNQSLSDSIVSDSLPPASAPLPPHLSNFMSHMSLALNKLSALEGFVLQADNLRHQTIH
RLNQLLTTRQEARCLLAVAEYFHRLQALSSLWLARPRQDG
>sp|O23349|AO1_ARATH Primary amine oxidase 1 OS=Arabidopsis thaliana OX=3702 GN=AO1 PE=1 SV=1


In [29]:
from pathlib import Path

In [63]:
makeblastdb_path = r'C:\Zeus'
makeblastdb_command = [makeblastdb_path,'-in',aa_file,'-dbtype','prot']
subprocess.call(makeblastdb_command)

FileNotFoundError: ignored