In [1]:
import Bio

In [2]:
from Bio.Alphabet import generic_dna, generic_rna, generic_protein, IUPAC

In [3]:
from Bio.Seq import Seq
# Build a Seq object from a literal nucleotide string and tag it with the generic DNA alphabet.
# NOTE(review): the alphabet argument relies on Bio.Alphabet, which was removed in
# Biopython 1.78 — confirm the environment pins an older release.
dna= Seq('ATACCGCTAGCC', generic_dna)
dna

Seq('ATACCGCTAGCC', DNAAlphabet())

In [4]:
# Inspect the alphabet object that was attached to the sequence at construction time.
dna.alphabet

DNAAlphabet()

In [5]:
# SEQUENCE SLICING: take the first three bases (indices 0, 1 and 2); the result is still a Seq.
dna[0:3]

Seq('ATA', DNAAlphabet())

In [6]:
# Extended slice with a step of 3: every third base, starting at index 0.
dna[0::3]

Seq('ACCG', DNAAlphabet())

In [7]:
# JOINING OF SEQUENCES
# `seq` is created without an explicit alphabet; `+` concatenates the first three
# bases of `dna` with it, producing a new Seq.
seq= Seq('ATCGGATGAGCA')
seq2= dna[0:3]+seq
seq2

Seq('ATAATCGGATGAGCA')

In [8]:
# FINDING THE NUMBER OF A SPECIFIC NUCLEOTIDE PRESENT
# Counts how many times the base 'A' occurs in seq2.
seq2.count('A')

6

In [9]:
# FINDING THE GC CONTENT
# GC() reports the share of G and C bases as a percentage (6 of the 15 bases here -> 40.0).
# NOTE(review): newer Biopython releases deprecate GC() in favour of gc_fraction(),
# which returns a 0-1 fraction instead — confirm before upgrading.
from Bio.SeqUtils import GC
GC(seq2)

40.0

In [10]:
# Count occurrences of the triplet 'ATG' anywhere in seq2 (plain substring count, not reading-frame aware).
seq2.count('ATG')

1

In [11]:
# Determine the location of a nucleotide: 0-based index of the first 'G' in seq2.
seq2.find('G')

6

In [12]:
# Count occurrences of 'AT' in seq2 using the overlap-aware counter.
seq2.count_overlap('AT')

3

# PROTEIN SYNTHESIS

In [13]:
# Transcribe DNA to mRNA.
# transcribe() returns the RNA form of seq3: same bases, with every T shown as U in the output.
seq3= Seq('ATGACTGCATTAGATACGA')
mRNA= seq3.transcribe()
mRNA

Seq('AUGACUGCAUUAGAUACGA', RNAAlphabet())

In [14]:
# Translate the mRNA into a protein sequence.
# The mRNA is 19 nt long (not a multiple of three), so only the six complete codons are
# translated and the trailing base is left over — hence the six-residue result.
mRNA.translate()



Seq('MTALDT', ExtendedIUPACProtein())

In [15]:
from Bio.Data import CodonTable

In [16]:
# List the names exposed by the CodonTable module to see which lookup tables are available.
dir(CodonTable)

['Alphabet',
 'AmbiguousCodonTable',
 'AmbiguousForwardTable',
 'CodonTable',
 'IUPAC',
 'IUPACData',
 'NCBICodonTable',
 'NCBICodonTableDNA',
 'NCBICodonTableRNA',
 'TranslationError',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'ambiguous_dna_by_id',
 'ambiguous_dna_by_name',
 'ambiguous_generic_by_id',
 'ambiguous_generic_by_name',
 'ambiguous_rna_by_id',
 'ambiguous_rna_by_name',
 'generic_by_id',
 'generic_by_name',
 'list_ambiguous_codons',
 'list_possible_proteins',
 'make_back_table',
 'register_ncbi_table',
 'standard_dna_table',
 'standard_rna_table',
 'unambiguous_dna_by_id',
 'unambiguous_dna_by_name',
 'unambiguous_rna_by_id',
 'unambiguous_rna_by_name']

In [17]:
# Print the unambiguous DNA codon table registered under the name 'Standard' (NCBI table 1).
print(CodonTable.unambiguous_dna_by_name['Standard'])

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------

In [18]:
# NOTE: NCBICodonTable is a class, so this prints the class object itself, not a codon table.
print(CodonTable.NCBICodonTable)

<class 'Bio.Data.CodonTable.NCBICodonTable'>


In [19]:
# Base-by-base complement of the DNA sequence (A<->T, C<->G), same orientation as seq3.
seq3.complement()

Seq('TACTGACGTAATCTATGCT')

In [20]:
# Reverse complement of the RNA sequence: complemented (A<->U, C<->G) and read in the opposite direction.
mRNA.reverse_complement()

Seq('UCGUAUCUAAUGCAGUCAU', RNAAlphabet())

In [21]:
# DIRECT TRANSLATION
# translate() can be called on the DNA sequence itself; the result matches the
# translation obtained from the transcribed mRNA.
seq3.translate()

Seq('MTALDT', ExtendedIUPACProtein())

In [22]:
# Convert the mRNA back to DNA (U -> T), recovering the original seq3 letters.
mRNA.back_transcribe()

Seq('ATGACTGCATTAGATACGA', DNAAlphabet())

# ANALYSIS OF THE SARS CORONAVIRUS GENOME (NC_004718.3)

In [23]:
from Bio import SeqIO

In [24]:
# Stream every record in the FASTA file and print its id, full repr and length.
# The `with` block ensures the file handle is closed as soon as parsing finishes,
# even if SeqIO.parse raises part-way through (the bare open() previously leaked it).
with open('sequence.fasta') as covid:
    for seq_record in SeqIO.parse(covid,'fasta'):
        print(seq_record.id)
        print(repr(seq_record))
        print(len(seq_record))

NC_004718.3
SeqRecord(seq=Seq('ATATTAGGTTTTTACCTACCCAGGAAAAGCCAACCAACCTCGATCTCTTGTAGA...AAA', SingleLetterAlphabet()), id='NC_004718.3', name='NC_004718.3', description='NC_004718.3 SARS coronavirus, complete genome', dbxrefs=[])
29751


In [25]:
# Load the single record from the FASTA file as a SeqRecord.
# NOTE(review): despite the `nCoV` name, the record loaded here is NC_004718.3,
# described in the file as "SARS coronavirus, complete genome".
nCoV_record= SeqIO.read('sequence.fasta','fasta')
nCoV_record

SeqRecord(seq=Seq('ATATTAGGTTTTTACCTACCCAGGAAAAGCCAACCAACCTCGATCTCTTGTAGA...AAA', SingleLetterAlphabet()), id='NC_004718.3', name='NC_004718.3', description='NC_004718.3 SARS coronavirus, complete genome', dbxrefs=[])

In [26]:
# Pull the bare nucleotide sequence (a Seq) out of the SeqRecord wrapper.
nCoV_DNA= nCoV_record.seq
nCoV_DNA

Seq('ATATTAGGTTTTTACCTACCCAGGAAAAGCCAACCAACCTCGATCTCTTGTAGA...AAA', SingleLetterAlphabet())

In [27]:
# Transcribe the whole genome sequence to its RNA form (T shown as U).
nCoV_mRNA= nCoV_DNA.transcribe()
nCoV_mRNA

Seq('AUAUUAGGUUUUUACCUACCCAGGAAAAGCCAACCAACCUCGAUCUCUUGUAGA...AAA', RNAAlphabet())

In [28]:
# Translate the full-length transcript in one pass, starting at the first base.
# The result contains '*' stop symbols throughout (see the output), i.e. this is a raw
# single-frame translation of the whole genome rather than a per-gene protein set.
nCoV_protein= nCoV_mRNA.translate()
nCoV_protein

Seq('ILGFYLPRKSQPTSISCRSVL*TNFKICVAVARLHA*CTYAV*TIINFTVVDKK...KKK', HasStopCodon(ExtendedIUPACProtein(), '*'))

In [29]:
# Number of symbols in the translation (amino acids plus '*' stop symbols).
len(nCoV_protein)

9917

In [30]:
# Cross-check: nucleotide count divided by the codon size should equal the translation length.
# `/` is true division, so the value is displayed as a float.
len(nCoV_DNA)/3

9917.0

In [31]:
# Split the translation at every '*' stop symbol, giving the list of stop-free
# stretches that lie between consecutive stops (each element is still a Seq).
nCoV_aa= nCoV_protein.split('*')
nCoV_aa

[Seq('ILGFYLPRKSQPTSISCRSVL', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('TNFKICVAVARLHA', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('CTYAV', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('TIINFTVVDKKRVTRPSSADCLRFRPCCSRSSAYLGFVRV', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('PKGKMESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHLKNGT...FAV', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('VQPVLHRAAQALVLMSSTGLLIFTTKKLLVLQSS', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('KLIAVASRRRMRKAIY', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('TLTL', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('LRGILCLTTNMKRLFITWLKIVQRLLSMTFSSLE', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('MVTWYHIYHVSV', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('LNTQWLI', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('SMLYVILMRVIVIH', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('KKYSSHTIAVMMIISIRRIGMTS', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('RILTSYAYMLT', HasStop

In [32]:
# Convert every Seq fragment into a plain Python string.
n_CoV_clean = list(map(str, nCoV_aa))
n_CoV_clean

['ILGFYLPRKSQPTSISCRSVL',
 'TNFKICVAVARLHA',
 'CTYAV',
 'TIINFTVVDKKRVTRPSSADCLRFRPCCSRSSAYLGFVRV',
 'PKGKMESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEAREHLKNGTCGLVELEKGVLPQLEQPYVFIKRSDALSTNHGHKVVELVAEMDGIQYGRSGITLGVLVPHVGETPIAYRNVLLRKNGNKGAGGHSYGIDLKSYDLGDELGTDPIEDYEQNWNTKHGSGALRELTRELNGGAVTRYVDNNFCGPDGYPLDCIKDFLARAGKSMCTLSEQLDYIESKRGVYCCRDHEHEIAWFTERSDKSYEHQTPFEIKSAKKFDTFKGECPKFVFPLNSKVKVIQPRVEKKKTEGFMGRIRSVYPVASPQECNNMHLSTLMKCNHCDEVSWQTCDFLKATCEHCGTENLVIEGPTTCGYLPTNAVVKMPCPACQDPEIGPEHSVADYHNHSNIETRLRKGGRTRCFGGCVFAYVGCYNKRAYWVPRASADIGSGHTGITGDNVETLNEDLLEILSRERVNINIVGDFHLNEEVAIILASFSASTSAFIDTIKSLDYKSFKTIVESCGNYKVTKGKPVKGAWNIGQQRSVLTPLCGFPSQAAGVIRSIFARTLDAANHSIPDLQRAAVTILDGISEQSLRLVDAMVYTSDLLTNSVIIMAYVTGGLVQQTSQWLSNLLGTTVEKLRPIFEWIEAKLSAGVEFLKDAWEILKFLITGVFDIVKGQIQVASDNIKDCVKCFIDVVNKALEMCIDQVTIAGAKLRSLNLGEVFIAQSKGLYRQCIRGKEQLQLLMPLKAPKEVTFLEGDSHDTVLTSEEVVLKNGELEALETPVDSFTNGAIVGTPVCVNGLMLLEIKDKEQYCALSPGLLATNNVFRLKGGAPIKGVTFGEDTVWEVQGYKNVRITFELDERVDKVLNEKCSVYTVESGTEVTEFACVVAEAVVKTLQPVSDLLTNM

In [33]:
import pandas as pd
# One row per stop-free fragment, held in a single 'amino acids' column.
df = pd.DataFrame(n_CoV_clean, columns=['amino acids'])
df

Unnamed: 0,amino acids
0,ILGFYLPRKSQPTSISCRSVL
1,TNFKICVAVARLHA
2,CTYAV
3,TIINFTVVDKKRVTRPSSADCLRFRPCCSRSSAYLGFVRV
4,PKGKMESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEA...
...,...
268,N
269,F
270,
271,CYPHVILIAS


In [34]:
# Append a 'count' column holding the length (number of residues) of each fragment.
df = df.assign(count=df['amino acids'].str.len())
df

Unnamed: 0,amino acids,count
0,ILGFYLPRKSQPTSISCRSVL,21
1,TNFKICVAVARLHA,14
2,CTYAV,5
3,TIINFTVVDKKRVTRPSSADCLRFRPCCSRSSAYLGFVRV,40
4,PKGKMESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEA...,4386
...,...,...
268,N,1
269,F,1
270,,0
271,CYPHVILIAS,10


In [35]:
# Show the ten longest fragments, i.e. the rows with the largest 'count' values.
df.nlargest(10,'count')

Unnamed: 0,amino acids,count
4,PKGKMESLVLGVNEKTHVQLSLPVLQVRDVLVRGFGDSVEEALSEA...,4386
259,INEQIKMSDNGPQSNQRSAPRITFGGPTDSTDNNQNGGRNGARPKQ...,428
236,SSGLNELTIIIILFGTLTLLIMADNGTITVEELKQLLEQWNLVIGF...,242
203,SGLLKSGLLLILLLLKCLSVFLDNQKELTFVERATTLCPSHKQPRM...,101
198,LVVLPLLDGHLVLALLFKYLLLCKWHIGSMALELPKMFSMRTKNKS...,81
179,RQDKLVLLLIIIINCQMISWVVSLLGILGTLMLLQLVIIIINIGIL...,80
204,LLQTIHLSQEIVMSLLASLTTQFMILCNLSLTHSKKSWTSTSKIIH...,78
146,FQKWSRLQLTMLKFHSCFGVRMDMLKPSTQNYKQVKRGNQVLRCLT...,72
165,FILFWKKVGLSLEKTTELWFQVIFLLTTKRTCLFSYYFLLSLVVVT...,67
65,QHVIGLMLAITYLPTLVLRDSSFSQQKRSKPLRKHLSCHMVLPLYA...,66


In [36]:
from collections import Counter

In [37]:
# Tally every symbol of the full translation, then show the ten most frequent ones.
# '*' stop symbols are part of the sequence and are tallied as well.
residue_tally = Counter(nCoV_protein)
residue_tally.most_common(10)

[('L', 1368),
 ('S', 737),
 ('T', 689),
 ('V', 683),
 ('A', 575),
 ('I', 566),
 ('K', 503),
 ('G', 438),
 ('F', 437),
 ('N', 418)]

In [38]:
from Bio.PDB import PDBParser

In [39]:
# List the names exposed by the Bio.PDB package (parsers, structure classes, geometry helpers).
dir(Bio.PDB)

['AbstractPropertyMap',
 'Atom',
 'CaPPBuilder',
 'Chain',
 'DSSP',
 'Dice',
 'Entity',
 'ExposureCN',
 'FastMMCIFParser',
 'FragmentMapper',
 'HSExposure',
 'HSExposureCA',
 'HSExposureCB',
 'MMCIF2Dict',
 'MMCIFIO',
 'MMCIFParser',
 'Model',
 'NeighborSearch',
 'PDBExceptions',
 'PDBIO',
 'PDBList',
 'PDBParser',
 'PPBuilder',
 'Polypeptide',
 'Residue',
 'ResidueDepth',
 'Select',
 'Selection',
 'Structure',
 'StructureAlignment',
 'StructureBuilder',
 'Superimposer',
 'Vector',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'calc_angle',
 'calc_dihedral',
 'extract',
 'get_surface',
 'ic_data',
 'internal_coords',
 'is_aa',
 'm2rotaxis',
 'make_dssp_dict',
 'mmcifio',
 'parse_pdb_header',
 'protein_letters_3to1',
 'refmat',
 'rotaxis',
 'rotaxis2m',
 'rotmat',
 'standard_aa_names',
 'vector_to_axis',
 'vectors']

In [40]:
# Create a PDB-format parser with default settings; it is reused for both structures below.
parser= PDBParser()

In [41]:
# Parse the local file '2g9t.pdb' into a structure object.
# NOTE(review): the first argument appears to be only an identifier label for the
# structure; 'SARS-CoV-19' is not a standard virus name — consider a more accurate label.
structure= parser.get_structure('SARS-CoV-19','2g9t.pdb')
# Number of top-level children of the structure (indexed below as structure[0], the model).
len(structure)



1

In [42]:
# Same check as the last line of the previous cell, repeated on its own.
len(structure)

1

In [43]:
# Parse a second local PDB file, '2ga6.pdb', with the same parser instance.
structure_2= parser.get_structure('Structure','2ga6.pdb')



In [44]:
# Number of top-level children (models) in the second structure.
len(structure_2)

1

In [45]:
# Take the first (and, per len(structure) above, only) model of the structure
# and print each chain it contains; iterating a model yields its chains.
model= structure[0]
for chain in model:
    print(chain)

<Chain id=A>
<Chain id=B>
<Chain id=C>
<Chain id=D>
<Chain id=E>
<Chain id=F>
<Chain id=G>
<Chain id=H>
<Chain id=I>
<Chain id=J>
<Chain id=K>
<Chain id=L>
<Chain id=M>
<Chain id=N>
<Chain id=O>
<Chain id=P>
<Chain id=Q>
<Chain id=R>
<Chain id=S>
<Chain id=T>
<Chain id=U>
<Chain id=V>
<Chain id=W>
<Chain id=X>


In [46]:
import nglview as nv

_ColormakerRegistry()

In [47]:
# Show nglview's built-in demo widget.
# NOTE(review): presumably a quick check that the widget renders in this environment — confirm.
nv.demo()

NGLWidget()

In [48]:
# Build an interactive NGL widget from the Biopython structure parsed from '2g9t.pdb' and display it.
view= nv.show_biopython(structure)
view

NGLWidget()

In [52]:
import py3Dmol

In [55]:
# Create a py3Dmol viewer for PDB entry 2GA6 and draw it as a cartoon coloured by a spectrum.
# NOTE(review): the 'pdb:' query presumably fetches the entry over the network rather than
# using the local '2ga6.pdb' file — confirm, since that makes this cell depend on connectivity.
view2= py3Dmol.view(query='pdb:2GA6')
view2.setStyle({'cartoon':{'color':'spectrum'}})

<py3Dmol.view at 0x2353f29bb08>

In [56]:
# NOTE(review): presumably asks the viewer to render the current scene as a static image;
# the captured output only shows the view object — confirm against the py3Dmol documentation.
view2.render_image()

<py3Dmol.view at 0x2353f29bb08>