In [None]:
# Sequence Alignment
# - a method of arranging sequences of DNA, RNA, or protein (amino acids) to identify regions of similarity
# - the similarity identified may be a result of functional, structural, or evolutionary relationships between the sequences
# - useful in identifying similarity and homology (= descent from a common ancestor)

# Terms
# - matches (A-A)
# - mismatches(C-T)
# - gap (G--)

In [None]:
# Alignment Types
# - global alignment: finds the best agreement between all characters of the two sequences, i.e. aligns them from end to end (e.g. the EMBOSS `needle` tool)
# - local alignment: finds only the subsequences that align best: subsequences within each of the two sequences are considered and matched to obtain the best alignment (e.g. the EMBOSS `water` tool)


In [None]:
# When local alignment is used
# - the two sequences share only a small matching region
# - the two sequences are of different lengths
# - the sequences overlap
# - one sequence is a subsequence of the other
# - tool: BLAST (Basic Local Alignment Search Tool)
# - tool: EMBOSS (pairwise sequence alignment)

In [6]:
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio.Seq import Seq

In [8]:
# The two short DNA sequences to align; they differ only at the second base (C vs T)
seq1, seq2 = Seq('ACTCG'), Seq('ATTCG')

In [9]:
# Global alignment (the sequences are aligned end to end).
# `globalxx` uses the simplest scoring: +1 per match, no mismatch or gap penalties.
# NOTE(review): Bio.pairwise2 is deprecated in recent Biopython releases in favour of
# Bio.Align.PairwiseAligner -- consider migrating.
alignments = pairwise2.align.globalxx(seq1, seq2)

In [10]:
# Show the raw list of global alignments (each entry carries seqA, seqB, score, start, end)
alignments

[Alignment(seqA='ACT-CG', seqB='A-TTCG', score=4.0, start=0, end=6),
 Alignment(seqA='AC-TCG', seqB='A-TTCG', score=4.0, start=0, end=6),
 Alignment(seqA='ACTCG', seqB='ATTCG', score=4.0, start=0, end=5)]

In [12]:
# Pretty-print the first alignment, passing its named fields explicitly
first = alignments[0]
print(format_alignment(first.seqA, first.seqB, first.score, first.start, first.end))

ACT-CG
| | ||
A-TTCG
  Score=4



In [15]:
# Pretty-print every global alignment in turn
for a in alignments:
    print(format_alignment(a.seqA, a.seqB, a.score, a.start, a.end))

ACT-CG
| | ||
A-TTCG
  Score=4

AC-TCG
|  |||
A-TTCG
  Score=4

ACTCG
|.|||
ATTCG
  Score=4



In [18]:
# Local alignment: only the best-matching subsequences are aligned.
# `localxx` uses the same simple scoring as `globalxx` (+1 per match, no penalties).
loc_alignments = pairwise2.align.localxx(seq1, seq2)

In [19]:
# Pretty-print every local alignment in turn
for a in loc_alignments:
    print(format_alignment(a.seqA, a.seqB, a.score, a.start, a.end))

ACT-CG
| | ||
A-TTCG
  Score=4

AC-TCG
|  |||
A-TTCG
  Score=4

ACTCG
|.|||
ATTCG
  Score=4



In [22]:
# Compute only the best global alignment score (a single float) instead of
# building the list of alignment objects
alignment2 = pairwise2.align.globalxx(
    seq1,
    seq2,
    one_alignment_only=True,
    score_only=True,
)

In [23]:
# Show the score-only result (a single number rather than a list of alignments)
alignment2

4.0

In [24]:
# Check similarity (percentage of similarity) using the alignment
# - (number of identical nucleotides / total number of nucleotides) * 100%
# - more than 50% --> the sequences are considered homologous

In [25]:
# Inspect seq1; its length is the denominator of the similarity percentage computed next
seq1

Seq('ACTCG')

In [27]:
# Percentage similarity from the global score: with `globalxx` scoring the score
# counts matching positions, so divide by the length of seq1 and scale to percent
global_similarity_fraction = alignment2 / len(seq1)
global_similarity_fraction * 100

80.0

In [28]:
# Inspect seq2, the second sequence being aligned
seq2

Seq('ATTCG')

In [29]:
# Compute only the best local alignment score (a single float) instead of
# building the list of alignment objects
loc_alignment2 = pairwise2.align.localxx(
    seq1,
    seq2,
    one_alignment_only=True,
    score_only=True,
)

In [30]:
# Percentage similarity from the local score: score divided by the length of
# seq1, scaled to percent
local_similarity_fraction = loc_alignment2 / len(seq1)
local_similarity_fraction * 100

80.0

In [31]:
# Find all possible global alignments with the maximum similarity score
# - each matching character: +2 points
# - each mismatching character: -1 point
# - 0.5 points are deducted when opening a gap
# - 0.1 points are deducted when extending it

In [35]:
# Global alignment with a custom scoring scheme.
# `globalms` takes, in order: match score, mismatch score, gap-open penalty,
# gap-extend penalty.
match_score = 2
mismatch_score = -1
gap_open_penalty = -0.5
gap_extend_penalty = -0.1
glb_alignment = pairwise2.align.globalms(
    seq1, seq2, match_score, mismatch_score, gap_open_penalty, gap_extend_penalty
)

In [36]:
# Pretty-print every alignment found with the custom scoring scheme
for a in glb_alignment:
    print(format_alignment(a.seqA, a.seqB, a.score, a.start, a.end))

ACT-CG
| | ||
A-TTCG
  Score=7

AC-TCG
|  |||
A-TTCG
  Score=7

ACTCG
|.|||
ATTCG
  Score=7

