**Sequence alignments**

In [3]:
pip install biopython

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.2/3.2 MB 30.1 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.84


In [1]:
import numpy as np

# The three ungapped sequences that will be aligned.
seqA, seqB, seqC = "CCGGTTTTT", "AGTTTAA", "AGGTTT"
sequences = [seqA, seqB, seqC]

# One row of sequence positions per sequence, one column per segment boundary.
# A repeated value in a row (e.g. 2, 2 in the second row) shows up as a gap
# in that row when the alignment is printed.
coordinates = np.array(
    [
        [1, 3, 4, 7, 9],
        [0, 2, 2, 5, 5],
        [0, 2, 3, 6, 6],
    ]
)

In [4]:
# Build a multiple-sequence alignment from the ungapped sequences and their
# coordinate array; the repr reports 3 rows x 8 columns.
from Bio.Align import Alignment
alignment = Alignment(sequences, coordinates)
alignment

<Alignment object (3 rows x 8 columns) at 0x7d3e44076e60>

In [5]:
# The ungapped input sequences are kept unchanged on the alignment.
alignment.sequences

['CCGGTTTTT', 'AGTTTAA', 'AGGTTT']

In [6]:
# The coordinate array given to the constructor is exposed as an attribute.
alignment.coordinates

array([[1, 3, 4, 7, 9],
       [0, 2, 2, 5, 5],
       [0, 2, 3, 6, 6]])

In [7]:
# Printing shows one gapped row per sequence, flanked by its start and end coordinate.
print(alignment)

                  1 CGGTTTTT 9
                  0 AG-TTT-- 5
                  0 AGGTTT-- 6



**Creating an Alignment object from aligned sequences**

In [8]:
# The same alignment written out by hand: one gapped row per sequence, "-" for a gap.
lines = ["CGGTTTTT", "AG-TTT--", "AGGTTT--"]
print(*lines, sep="\n")

CGGTTTTT
AG-TTT--
AGGTTT--


In [9]:
# Convert every printed row from str to bytes before parsing.
# NOTE(review): rebinds `lines`, so this cell cannot be re-run on its own output.
lines = [row.encode() for row in lines]
lines

[b'CGGTTTTT', b'AG-TTT--', b'AGGTTT--']

In [10]:
# Recover the ungapped sequences and the coordinate array from the printed rows.
# NOTE(review): this rebinds `sequences` and `coordinates` from the first cell.
sequences, coordinates = Alignment.parse_printed_alignment(lines)
sequences

[b'CGGTTTTT', b'AGTTT', b'AGGTTT']

In [11]:
# Turn the parsed bytes sequences back into str.
sequences = [raw.decode() for raw in sequences]
sequences

['CGGTTTTT', 'AGTTT', 'AGGTTT']

In [12]:
# The parsed coordinates start at 0 in every row; the printed rows carried no offsets.
print(coordinates)

[[0 2 3 6 8]
 [0 2 2 5 5]
 [0 2 3 6 6]]


In [13]:
# Restore the flanking letters that the printed rows did not contain: a leading
# "C" on the first sequence and a trailing "AA" on the second.
# NOTE(review): `sequences` is modified in place, so re-running this cell adds the letters again.
# NOTE(review): `Seq` is imported but not used in the visible part of the notebook.
from Bio.Seq import Seq
sequences[0] = "C" + sequences[0]
sequences[1] = sequences[1] + "AA"
sequences

['CCGGTTTTT', 'AGTTTAA', 'AGGTTT']

In [14]:
# Shift the first row by one to account for the "C" prepended to sequences[0].
# NOTE(review): in-place increment; re-running this cell shifts the row again.
coordinates[0, :] += 1
print(coordinates)

[[1 3 4 7 9]
 [0 2 2 5 5]
 [0 2 3 6 6]]


In [16]:
# Rebuild the alignment from the reconstructed sequences and coordinates;
# it prints the same rows as the alignment created at the start.
alignment = Alignment(sequences, coordinates)
print(alignment)

                  1 CGGTTTTT 9
                  0 AG-TTT-- 5
                  0 AGGTTT-- 6



In [19]:
# Without a coordinates argument every row spans its whole sequence (0 to 8 here),
# i.e. the result is an ungapped alignment.
ungapped_alignment = Alignment(["ACGTACGT", "AAGTACGT", "ACGTACCT"])
print(ungapped_alignment.coordinates)
print(ungapped_alignment)

[[0 8]
 [0 8]
 [0 8]]
                  0 ACGTACGT 8
                  0 AAGTACGT 8
                  0 ACGTACCT 8



**Common alignment attributes**

*Slicing and indexing an alignment*

In [20]:
# `length` is the number of alignment columns (8 here), not the length of any one sequence.
print(alignment)
alignment.length

                  1 CGGTTTTT 9
                  0 AG-TTT-- 5
                  0 AGGTTT-- 6



8

In [22]:
# An integer index returns that row as a gapped string.
alignment[0]

'CGGTTTTT'

In [23]:
# Gaps in the row are rendered as "-".
alignment[1]

'AG-TTT--'

In [24]:
# Third row, again as a gapped string.
alignment[2]

'AGGTTT--'

In [25]:
# A row index plus a full column slice gives the same string as alignment[0].
alignment[0, :]

'CGGTTTTT'

In [26]:
# Same as alignment[1].
alignment[1, :]

'AG-TTT--'

In [27]:
# The column slice 1:-1 drops the first and last alignment column of row 0.
alignment[0, 1:-1]

'GGTTTT'

In [28]:
# Same column slice on row 1; gap characters inside the slice are kept.
alignment[1, 1:-1]

'G-TTT-'

In [29]:
# A tuple of column indices picks columns 1, 2 and 4 of row 0.
alignment[0, (1, 2, 4)]

'GGT'

In [30]:
# A range works as a column selector too: columns 0, 2 and 4 of row 1.
alignment[1, range(0, 5, 2)]

'A-T'

In [31]:
# Integer row and column give a single letter.
alignment[0, 2]

'G'

In [32]:
# The result is "-" where that row has a gap in the column.
alignment[2, 6]

'-'

In [33]:
# A full row slice plus a column integer gives that column, read top to bottom.
alignment[:, 0]

'CAA'

In [34]:
# Column 1: all three rows have "G".
alignment[:, 1]

'GGG'

In [35]:
# Column 2: the middle row has a gap here.
alignment[:, 2]

'G-G'

In [36]:
# Slicing rows returns a new alignment; with two rows left it prints in the
# pairwise target/query layout.
print(alignment[1:])

target            0 AG-TTT 5
                  0 ||-||| 6
query             0 AGGTTT 6



In [37]:
# Column slice: keep the first four alignment columns of every row.
print(alignment[:, :4])

                  1 CGGT 5
                  0 AG-T 3
                  0 AGGT 4



In [38]:
# Negative column slice: keep the last six alignment columns of every row.
print(alignment[:, -6:])

                  3 GTTTTT 9
                  2 -TTT-- 5
                  2 GTTT-- 6



In [39]:
# A tuple selects and reorders columns (1, 3, 0); note that the printed
# coordinates of the result run from 0 to 3 in every row.
print(alignment[:, (1, 3, 0)])

                  0 GTC 3
                  0 GTA 3
                  0 GTA 3



**Getting information about the alignment**

*Alignment shape*

In [40]:
# len() counts the rows (sequences) of the alignment.
len(alignment)

3

In [41]:
# `length` counts the alignment columns.
alignment.length

8

In [42]:
# `shape` is (rows, columns), i.e. (len(alignment), alignment.length).
alignment.shape

(3, 8)

**Comparing alignments**

*Finding the indices of aligned sequences*

In [43]:
# Keep only the first two rows to get a pairwise alignment (target and query).
pairwise_alignment = alignment[:2, :]
print(pairwise_alignment)

target            1 CGGTTTTT 9
                  0 .|-|||-- 8
query             0 AG-TTT-- 5



In [44]:
# `aligned` holds, per sequence, the start/end of each gap-free aligned segment:
# target 1:3 and 4:7 are paired with query 0:2 and 2:5.
print(pairwise_alignment.aligned)

[[[1 3]
  [4 7]]

 [[0 2]
  [2 5]]]


In [46]:
# Two pairwise alignments of the same sequence pair; their coordinate arrays
# differ only in which sequence takes the first of the two adjacent gaps.
pairwise_alignment1 = Alignment(["AAACAAA", "AAAGAAA"],
                                np.array([[0, 3, 4, 4, 7], [0, 3, 3, 4, 7]]))  # fmt: skip

pairwise_alignment2 = Alignment(["AAACAAA", "AAAGAAA"],
                                np.array([[0, 3, 3, 4, 7], [0, 3, 4, 4, 7]]))  # fmt: skip

print(pairwise_alignment1)
print(pairwise_alignment2)

target            0 AAAC-AAA 7
                  0 |||--||| 8
query             0 AAA-GAAA 7

target            0 AAA-CAAA 7
                  0 |||--||| 8
query             0 AAAG-AAA 7



In [48]:
# Aligned segments of the first variant: 0:3 and 4:7 in both sequences.
pairwise_alignment1.aligned

array([[[0, 3],
        [4, 7]],

       [[0, 3],
        [4, 7]]])

In [49]:
# Identical to pairwise_alignment1.aligned despite the different gap order,
# so `aligned` alone cannot tell the two alignments apart.
pairwise_alignment2.aligned

array([[[0, 3],
        [4, 7]],

       [[0, 3],
        [4, 7]]])

In [50]:
# Back to the three-row alignment for the index examples.
print(alignment)

                  1 CGGTTTTT 9
                  0 AG-TTT-- 5
                  0 AGGTTT-- 6



In [51]:
# For every alignment column: the position in each sequence, or -1 where the row has a gap.
alignment.indices

array([[ 1,  2,  3,  4,  5,  6,  7,  8],
       [ 0,  1, -1,  2,  3,  4, -1, -1],
       [ 0,  1,  2,  3,  4,  5, -1, -1]])

In [54]:
# The ungapped sequences, shown for comparison with the inverse indices.
alignment.sequences

['CCGGTTTTT', 'AGTTTAA', 'AGGTTT']

In [55]:
# For every sequence position: its alignment column, or -1 for letters that lie
# outside the alignment (the leading "C" of row 0, the trailing "AA" of row 1).
alignment.inverse_indices

[array([-1,  0,  1,  2,  3,  4,  5,  6,  7]),
 array([ 0,  1,  3,  4,  5, -1, -1]),
 array([0, 1, 2, 3, 4, 5])]

**Counting identities, mismatches, and gaps**

In [56]:
# The pairwise alignment whose columns are tallied next.
print(pairwise_alignment)

target            1 CGGTTTTT 9
                  0 .|-|||-- 8
query             0 AG-TTT-- 5



In [57]:
# Tally the columns: 3 with a gap, 4 identical, 1 mismatch (C vs A).
pairwise_alignment.counts()

AlignmentCounts(gaps=3, identities=4, mismatches=1)

In [58]:
# The three-row alignment, for comparison with its counts.
print(alignment)

                  1 CGGTTTTT 9
                  0 AG-TTT-- 5
                  0 AGGTTT-- 6



In [59]:
# With three rows the totals are larger than for the pair above.
# NOTE(review): the numbers are consistent with summing over all three pairs of rows — confirm against the Biopython docs.
alignment.counts()

AlignmentCounts(gaps=8, identities=14, mismatches=2)

**Letter frequencies**

The `frequencies` property reports how often each letter appears in each column of the alignment.

In [60]:
# One array per letter (including "-"), holding its count in each alignment column.
alignment.frequencies

{'C': array([1., 0., 0., 0., 0., 0., 0., 0.]),
 'G': array([0., 3., 2., 0., 0., 0., 0., 0.]),
 'T': array([0., 0., 0., 3., 3., 3., 1., 1.]),
 'A': array([2., 0., 0., 0., 0., 0., 0., 0.]),
 '-': array([0., 0., 1., 0., 0., 0., 2., 2.])}

**Substitutions**

Use the `substitutions` property to count the substitutions between each pair of nucleotides:

In [61]:
# Substitution counts indexed by letter pair; as printed, the matrix is not
# symmetric (row C / column A is 2.0, row A / column C is 0.0).
m = alignment.substitutions
print(m)

    A   C   G   T
A 1.0 0.0 0.0 0.0
C 2.0 0.0 0.0 0.0
G 0.0 0.0 4.0 0.0
T 0.0 0.0 0.0 9.0



In [62]:
# Entries are looked up by a pair of letters.
m["C", "A"]

2.0

In [63]:
# The reverse order gives a different value before symmetrizing.
m["A", "C"]

0.0

In [64]:
# Symmetrize in place: add the transpose, then halve, so each off-diagonal
# pair ends up holding the average of its two directed counts.
m += m.transpose()
m /= 2.0
print(m)

    A   C   G   T
A 1.0 1.0 0.0 0.0
C 1.0 0.0 0.0 0.0
G 0.0 0.0 4.0 0.0
T 0.0 0.0 0.0 9.0



In [65]:
# After symmetrizing, A/C is the average of the former 0.0 and 2.0.
m["A", "C"]

1.0

In [66]:
# Both letter orders now give the same value.
m["C", "A"]

1.0

**Alignments as arrays**

In [68]:
# Convert the alignment to a NumPy array with one element per (row, column).
align_array = np.array(alignment)
align_array.shape

(3, 8)

In [69]:
# By default the elements are single bytes (dtype '|S1'), gaps included as b'-'.
align_array

array([[b'C', b'G', b'G', b'T', b'T', b'T', b'T', b'T'],
       [b'A', b'G', b'-', b'T', b'T', b'T', b'-', b'-'],
       [b'A', b'G', b'G', b'T', b'T', b'T', b'-', b'-']], dtype='|S1')

In [70]:
# Ask for a unicode dtype to get str elements instead of bytes.
align_array = np.array(alignment, dtype="U")

In [71]:
# Same letters as before, now as one-character strings (dtype '<U1').
align_array

array([['C', 'G', 'G', 'T', 'T', 'T', 'T', 'T'],
       ['A', 'G', '-', 'T', 'T', 'T', '-', '-'],
       ['A', 'G', 'G', 'T', 'T', 'T', '-', '-']], dtype='<U1')

**Operations on an alignment**

*Sorting an alignment*