## Bioinformatics: Sequence Alignment
Sequence alignment in bioinformatics is the process of comparing DNA, RNA, or protein sequences to find similarities. It helps scientists understand relationships, functions, or evolutionary history. By lining up sequences, we can spot matches, differences, and important regions.

In [1]:
from collections import Counter
import random

## Hamming Distance
The number of positions at which two strings of equal length differ.

In [11]:
# Two DNA sequences of the same length, one nucleobase per list element.
a = ['A', 'T', 'T', 'G', 'T', 'C']
b = ['A', 'C', 'T', 'C', 'T', 'C']

# Running count of mismatching positions; shown once before the comparison.
distance = 0
print(f'Distance: {distance}')

# Visit every position of `a` and compare it with the same position of `b`.
for i, base in enumerate(a):
    if base != b[i]:
        distance = distance + 1

print(f'Hamming distance: {distance}')

Distance: 0
Hamming distance: 2


In [3]:
# Section goal: count how many nucleobases (A, T, C, G) differ from the
# REFERENCE SEQUENCE stored in the first row of a matrix of sequences.

# Same-length version: every generated sequence has the same number of bases.
def generate_matrix(min_rows=5, max_rows=10, min_length=5, max_length=8, seed=None):
    """Generate a random matrix of equal-length DNA sequences.

    Args:
        min_rows: Smallest number of sequences (rows) to generate, inclusive.
        max_rows: Largest number of sequences (rows) to generate, inclusive.
        min_length: Smallest sequence length (columns), inclusive.
        max_length: Largest sequence length (columns), inclusive.
        seed: Optional seed. When given, a dedicated ``random.Random(seed)``
            is used, so the result is reproducible and the global ``random``
            state is left untouched. When ``None`` (the default), the
            module-level ``random`` generator is used.

    Returns:
        A list of rows; each row is a list of single-character strings drawn
        from 'A', 'T', 'C', 'G'. All rows share one randomly chosen length.

    Raises:
        ValueError: Propagated from ``randint`` when a minimum bound is
            greater than its maximum bound.
    """
    rng = random if seed is None else random.Random(seed)
    bases = ['A', 'T', 'C', 'G']

    # Draw the number of rows first, then the shared row length.
    matrix_rows = rng.randint(min_rows, max_rows)  # number of DNA sequences
    length = rng.randint(min_length, max_length)   # bases per sequence

    return [
        [rng.choice(bases) for _ in range(length)]
        for _ in range(matrix_rows)
    ]

In [4]:
# Build a random matrix of same-length sequences and show one sequence per line.
X = generate_matrix()

for sequence in X:
    print(*sequence)  # bases separated by single spaces

A C A C C G G
T C T A G C T
A G C T T A T
A T C A G A T
A A T C A G C
G T T G T A T


In [5]:
def Hamming_distance(matrix, reference_index=0):
    """Sum the Hamming distances between a reference row and every row.

    Each row of ``matrix`` is compared position by position with the row at
    ``reference_index``; every mismatching position adds 1 to the total. The
    reference row is also compared with itself, which contributes 0.

    Args:
        matrix: Sequence of rows (e.g. lists of nucleobase characters).
        reference_index: Index of the row used as the reference
            (defaults to 0, the first row).

    Returns:
        The total number of mismatching positions over all rows.

    Raises:
        IndexError: If ``reference_index`` is out of range for ``matrix``
            (including an empty ``matrix``).
        ValueError: If any row's length differs from the reference row's
            length. Hamming distance is only defined for equal lengths, so
            mismatched rows are rejected instead of being silently truncated.
    """
    reference = matrix[reference_index]
    expected_length = len(reference)

    distance = 0
    for row_index, row in enumerate(matrix):
        if len(row) != expected_length:
            raise ValueError(
                f'Row {row_index} has length {len(row)}, but the reference row '
                f'has length {expected_length}; Hamming distance requires '
                f'sequences of equal length.'
            )
        # Each mismatch is True (counts as 1) in the sum.
        distance += sum(base != ref_base for base, ref_base in zip(row, reference))
    return distance

In [6]:
# Total mismatches between the first row of X (the default reference) and every row of X.
Hamming_distance(X)

29

In [7]:
# Different-lengths version: every sequence gets its own random length.
def generate_random_dna_sequences():
    """Generate between 5 and 10 random DNA sequences of varying length.

    Returns:
        A list of sequences; each sequence is a list of 5 to 15
        single-character strings drawn from 'A', 'T', 'C', 'G'.
    """
    alphabet = ['A', 'T', 'C', 'G']
    how_many = random.randint(5, 10)

    # For each sequence its length is drawn first, then its bases.
    return [
        [random.choice(alphabet) for _ in range(random.randint(5, 15))]
        for _ in range(how_many)
    ]

# Generate a batch of variable-length sequences and print each one as a list.
random_dna = generate_random_dna_sequences()
for seq in random_dna:
    print(seq)

['C', 'C', 'A', 'G', 'G', 'C', 'A', 'A']
['C', 'G', 'G', 'A', 'G', 'T', 'A', 'T', 'A', 'C', 'G', 'C']
['A', 'A', 'G', 'A', 'A']
['T', 'C', 'T', 'C', 'G', 'C', 'T', 'C', 'T', 'C', 'T', 'A', 'C', 'A', 'A']
['C', 'A', 'G', 'G', 'A', 'C', 'C', 'C', 'T', 'A', 'G', 'A', 'A', 'C', 'T']


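The problem with the Hamming distance is that it is only defined for sequences of the SAME length, so it cannot be applied to the variable-length sequences generated above.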

## Consensus string 
Takes the most common nucleobase at each index (column) across all sequences and uses it to build the final string.

In [8]:
def consensus_string(matrix):
    """Build the consensus string of a matrix of equal-length sequences.

    For every column, the most frequent element of that column is taken; the
    chosen elements are concatenated in column order. When several elements
    tie for most frequent, the one that appears first in the column (i.e. in
    the earliest row) is used, following ``Counter.most_common`` ordering.

    Args:
        matrix: Sequence of rows, each a sequence of strings
            (e.g. single nucleobase characters).

    Returns:
        The consensus as a single string; ``''`` when ``matrix`` is empty or
        its rows are empty.

    Raises:
        ValueError: If the rows do not all have the same length, since a
            column-wise consensus is not well defined for ragged input.
    """
    if len(matrix) == 0:
        return ''

    expected_length = len(matrix[0])
    for row_index, row in enumerate(matrix):
        if len(row) != expected_length:
            raise ValueError(
                f'Row {row_index} has length {len(row)}, but row 0 has length '
                f'{expected_length}; all sequences must have the same length.'
            )

    # zip(*matrix) yields one tuple per column, in row order.
    return ''.join(
        Counter(column).most_common(1)[0][0]
        for column in zip(*matrix)
    )

In [9]:
# Generate a new random matrix and print it one sequence per line.
P = generate_matrix()

for sequence in P:
    print(*sequence)  # bases separated by single spaces

# Last expression of the cell, so the notebook displays the consensus string.
consensus_string(P)

C A C G C T C
G T G A A G C
T A T A T C T
A C C C C T C
G A A A G G A
G G G A C C A
A A A G T A A
A C G T C A G


'GAGACTC'