In [77]:
#MSA
from Bio.Seq import Seq
import Bio.SeqIO as SeqIO
import random
import numpy as np

# Read the FASTA record stored in HBS.fasta, then print the record summary
# followed by its raw sequence string.
sick_patient_seq = SeqIO.read('HBS.fasta', 'fasta')
print(sick_patient_seq)
print(sick_patient_seq.seq)


# Run configuration shared by the cells below.
# NUM_SEQ: how many sequences generate_set() produces.
NUM_SEQ = 10
# POP_SIZE: number of individuals (offset vectors) in each generation.
POP_SIZE = 10
# SEQ_LEN: length of every generated sequence; also used by pad_sequence()
# to size the padded rows.
SEQ_LEN = 15

ID: Sick
Name: Sick
Description: Sick patient HBB
Number of features: 0
Seq('ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGG...CAA', SingleLetterAlphabet())
ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGTGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA


In [78]:
def generate_sequence(base, length=5, prob=0.5):
    """Build a randomly perturbed copy of the first ``length`` symbols of ``base``.

    For every position a uniform number in [0, 1) is drawn. When that number
    is at least ``prob`` the symbol from ``base`` is kept; otherwise it is
    replaced by a nucleotide picked uniformly from A/C/G/T (which may happen
    to equal the original symbol).

    Args:
        base: Indexable template; ``base[i]`` is read for every kept position,
            so it must be at least ``length`` long in that case.
        length: Number of symbols to generate.
        prob: Threshold below which a position is replaced.

    Returns:
        The generated sequence as a string.
    """
    nucleotides = "ACGT"
    symbols = []

    for position in range(length):
        # Both numbers are drawn for every position, in this order, so the
        # random stream is consumed the same way whether or not we mutate.
        replacement_index = random.randrange(0, 4)
        draw = random.random()
        symbols.append(
            nucleotides[replacement_index] if draw < prob else base[position]
        )

    return "".join(symbols)

def generate_set(base):
    """Return ``NUM_SEQ`` random variants of ``base``, each ``SEQ_LEN`` long.

    Every variant is produced by an independent call to
    ``generate_sequence(base, SEQ_LEN)`` with its default mutation threshold.

    Args:
        base: Template sequence forwarded to ``generate_sequence``.

    Returns:
        A list of ``NUM_SEQ`` generated sequences.
    """
    return [generate_sequence(base, SEQ_LEN) for _ in range(NUM_SEQ)]

# Template for the synthetic data set: the leading 54 nucleotides of the
# patient sequence printed above. generate_set() asks for SEQ_LEN symbols per
# sequence, so only the first SEQ_LEN characters of this template are read.
base = 'ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGG'
sequences = generate_set(base)

In [79]:
# Show the randomly generated sequences that will be aligned.
sequences

['CCATTTGAACTTTAC',
 'ATTAAAACAACTGAA',
 'AAATGCTATTGTGAG',
 'TCCTTTTTATCTTGC',
 'AGATCTGAATGTTGC',
 'AGAACAGCTATTCAC',
 'GCGCTTGCTTCGGAC',
 'ACCCTCCCTGATGAC',
 'GCAGTTTGTTCTAGG',
 'CCGTTTGCATGTGAG']

In [80]:

def initial_population(sequences, population_size):
    """Create the starting population of offset vectors ("chromosomes").

    Each chromosome holds one integer offset per input sequence, taken from
    the half-open range -5..4 (ten candidate values).

    When there are at most ten sequences the offsets inside a chromosome are
    distinct (drawn without replacement). With more sequences than candidate
    offsets, drawing without replacement is impossible and ``random.sample``
    would raise ``ValueError``; in that case each offset is drawn
    independently, so repeats are allowed.

    Args:
        sequences: The sequences to align; only its length is used.
        population_size: Number of chromosomes to create.

    Returns:
        A list of ``population_size`` chromosomes, each a list of
        ``len(sequences)`` integer offsets.
    """
    movements = list(range(-5, 5))
    gene_count = len(sequences)
    population = []

    for _ in range(population_size):
        if gene_count <= len(movements):
            chromosome = random.sample(movements, gene_count)
        else:
            # Not enough distinct offsets to go round: allow repeats instead
            # of failing.
            chromosome = [random.choice(movements) for _ in range(gene_count)]
        population.append(chromosome)

    return population

# Starting population: POP_SIZE offset vectors, one offset per sequence.
initial_pop = initial_population(sequences, POP_SIZE)

In [81]:
# Show the starting population (one row of offsets per chromosome).
initial_pop

[[-5, 1, -2, 3, -4, -1, 0, 2, -3, 4],
 [3, 4, 0, 2, -4, -3, -2, -1, -5, 1],
 [-1, -3, 1, 2, 4, -4, -5, -2, 3, 0],
 [1, -1, -3, 2, -2, -4, 4, -5, 3, 0],
 [-1, 0, -2, -3, 1, -5, 3, -4, 2, 4],
 [-1, 1, 2, -5, -4, -2, 0, -3, 3, 4],
 [-5, -3, -1, 1, -4, 3, 2, -2, 0, 4],
 [2, -4, -1, 1, -2, 3, 0, -5, -3, 4],
 [1, -4, -1, 4, 3, -2, -5, 2, 0, -3],
 [-4, 4, -1, -5, -3, 3, 2, 1, -2, 0]]

In [143]:
def calc_score(seqs):
    """Score an alignment by counting distinct symbols column by column.

    ``seqs`` is converted to a 2-D array (one row per sequence, one column
    per alignment position). The score is the sum, over all columns, of the
    number of different symbols found in that column, so a lower score means
    the rows agree more.

    Args:
        seqs: Equal-length rows of single symbols (e.g. lists of characters).

    Returns:
        The integer score.
    """
    grid = np.array(seqs)
    return sum(len(set(grid[:, col])) for col in range(grid.shape[1]))

def MSA(ps, top=5, population=None):
    """Rank candidate alignments and return the best-scoring ones.

    Every alignment in ``ps`` is scored with ``calc_score`` and paired with
    the chromosome at the same index in ``population``. The pairs are sorted
    by ascending score and the first ``top`` are returned.

    Args:
        ps: List of padded alignments, one per chromosome.
        top: Maximum number of results. If fewer candidates exist, all of
            them are returned; a non-positive value yields an empty list.
        population: Chromosomes matching ``ps`` index for index. When
            omitted, the notebook-level ``initial_pop`` is used, which keeps
            existing ``MSA(ps)`` / ``MSA(ps, top)`` calls working.

    Returns:
        A list of ``(chromosome, score)`` tuples, lowest score first.

    Raises:
        IndexError: If ``population`` has fewer entries than ``ps``.
    """
    if population is None:
        # Backward-compatible default: fall back to the global population
        # this function was originally hard-wired to.
        population = initial_pop

    ranked = sorted(
        (
            (population[index], calc_score(alignment))
            for index, alignment in enumerate(ps)
        ),
        key=lambda entry: entry[1],
    )

    # Slicing (rather than indexing 0..top-1) tolerates top > len(ranked);
    # max() keeps a negative top from slicing off the tail instead.
    return ranked[:max(top, 0)]

def pad_sequence(seq, pad, l, r):
    """Place ``seq`` inside a row of gap characters at a given offset.

    The row has ``r - l + SEQ_LEN + 1`` cells, all initialised to ``'-'``.
    ``seq`` is written starting at index ``pad - l``, so an offset equal to
    ``l`` puts the sequence at the very start of the row.

    Args:
        seq: Sequence to place (its symbols are written one per cell).
        pad: Offset of this sequence.
        l: Lowest offset used for the row layout (the caller in this notebook
            passes the minimum offset of the chromosome).
        r: Highest offset used for the row layout (the caller passes the
            maximum offset of the chromosome).

    Returns:
        The padded row as a list of single symbols.

    Raises:
        ValueError: If the sequence would not fit inside the row. Without
            this check a negative or overflowing slice start makes the slice
            assignment insert symbols instead of overwriting gaps, silently
            producing a row of the wrong length.
    """
    width = r - l + SEQ_LEN + 1
    start = pad - l
    stop = start + len(seq)

    if start < 0 or stop > width:
        raise ValueError(
            "sequence of length %d at offset %d does not fit a row built for "
            "offsets %d..%d" % (len(seq), pad, l, r)
        )

    row = ['-'] * width
    row[start:stop] = seq
    return row



# Build one padded alignment per chromosome of the population.
padded_sequences = []
for offsets in initial_pop:
    # The smallest and largest offsets of the chromosome fix the row layout.
    lowest, highest = min(offsets), max(offsets)
    padded_sequences.append(
        [
            pad_sequence(sequence, shift, lowest, highest)
            for shift, sequence in zip(offsets, sequences)
        ]
    )


MSA(padded_sequences)


([-5, -3, -1, 1, -4, 3, 2, -2, 0, 4], 88)
([-1, 0, -2, -3, 1, -5, 3, -4, 2, 4], 90)
([-5, 1, -2, 3, -4, -1, 0, 2, -3, 4], 91)
([-1, -3, 1, 2, 4, -4, -5, -2, 3, 0], 92)
([2, -4, -1, 1, -2, 3, 0, -5, -3, 4], 92)


[([-5, -3, -1, 1, -4, 3, 2, -2, 0, 4], 88),
 ([-1, 0, -2, -3, 1, -5, 3, -4, 2, 4], 90),
 ([-5, 1, -2, 3, -4, -1, 0, 2, -3, 4], 91),
 ([-1, -3, 1, 2, 4, -4, -5, -2, 3, 0], 92),
 ([2, -4, -1, 1, -2, 3, 0, -5, -3, 4], 92)]