In [1]:
#MSA
from Bio.Seq import Seq
import Bio.SeqIO as SeqIO
import random
import numpy as np

sick_patient_seq = SeqIO.read('HBS.fasta', 'fasta')
print(sick_patient_seq)
print(sick_patient_seq.seq)


# number of individuals in each generation 
NUM_SEQ = 10
POP_SIZE = 10
SEQ_LEN = 15

def generate_sequence(base, length=5, prob=0.5):
    pattern = ""
    a = ["A","C","G","T"]
    
    for i in range(0, length):
        r = random.randrange(0,4)
        p = random.random()
        #print(r)
        if p >= prob:
            pattern += base[i]
        else:
            pattern += a[r]
            
    return pattern 

def generate_set(base):
    seq_set = []
    
    for i in range(0, NUM_SEQ):
        seq_set.append(generate_sequence(base, SEQ_LEN))
       
    return seq_set

base = 'ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGG'
sequences = generate_set(base)

ID: Sick
Name: Sick
Description: Sick patient HBB
Number of features: 0
Seq('ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGG...CAA', SingleLetterAlphabet())
ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGACTCCTGTGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGCAA


In [32]:
#Creation of initial population(movement of each sequence is a number between MOV_LEFT and MOV_RIGHT)
def initial_population(sequences, population_size, max_gap=3, mov_prob=0.1):
    population = []
    num_seq = len(sequences)
    sequence_movement = []
    sequences_movement = []
    #movement probability \
    for _ in range(population_size):
        for i in range(0, num_seq):
            sequence_movement = []
            for j in range(0, SEQ_LEN):
                random_mov = np.random.random()
                if (random_mov < mov_prob):
                    #np.random.randint(max_gap, size=(num_seq, SEQ_LEN))
                    seq_movement = np.random.randint(max_gap)
                else:
                    seq_movement = 0
                sequence_movement.append(seq_movement)
            sequences_movement.append(sequence_movement)
        population.append(sequences_movement)

    return population

#now a matrix of gaps for each character
initial_pop = initial_population(sequences, POP_SIZE)
print(initial_pop)

[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0], [2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 1, 0

In [38]:
def pad_sequence(pop, seqs, max_gap):
    new_seq = ''
    new_sequences = []
    for p in pop:
        new_seq = ''
        for seq, movement in zip(seqs,p):
            if movement == 0:
                new_seq += seq
            else:
                for i in range(0, movement):
                    new_seq += '-'
                new_seq += seq
        new_sequences.append(new_seq) 
    print(new_sequences)
    
def padded_sequences(initial_pop):
    #padded based on population
    padded_sequences = []
    max_gap = 0
    for i in range(0,len(initial_pop)):
        for j in range(0, len(initial_pop[i])):
            if max(initial_pop[i][j]) > max_gap:
                max_gap = max(initial_pop[i][j])
    #now for the max gap pad all the sequences and store them
    for i in range(0, POP_SIZE):
        padded_sequence = pad_sequence(initial_pop[i], sequences[i], max_gap)
        padded_sequences.append(padded_sequence)
    return padded_sequences
print(padded_sequences(initial_pop))


['ACGGATGCTGCGGAA', 'ACGGATG--CTGCGGAA', '--ACGGATGCTGCGGAA', 'ACGGATGCTGCGGAA', 'ACGGA-TGCTGCGG--AA', 'ACGGA-TGCTGCGGAA', 'ACGGAT-GCTGCGGAA', 'ACGGATGCTGC--GGAA', 'ACGGATGCTGCG-GAA', 'ACGGATGCTGCGGAA', 'ACGGAT-GCTGCGGAA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGCGGAA', 'AC--GG--ATGCTGCGGAA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGC-GGAA', 'ACGGATGCTGCGG-AA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGCG-GAA', 'AC-GG--ATGCTGCGGAA', 'ACGGATGCTGC-GGAA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGCGG--AA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGCGGAA', 'ACGGA-TGCTGC--GGAA', 'ACGGATGCTGCG--GAA', 'ACGGATGCTGCGGAA', 'ACG-GATGCT--GCGGAA', 'ACGGATG-CT-GCGGAA', 'ACGGATGCTGCGGAA', 'A-CGGATGCTGCGG--AA', 'ACG-G--ATGCT-GCGGAA', 'ACGGATGCTGC--GGAA', 'ACGGAT-GCTGCGGAA', 'ACGG--AT--GCTGCGGAA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGCGGAA', '-ACGGATGC-TGCGGAA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGCGGAA', 'ACGGATGCTGCGGAA', '-ACGGATGCTGCGGA