# Examples of alignments with different score parameters
Let's try to align some reads containing fragments of V-segments to a V-segment, using different scoring parameters.

In [100]:
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio import AlignIO
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import MuscleCommandline

In [26]:
# Train example

# Biopython's pairwise alignment module is similar to the EMBOSS aligners.
# A pairwise alignment call returns a list of alignments; each of them contains
# seq1, seq2, score, start of alignment (first non-gap), end of alignment (last non-gap).
# `localxx` = local alignment where identical characters score 1 and there are
# no mismatch or gap penalties (per the pairwise2 function-naming scheme).

templ = 'ATATATTTGTGGG'
a = 'GTGG'
pairwise2.align.localxx(templ, a)

[('ATATATTTGTGGG', '--------GT-GG', 4.0, 8, 13),
 ('ATATATTTGTGGG', '--------GTG-G', 4.0, 8, 13),
 ('ATATATTTGTGGG', '--------GTGG-', 4.0, 8, 12)]

## Different alignments of reads on one IgV allele
Load the core genes and the reads.

In [113]:
# Core genes: the whole FASTA file is parsed, then only the first 10 records are kept.
with open('/home/arleg/ig_construction/data/main/simple_fasta/all/ig_hv_all') as source:
    parsed_cores = SeqIO.parse(source, 'fasta')
    cores = list(parsed_cores)[:10]

# Reads: stream the FASTA file and stop early instead of loading every record.
# Each record is stored before its index is checked, so records 0..101 are kept
# (at most 102 reads).
reads = []
with open('/home/arleg/PycharmProjects/igsegments/data/SRR6435693.fasta') as source:
    for index, record in enumerate(SeqIO.parse(source, 'fasta')):
        reads.append(record)
        if index >= 101:
            break

In [114]:
# Save the selected core genes as FASTA to the file '10cores' in the working
# directory; the value returned by SeqIO.write is shown as the cell output
# (10 in the recorded run).
SeqIO.write(cores, '10cores', 'fasta')

10

In [117]:
from v_segment_generation import combinations


# Parameters for `combinations`. They are defined once here and passed by name
# below, so editing a value in this cell actually changes the call (previously
# the call repeated every literal and these variables were unused).
path_to_genes = '10cores'
path_to_heptamers = '../data/conserve/hv7'
path_to_nonamers = '../data/conserve/hv9'
length = 23
monomers = ('A', 'T', 'G', 'C')
cum_distribution = (0.25, 0.5, 0.75, 1)

# NOTE(review): `vs` is indexed like a sequence later (vs[0] is aligned against
# a read) — presumably a collection of generated V-segment strings; confirm in
# v_segment_generation.
vs = combinations(
    path_to_genes=path_to_genes,
    path_to_heptamers=path_to_heptamers,
    path_to_nonamers=path_to_nonamers,
    length=length,
    monomers=monomers,
    cum_distribution=cum_distribution,
)

In [126]:
# Locally align the first generated V-segment against the first read
# (as a plain string) and show only the first alignment returned.
first_read_sequence = str(reads[0].seq)
local_alignments = pairwise2.align.localxx(vs[0], first_read_sequence)
print(local_alignments[0])

('CTGGGCCTGGACCCAGCAGCCCTCTGGGAAGGCGCTGGGGCACCTCAGCTCCAGGGGCAGCACACACTTCAGCCCAGCCTTTCTGGGCCAACTCTCCATCTGTAGAGACACATCCAAGGCCCAGTTATCCCTGCAGCTGAGCTCCGTGATGGCCAAGGGCAGGGCCGCACATTCCCGTGGGACACAGCGTCCCCTCGGCCCACCGGGGAACGACACAAACG', 'CTGGGCCTGGACCCAGCAGCCCTCTGGGAAGGCGCTGGGGCACCTCAGCTCCAGGGGCAGCACACACTTCAGCCCAGCCTTTCTGGGCCAACTCTCCATCTGTAGAGACACATCCAAGGCCCAGTTATCCCTGCAGCTGAGCTCCGTGATGGCCAAGGGCAGGGCCGCACA-T----T----C-C--CG-----T-----------GG---G------A--', 182.0, 0, 219)


In [89]:
# Sample multiple sequence alignment in CLUSTAL format (7 rows, split into two
# blocks), kept as a string so it can be written to disk and parsed back.
f = '''CLUSTAL X (1.81) multiple sequence alignment


COATB_BPIKE/30-81                   AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRLFKKFSS
Q9T0Q8_BPIKE/1-52                   AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIKLFKKFVS
COATB_BPI22/32-83                   DGTSTATSYATEAMNSLKTQATDLIDQTWPVVTSVAVAGLAIRLFKKFSS
COATB_BPM13/24-72                   AEGDDP---AKAAFNSLQASATEYIGYAWAMVVVIVGATIGIKLFKKFTS
COATB_BPZJ2/1-49                    AEGDDP---AKAAFDSLQASATEYIGYAWAMVVVIVGATIGIKLFKKFAS
Q9T0Q9_BPFD/1-49                    AEGDDP---AKAAFDSLQASATEYIGYAWAMVVVIVGATIGIKLFKKFTS
COATB_BPIF1/22-73                   FAADDATSQAKAAFDSLTAQATEMSGYAWALVVLVVGATVGIKLFKKFVS

COATB_BPIKE/30-81                   KA
Q9T0Q8_BPIKE/1-52                   RA
COATB_BPI22/32-83                   KA
COATB_BPM13/24-72                   KA
COATB_BPZJ2/1-49                    KA
Q9T0Q9_BPFD/1-49                    KA
COATB_BPIF1/22-73                   RA'''

# Write the sample to a file literally named 'f' in the working directory
# (the file name is the string 'f', not the variable `f`).
with open('f', 'w') as dest:
    dest.write(f)

In [92]:
# Read the file 'f' back as CLUSTAL and print each alignment it contains.
clustal_alignments = AlignIO.parse('f', 'clustal')
for alignment in clustal_alignments:
    print(alignment)

SingleLetterAlphabet() alignment with 7 rows and 52 columns
AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRL...SKA COATB_BPIKE/30-81
AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIKL...SRA Q9T0Q8_BPIKE/1-52
DGTSTATSYATEAMNSLKTQATDLIDQTWPVVTSVAVAGLAIRL...SKA COATB_BPI22/32-83
AEGDDP---AKAAFNSLQASATEYIGYAWAMVVVIVGATIGIKL...SKA COATB_BPM13/24-72
AEGDDP---AKAAFDSLQASATEYIGYAWAMVVVIVGATIGIKL...SKA COATB_BPZJ2/1-49
AEGDDP---AKAAFDSLQASATEYIGYAWAMVVVIVGATIGIKL...SKA Q9T0Q9_BPFD/1-49
FAADDATSQAKAAFDSLTAQATEMSGYAWALVVLVVGATVGIKL...SRA COATB_BPIF1/22-73


('',
 '\nMUSCLE v3.8.31 by Robert C. Edgar\n\nhttp://www.drive5.com/muscle\nThis software is donated to the public domain.\nPlease cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.\n\nfasta 8 seqs, max length 296, avg  length 293\n00:00:00    23 MB(-5%)  Iter   1    2.78%  K-mer dist pass 1\n00:00:00    23 MB(-5%)  Iter   1  100.00%  K-mer dist pass 1\n00:00:00    23 MB(-5%)  Iter   1    2.78%  K-mer dist pass 2\n00:00:00    23 MB(-5%)  Iter   1  100.00%  K-mer dist pass 2\n00:00:00    23 MB(-5%)  Iter   1   14.29%  Align node       \n00:00:00    25 MB(-5%)  Iter   1   28.57%  Align node\n00:00:00    25 MB(-5%)  Iter   1   42.86%  Align node\n00:00:00    25 MB(-5%)  Iter   1   57.14%  Align node\n00:00:00    25 MB(-5%)  Iter   1   71.43%  Align node\n00:00:00    26 MB(-5%)  Iter   1   85.71%  Align node\n00:00:00    26 MB(-5%)  Iter   1  100.00%  Align node\n00:00:00    26 MB(-5%)  Iter   1  100.00%  Align node\n00:00:00    26 MB(-5%)  Iter   1   12.50%  Root alignment\n00:00:00    2

# STOCKHOLM 1.0
#=GF SQ 8
IGHV1-18*02 CAGGTTCAGCTGGTGCAGTCTGGAGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGTTACACCTTTACCAGCTATGGTATCAGCTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGATGGATCAGCGCTTACAATGGTAACACAAACTATGCACAGAAGCTCCAGGGCAGAGTCACCATGACCACAGACACATCCACGAGCACAGCCTACATGGAGCTGAGGAGCCTAAGATCTGACGACACGGCC--------------------
#=GS IGHV1-18*02 AC IGHV1-18*02
#=GS IGHV1-18*02 DE IGHV1-18*02
IGHV1-18*03 CAGGTTCAGCTGGTGCAGTCTGGAGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGTTACACCTTTACCAGCTATGGTATCAGCTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGATGGATCAGCGCTTACAATGGTAACACAAACTATGCACAGAAGCTCCAGGGCAGAGTCACCATGACCACAGACACATCCACGAGCACAGCCTACATGGAGCTGAGGAGCCTGAGATCTGACGACATGGCCGTGTATTACTGTGCGAGAGA
#=GS IGHV1-18*03 AC IGHV1-18*03
#=GS IGHV1-18*03 DE IGHV1-18*03
IGHV1-18*04 CAGGTTCAGCTGGTGCAGTCTGGAGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGTTACACCTTTACCAGCTACGGTATCAGCTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGATGGATCAGCGCTTACAATGGTAACACAAACTATGCACAGAAGCTCCAGGGCAGAGTCACCATGACCACA

In [110]:
# NOTE(review): `allele` is not defined in any cell visible in this notebook —
# presumably a SeqRecord left over from an earlier/removed cell; confirm and
# define it explicitly before relying on this cell.
#
# `reads[1]` is a SeqRecord, not a sequence, so both inputs are converted to
# plain strings before alignment, the same way the vs[0]-vs-reads[0] cell does.
first_alignment = pairwise2.align.localxx(str(allele.seq), str(reads[1].seq))[0]

# Print every field of the first alignment, each followed by a '***' separator line.
for field in first_alignment:
    print(field, end='\n***\n')


In [7]:
# Two IGHV1-69 alleles in FASTA format (header line starting with '>', then
# the sequence wrapped over several lines).
fas = '''>L22582|IGHV1-69*01|Homo sapiens|F|V-REGION|376..671|296 nt|1| | | | |296+0=296| | |
caggtgcagctggtgcagtctggggctgaggtgaagaagcctgggtcctcggtgaaggtc
tcctgcaaggcttctggaggcaccttcagcagctatgctatcagctgggtgcgacaggcc
cctggacaagggcttgagtggatgggagggatcatccctatctttggtacagcaaactac
gcacagaagttccagggcagagtcacgattaccgcggacgaatccacgagcacagcctac
atggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga
>Z27506|IGHV1-69*02|Homo sapiens|F|V-REGION|1..294|294 nt|1| | | | |294+0=294| | |
caggtccagctggtgcaatctggggctgaggtgaagaagcctgggtcctcggtgaaggtc
tcctgcaaggcttctggaggcaccttcagcagctatactatcagctgggtgcgacaggcc
cctggacaagggcttgagtggatgggaaggatcatccctatccttggtatagcaaactac
gcacagaagttccagggcagagtcacgattaccgcggacaaatccacgagcacagcctac
atggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgaga'''

# Take both sequences from `fas` instead of pasting them a second time:
# split into records on '>', drop each header line (everything up to the first
# newline) and join the wrapped sequence lines into one string.
seq_a, seq_b = (
    entry.partition('\n')[2].replace('\n', '')
    for entry in fas.split('>')[1:]
)

# Global alignment with the "xx" scoring (identical characters score 1, no
# mismatch or gap penalties); only the first alignment is pretty-printed.
x = pairwise2.align.globalxx(seq_a, seq_b)
print(format_alignment(*x[0]))

caggtgc-agctggtgcag-tctggggctgaggtgaagaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttcagcagctatg-ctatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggag-ggatcatccctatct-ttggtac-agcaaactacgcacagaagttccagggcagagtcacgattaccgcggacgaa-tccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcgagaga
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
caggt-ccagctggtgca-atctggggctgaggtgaagaagcctgggtcctcggtgaaggtctcctgcaaggcttctggaggcaccttcagcagctat-actatcagctgggtgcgacaggcccctggacaagggcttgagtggatggga-aggatcatccctatc-cttggta-tagcaaactacgcacagaagttccagggcagagtcacgattaccgcggac-aaatccacgagcacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgc--gaga
  Score=287

