In [425]:
import pandas as pd 
import numpy as np
import random
from Bio import pairwise2
from Bio.Seq import Seq 

In [426]:

# for efficient exact matching
def bwt(p):
    """Build a Burrows-Wheeler style index of the string ``p``.

    The text that is indexed is ``'$' + p``; every cyclic rotation of it is
    generated and the rotations are sorted lexicographically.

    Returns ``None`` when ``p`` is empty, otherwise a tuple ``(df, rank, rng)``:

    * ``df``   -- one row per sorted rotation with columns ``index`` (start
      offset of the rotation inside ``'$' + p``), ``L`` (first character of
      the rotation) and ``R`` (last character of the rotation).
    * ``rank`` -- one column per symbol; the value in row ``r`` is the number
      of times the symbol occurs in ``R`` strictly before row ``r``.  An extra
      row labelled ``len('$' + p)`` holds the total count of each symbol.
    * ``rng``  -- indexed by symbol; ``start``/``end`` delimit the half-open
      block of rows of ``df`` whose first character is that symbol.
    """
    if not len(p):
        return None

    text = '$' + p
    size = len(text)

    # Pair each rotation with its start offset, then order by rotation text.
    ordered = sorted(
        ((start, text[start:] + text[:start]) for start in range(size)),
        key=lambda pair: pair[1],
    )
    df = pd.DataFrame(
        [[start, rotation[0], rotation[-1]] for start, rotation in ordered],
        columns=['index', 'L', 'R'],
    )

    last_column = df['R']
    alphabet = sorted(list(last_column.unique()))

    # Exclusive running count of every symbol down the last column.
    rank = pd.DataFrame()
    for symbol in alphabet:
        is_symbol = last_column == symbol
        rank[symbol] = is_symbol.cumsum() - is_symbol
    # Closing row: totals, so a half-open range may end at len(text).
    rank.loc[size] = last_column.value_counts().sort_index()

    # Row ranges of the sorted rotations grouped by their first character.
    first_counts = df['L'].value_counts().sort_index()
    running_total = first_counts.cumsum()
    rng = pd.DataFrame([], index=alphabet)
    rng['start'] = running_total - first_counts
    rng['end'] = running_total

    return df, rank, rng



In [427]:

# Uses BWT to get the exact matching regions in the 2 sequences
def get_lmem(q,df,rank,rng):
    """Find exact matches of pieces of ``q`` using a backward search.

    ``q`` is scanned from its last character to its first.  The current
    match is extended one character to the left for as long as the extended
    string still occurs in the index; when it no longer does, the match
    collected so far is recorded and a new match is started at the
    character that failed.  The matches therefore never overlap in ``q``.

    Parameters
    ----------
    q : str
        Query sequence.
    df, rank, rng : pandas.DataFrame
        Index tables; presumably the three values returned by ``bwt`` for
        the reference sequence (``df['index']``, ``rank.loc[row, symbol]``
        and ``rng.loc[symbol, 'start'/'end']`` are read here).

    Returns
    -------
    list of tuple
        ``(i1, i2, j1, j2, offset)`` per match of length >= 2, meaning the
        reference slice ``[i1:i2]`` equals ``q[j1:j2]`` and
        ``offset == i1 - j1``.  ``i1`` is ``df['index'] - 1`` for the first
        row of the matching range (assumes the index was built over
        ``'$' + reference``).  Matches are listed right-to-left in ``q``.
        An empty ``q`` yields ``[]``.

    Notes
    -----
    Symbols of ``q`` that are not present in ``rng``/``rank`` are not
    handled: the ``.loc`` lookups fail for them.
    """
    n = len(q)
    if n == 0:
        # Nothing to search; q[-1] below would raise IndexError otherwise.
        return []

    min_len = 2  # shortest match that is reported
    lmem_list = []

    def record(row_start, j1, j2):
        """Store the match q[j1:j2+1] whose index rows begin at row_start."""
        length = j2 - j1 + 1
        if length >= min_len:
            # Rotation offsets are relative to '$' + reference, hence the -1.
            i1 = int(df['index'].values[row_start]) - 1
            lmem_list.append((i1, i1 + length, j1, j2 + 1, i1 - j1))

    # Rows of the sorted rotations that start with the last character of q.
    st, end = rng.loc[q[-1]]
    j2 = n - 1  # inclusive right end (in q) of the match being extended

    for j in range(n - 2, -1, -1):
        symbol = q[j]
        before_st = rank.loc[st, symbol]
        before_end = rank.loc[end, symbol]

        if before_st == before_end:
            # No row in [st, end) is preceded by `symbol`: the match cannot
            # be extended.  Report q[j+1 .. j2] and restart from q[j] alone.
            record(st, j + 1, j2)
            j2 = j
            st = rng.loc[symbol, 'start']
            end = rng.loc[symbol, 'end']
        else:
            # Narrow the row range to the rotations starting with symbol + match.
            st = rng.loc[symbol, 'start'] + before_st
            end = rng.loc[symbol, 'start'] + before_end
            if j == 0:
                # Reached the left end of q while still matching.
                record(st, 0, j2)

    return lmem_list
        

In [428]:

def get_colinear_sets(lmem_list, thresh=5):
    """Group matches whose offsets lie close together.

    ``lmem_list`` is sorted **in place** by its last element (the offset).
    Walking the sorted list, a match joins the current group when its offset
    differs from the offset of the match just before it by at most
    ``thresh``; otherwise the current group is closed and a new one starts.
    Groups holding a single match are discarded.

    Parameters
    ----------
    lmem_list : list of sequence
        Match records whose last element is a numeric offset.
    thresh : int, optional
        Largest allowed offset difference between neighbouring matches of a
        group.  Defaults to 5 (the value previously hard-coded "for easier
        visualization"; the original comment suggests 30 otherwise).

    Returns
    -------
    list of list
        The groups, ordered by offset.  ``[]`` for an empty input.
    """
    if not lmem_list:
        # lmem_list[0] below would raise IndexError on an empty list.
        return []

    lmem_list.sort(key=lambda x: x[-1])

    colinear_sets = []
    current = [lmem_list[0]]
    for previous, match in zip(lmem_list, lmem_list[1:]):
        if abs(match[-1] - previous[-1]) <= thresh:
            current.append(match)
        else:
            if len(current) > 1:
                colinear_sets.append(current)
            current = [match]
    # Flush the group that was still open when the list ended.
    if len(current) > 1:
        colinear_sets.append(current)
    return colinear_sets

In [429]:


def remove_overlap(col_set):
    """Trim neighbouring matches so that they no longer overlap in p.

    Each record is ``(i1, i2, j1, j2, offset)``: the first two entries are a
    half-open range in the first sequence, the next two the corresponding
    range in the second sequence.  For every pair of neighbours
    ``col_set[k]``, ``col_set[k + 1]`` whose first-sequence ranges overlap
    (``i2`` of the first is greater than ``i1`` of the second), the shorter
    of the two is shortened:

    * first strictly longer -> the overlapping head of the second record is
      removed, moving *both* of its start coordinates forward by the overlap
      so the two ranges keep describing the same matched text;
    * otherwise -> the tail of the first record is cut back to where the
      second one starts.

    The offset of a trimmed record is left unchanged, because both of its
    coordinates move by the same amount.

    ``col_set`` is modified in place and also returned.  Trimmed records are
    written back as tuples.  A record that is completely covered by its
    neighbour ends up with a non-positive length; such records are not
    filtered out here.
    """
    for k in range(len(col_set) - 1):
        first = col_set[k]
        second = col_set[k + 1]

        overlap = first[1] - second[0]
        if overlap <= 0:
            continue

        first_len = first[1] - first[0]
        second_len = second[1] - second[0]
        if first_len > second_len:
            # Drop the first `overlap` characters of the second match in
            # both sequences; its end coordinates and offset stay as they are.
            col_set[k + 1] = (
                first[1],
                second[1],
                second[2] + overlap,
                second[3],
                second[4],
            )
        else:
            # Cut the first match so that it ends where the second begins.
            kept = second[0] - first[0]
            col_set[k] = (
                first[0],
                second[0],
                first[2],
                first[2] + kept,
                first[4],
            )
    return col_set


In [430]:
def non_outlier(col_set, thresh=2):
    """Keep the longest chain of matches with slowly varying offsets.

    ``col_set`` is sorted **in place** by element 2 of each record.  A chain
    is a subsequence of the sorted records in which every two consecutive
    members have offsets (last element) differing by at most ``thresh``.
    The longest such chain is returned; everything outside it is treated as
    an outlier.

    Parameters
    ----------
    col_set : list of sequence
        Match records; element 2 is the sort key and the last element is a
        numeric offset.
    thresh : int, optional
        Largest allowed offset difference between consecutive chain members.
        Defaults to 2 (the value previously hard-coded "for easier
        visualization"; the original comment suggests 10 otherwise).

    Returns
    -------
    list
        The chain, in sorted order.  ``[]`` for an empty input.
    """
    if not col_set:
        # argmax on a zero-length array raises ValueError.
        return []

    col_set.sort(key=lambda x: x[2])
    n = len(col_set)

    # dp[i] = length of the longest chain that ends at record i.
    dp = np.ones(n)
    for i in range(1, n):
        for j in range(i):
            if abs(col_set[i][-1] - col_set[j][-1]) <= thresh:
                dp[i] = max(dp[i], dp[j] + 1)

    # Walk back from the end of the best chain, each time taking the nearest
    # earlier record that is compatible and exactly one step shorter.
    i = int(dp.argmax())
    non_outlier_set = [col_set[i]]
    for j in range(i - 1, -1, -1):
        if abs(col_set[i][-1] - col_set[j][-1]) <= thresh and dp[j] == dp[i] - 1:
            non_outlier_set.append(col_set[j])
            i = j
    non_outlier_set.reverse()
    return non_outlier_set


In [431]:

def preprocess_colinear_sets(colinear_sets): 
    """Clean every colinear set and store the result back in the list.

    Each set is sorted in place by element 2 of its records, passed through
    ``non_outlier`` and then through ``remove_overlap``; the outcome replaces
    the set at the same position.  The (mutated) outer list is returned.
    """
    for slot, members in enumerate(colinear_sets):
        members.sort(key=lambda x: x[2])
        # Outliers are dropped first, overlaps are resolved on what remains.
        colinear_sets[slot] = remove_overlap(non_outlier(members))
    return colinear_sets

        

In [432]:
#Write helper functions for gap filling and alignment here

In [433]:
# Two random DNA sequences used as demo input for the aligner.
# The generator is seeded so that Restart & Run All reproduces the same pair.
SEED = 0
SEQ_LEN = 500  # kept small for easier visualization; the original note suggests 100000
random.seed(SEED)
p = ''.join(random.choice('ACGT') for _ in range(SEQ_LEN))
q = ''.join(random.choice('ACGT') for _ in range(SEQ_LEN))

# p='AAATTGGCC'
# q='AAACCGGCC'


In [434]:
# Show the two input sequences (p first, then q).
print(p)
print(q)

TCACCGCGCGCGTCCTATCCTGCATGACGTTCGTCATGTTCTATGTACTATATAGTTGGAGAGCGTTGTCTCTATTACTAACTCTACGGTAGAGCTGCGCGAGAGTGTGATCATATGACCACCAACCTAGGTACATTTGATACTTAGAAGGGGACCTCAAGCCATCGAGACATCAACGCAAGGAAGCACGGGGGAAATCATCAGAGTCTTAATGCTGTATTTGTGTAAATGTCGGACTTGATTGTAAATTGCCGAAGAGCTAAATGGGGTCCCGCACTTGACCTGGTCCGCGCGTGTCTTGAGTAATTGTGGTCGATCGGTCCCGTCTCCCCAAAGCTCGTTACTGGTAGGATCTCGAACGTGTACCACCTCCACCCGGGACCGAGGCACGTAATCTCAAGTCGAAGCTTTGCTCCGCGTAGTTCGGAGGTTTCTTCTCCGGCCTATAATGGTGCCGAATAAACTCAACCGGCAGGCAGGTTGTACGTGGTCTTAAGCAC
TTCGAGTTACGGGTTATCACTTTCGGACCAGTAAGTGTGTCGGTCTTTGGTAAAGTTTGGCTCGGGCTGTCTTTTCATAGCGGCCCCGAAGCAAGGCCTGTGTACCACCTAGGGACCTGTCGGTTAAATTCAGGTAAGCATGCAGGTAGACTGTCAGCGTACAATTTTCTAACGCCATCTCACGTCACAATACGGACACCGCCTATCTCCGACGCGGCCAGTGATCACGTTATCGAATAACAATGGTACGAGCCAGGAGTACTCACATCTGATCCCCCGGGTACTATACCGCATAACGCTGCGCTCCGCAGGGAAACACCATTCGCCAATAACCGGCGAATTAGGTTACCAACATGTTAGTACCCTGCAAGGGCTGGCGACTATCAGTGCAGATGTCCACACGAGTAGGGTGAACAGCAAGACGGACCCTGCGATTTGGATTCTTCTAAACGCATCCCCGTAACCGTTAGGTACCCCCAACTGGCGTGCATATTTCAAA

In [435]:
# Build the index tables for p; `df` is the last expression so the notebook displays it.
# Note: bwt returns None for an empty p, in which case this unpacking fails.
df,rank,rng = bwt(p)
df

Unnamed: 0,index,L,R
0,0,$,C
1,461,A,T
2,333,A,C
3,195,A,G
4,262,A,T
...,...,...,...
496,221,T,T
497,431,T,G
498,136,T,A
499,409,T,C


In [436]:
# Exact matches of q against the index of p, shown as a table with one row
# per match: p[i1:i2] == q[j1:j2] and offset = i1 - j1.
lmem_set = get_lmem(q,df,rank,rng)
pd.DataFrame(lmem_set,columns=['i1','i2','j1','j2','offset'])

Unnamed: 0,i1,i2,j1,j2,offset
0,331,336,495,500,-164
1,217,222,490,495,-273
2,20,24,486,490,-466
3,98,101,483,486,-385
4,342,346,479,483,-137
...,...,...,...,...,...
110,0,4,16,20,-16
111,339,343,12,16,327
112,84,89,7,12,77
113,300,304,3,7,297


In [437]:
# Group the matches by similar offset.
# Side effect: get_colinear_sets sorts lmem_set in place by offset.
colinear_sets = get_colinear_sets(lmem_set)
colinear_sets

[[(18, 22, 427, 431, -409), (38, 43, 443, 448, -405)],
 [(98, 101, 483, 486, -385), (55, 60, 435, 440, -380)],
 [(93, 97, 372, 376, -279),
  (114, 117, 392, 395, -278),
  (98, 102, 376, 380, -278),
  (174, 180, 448, 454, -274),
  (217, 222, 490, 495, -273)],
 [(125, 128, 363, 366, -238), (44, 52, 280, 288, -236)],
 [(78, 82, 293, 297, -215), (127, 132, 341, 346, -214)],
 [(232, 237, 422, 427, -190),
  (177, 183, 366, 372, -189),
  (60, 64, 249, 253, -189),
  (0, 4, 184, 188, -184),
  (169, 173, 351, 355, -182)],
 [(331, 336, 495, 500, -164),
  (254, 258, 418, 422, -164),
  (160, 163, 324, 327, -164),
  (78, 82, 237, 241, -159)],
 [(328, 333, 474, 479, -146),
  (322, 326, 463, 467, -141),
  (266, 270, 407, 411, -141),
  (342, 346, 479, 483, -137)],
 [(191, 197, 310, 316, -119),
  (313, 317, 431, 435, -118),
  (21, 25, 137, 141, -116),
  (129, 134, 244, 249, -115),
  (97, 101, 212, 216, -115),
  (106, 113, 220, 227, -114)],
 [(169, 173, 264, 268, -95), (77, 82, 168, 173, -91)],
 [(160, 1

In [438]:
# Drop outliers and resolve overlaps inside every set.
# preprocess_colinear_sets rewrites the list in place and returns that same list,
# so the result of the previous cell is overwritten here.
colinear_sets = preprocess_colinear_sets(colinear_sets)
colinear_sets

[[(18, 22, 427, 431, -409)],
 [(55, 60, 435, 440, -380)],
 [(93, 97, 372, 376, -279),
  (98, 102, 376, 380, -278),
  (114, 117, 392, 395, -278)],
 [(44, 52, 280, 288, -236), (125, 128, 363, 366, -238)],
 [(78, 82, 293, 297, -215), (127, 132, 341, 346, -214)],
 [(60, 64, 249, 253, -189),
  (177, 183, 366, 372, -189),
  (232, 237, 422, 427, -190)],
 [(160, 163, 324, 327, -164),
  (254, 258, 418, 422, -164),
  (331, 336, 495, 500, -164)],
 [(266, 270, 407, 411, -141), (322, 326, 463, 467, -141)],
 [(21, 25, 137, 141, -116),
  (97, 101, 212, 216, -115),
  (106, 113, 220, 227, -114),
  (129, 134, 244, 249, -115)],
 [(77, 82, 168, 173, -91)],
 [(160, 164, 216, 220, -56), (356, 360, 411, 415, -55)],
 [(51, 55, 76, 80, -25), (370, 374, 395, 399, -25)],
 [(0, 4, 16, 20, -16),
  (271, 276, 288, 293, -17),
  (339, 343, 355, 359, -16),
  (422, 425, 440, 443, -18)],
 [(124, 131, 106, 113, 18), (167, 171, 147, 151, 20)],
 [(251, 255, 208, 212, 43), (300, 304, 256, 260, 44)],
 [(135, 139, 55, 59, 80)

In [439]:
# Gap regions between consecutive exact matches of each colinear set.
# Every gap is a tuple (i1, i2, j1, j2): it runs from the end of one match
# to the start of the next one, in p (i1, i2) and in q (j1, j2).
# normal_set keeps one list of gaps per colinear set (empty for single-match sets).
normal_set=[]
for similar_set in colinear_sets:
    sets=[]
    for i in range(len(similar_set)-1):
        exact_region1=similar_set[i]
        exact_region2=similar_set[i+1]
        gap_region=(exact_region1[1],exact_region2[0],exact_region1[3],exact_region2[2])
        sets.append(gap_region)
    normal_set.append(sets)

In [440]:
# Display the gap regions, one inner list per colinear set.
normal_set

[[],
 [],
 [(97, 98, 376, 376), (102, 114, 380, 392)],
 [(52, 125, 288, 363)],
 [(82, 127, 297, 341)],
 [(64, 177, 253, 366), (183, 232, 372, 422)],
 [(163, 254, 327, 418), (258, 331, 422, 495)],
 [(270, 322, 411, 463)],
 [(25, 97, 141, 212), (101, 106, 216, 220), (113, 129, 227, 244)],
 [],
 [(164, 356, 220, 411)],
 [(55, 370, 80, 395)],
 [(4, 271, 20, 288), (276, 339, 293, 355), (343, 422, 359, 440)],
 [(131, 167, 113, 147)],
 [(255, 300, 212, 256)],
 [(139, 229, 59, 151), (233, 495, 155, 415)],
 [(379, 422, 280, 321)],
 [],
 [(265, 466, 129, 330)],
 [(256, 279, 90, 113)],
 [],
 [(397, 457, 180, 241)],
 [],
 [(304, 344, 7, 47), (348, 457, 51, 162)],
 [(434, 439, 76, 80)]]

In [441]:
# Lookup tables keyed by start position in p; they are filled by the following cells.
similar_regions={}  # start in p -> exact-match record
normal_regions={}   # start in p -> gap record


In [442]:
# Index every exact-match record by its start position in p (element 0).
# All colinear sets share this one dict, so a record with the same start as an
# earlier one replaces it.
for sim_r in colinear_sets:
    for touple in sim_r:
        similar_regions[touple[0]]=touple

In [443]:
# Index every gap record by its start position in p (element 0).
# As above, gaps from different colinear sets that start at the same position
# overwrite each other.
for gap_r in normal_set:
    for touple in gap_r:
        normal_regions[touple[0]]=touple

In [444]:
# Accumulators for the three output rows: the p row, the q row and the
# marker row printed between them.
final_p=""
final_q=""
alignment=""

In [445]:
# Walk along p from left to right and assemble the three output rows
# (final_p, alignment, final_q):
#   * at the start of an exact-match region both substrings are copied and
#     every column is marked with "|";
#   * at the start of a gap region the two gap substrings are globally
#     aligned and the aligned strings are appended;
#   * anywhere else a single character is copied and marked " ".
# A region is only used when it moves `position` forward; otherwise the
# single-character branch is taken, so the loop always terminates.
# NOTE(review): the lookup tables mix regions of all colinear sets and the
# fallback branch reads q at the p position -- confirm this is intended.
position = 0
while position<len(p):
    if position in similar_regions and similar_regions[position][1]>position:
        touple=similar_regions[position]
        final_p+=p[touple[0]:touple[1]]
        final_q+=q[touple[2]:touple[3]]
        alignment+="|"*(touple[1]-touple[0])
        position=touple[1]
    elif position in normal_regions and normal_regions[position][1]>position:
        touple=normal_regions[position]
        tempp=p[touple[0]:touple[1]]
        tempq=q[touple[2]:touple[3]]
        if tempq:
            # match=2, mismatch=-0.5, gap open=-0.5, gap extend=-0.5
            alignments = pairwise2.align.globalms(tempp, tempq,2, -.5,-.5,-0.5)
            onealignment=alignments[0]  # taking only 1 max score alignment since otherwise there could be many possibilities
            alignment_p=onealignment[0]
            alignment_q=onealignment[1]
        else:
            # Nothing on the q side of this gap: the only global alignment
            # pairs every character of p with a gap, so skip the aligner
            # (its result list would have no first element to take).
            alignment_p=tempp
            alignment_q="-"*len(tempp)
        # Marker row for the gap: "|" where the aligned columns agree.
        # (Previously built in a variable named `all`, which replaced the
        # builtin of that name for the rest of the notebook.)
        gap_marks="".join("|" if a==b else " " for a,b in zip(alignment_p,alignment_q))
        alignment+=gap_marks
        final_p+=alignment_p
        final_q+=alignment_q
        position=touple[1]
    else:
        final_p+=p[position]
        # q may be shorter than p; pad with a gap instead of indexing past its end.
        final_q+=q[position] if position<len(q) else "-"
        position+=1
        alignment+=" "

In [446]:
# Save the result as three lines: the p row, the marker row, the q row.
with open('alignment.txt', 'w') as f:
    f.write("\n".join((final_p, alignment, final_q)) + "\n")