# BIF - Projet Minimapper

#### Objectifs
1. FM-index et la BWT pour indexer le génome de référence (TP 4)  
2. Pattern matching exact de taille k (ancre)  
3. Programme de programmation dynamique implémenté dans le TP 5

#### Bonus
4. Aligner rev-complement
5. Améliorer le temps de calcul
6. Améliorer l’empreinte mémoire

## Get BWT (Tests)

In [1]:
#Import class that can create bwt on a reference string
from reference import Reference

In [2]:
# Test for Reference constructor
ref = Reference()

In [3]:
# Test for Reference createIndex, build SA and BWT
ref.createIndex("AGTCGTC$") 

In [4]:
# Test for Reference suffixArray
ref.sa

[7, 0, 6, 3, 4, 1, 5, 2]

In [5]:
# Test for Reference BWT
ref.bwt

['C', '$', 'T', 'T', 'C', 'A', 'G', 'G']

## Sauvegarder / Récupérer

In [6]:
# Test for Reference save()
ref.save("myObject")

In [7]:
# Test for Reference load()
ref2 = Reference()
ref2.load("myObject")
print(ref2.text)
print(ref2.sa)
print(ref2.bwt)

AGTCGTC$
[7, 0, 6, 3, 4, 1, 5, 2]
['C', '$', 'T', 'T', 'C', 'A', 'G', 'G']


## Search

In [8]:
from exactPatternIdentification import ExactPatternIdentification

In [9]:
epi = ExactPatternIdentification(ref)

In [10]:
pattern ="GT"


In [11]:
epi.search(pattern)

[1, 4]

In [12]:
epi.search("TCG")

[2]

## Parse reads.fasta

In [13]:
def openReads(filename : str):
    """Open the reads file *filename* in text read mode and return the file object.

    The stream is not closed here; the caller is responsible for closing it.
    """
    return open(filename, "r")

def getNextRead(stream):
    """Read the next two-line record (header line, sequence line) from *stream*.

    Returns (name, content): the header line without its first character and
    without surrounding whitespace, and the following line stripped.
    Returns (-1, -1) when the trimmed header is empty (e.g. at end of file).
    """
    header = stream.readline()
    name = header[1:].strip()
    if name:
        sequence = stream.readline().strip()
        return name, sequence
    # Nothing usable on the header line: signal "no more reads"
    return -1, -1

In [14]:
stream = openReads("test1/reads.fasta")

In [15]:
name,content = getNextRead(stream)
print(name)
print(content)

read1
CGAGCTGGTCCTAACCCGGAGACCGCAGGCTGCGCGCGTATCGCAGCATCTGGCATTACGCCGCATCGAGTGCATGCACGAGAGAAGGAAGGGCACTGTT


In [16]:
name,content = getNextRead(stream)
print(name)
print(content)

read2
TGAAGACTAATCCTCATTCCCTGTGTCACCGCAATTTCAGCCAAGCCAGCCACGCGCTCCTTGTTAGTCGTATATGGCGTTAATGAGCTTCAAACCCCGA


In [17]:
class DMLinearMem():
    '''
    Dynamic-programming alignment scorer that keeps only two matrix rows.

    Stores a matrix of 2 lines and |T|+1 columns, the sequences S and T and
    the score system (match, mismatch, gap). Only the best score is computed:
    earlier rows are discarded, so memory is linear in |T|.

    NOTE(review): every cell is clamped at 0 and column 0 always stays 0, so
    the scoring behaves like a local alignment rather than a strict
    semi-global one -- confirm this is the intended scheme.
    '''

    def __init__(self, S, T, match, mismatch, gap):
        ''' Store the sequences, the score system and a zero-filled 2-row matrix. '''

        self.S=S
        self.T=T
        self.gap=gap
        self.match=match
        self.mismatch=mismatch

        # Only 2 lines: [0] is the previous row, [1] is the row being computed
        self.matrix = [[0] * (len(T) + 1) for _ in range(2)]

    def printLine(self,count):
        '''
        Print the row computed for S[count] (stored in matrix[1]).

        When count is 0, first print the header with the letters of T and the
        initial row (matrix[0]), which has no letter of S in front of it.
        '''
        width = 4

        # Print headers
        if count == 0:
            vide = " "

            # Letters of T, shifted by two cells (letter column + column 0)
            line = f"{vide:>{2*width}}"
            for letter in self.T:
                line += f"{letter:>{width}}"
            print(line)

            # Initial row (no letter of S)
            line = f"{vide:>{width}}"
            for value in self.matrix[0]:
                line += f"{value:>{width}}"
            print(line)

        # Row of the current letter of S
        line = f"{self.S[count]:>{width}}"
        for value in self.matrix[1]:
            line += f"{value:>{width}}"
        print(line)

    def score(self, a : str, b : str) -> int :
        '''
        Compare two characters.
        Return the match score if they are equal, else the mismatch score.
        '''
        if a == b:
            return self.match
        return self.mismatch

    def getBestScore(self,shouldPrint: bool = False):
        '''
        Fill the matrix row by row, keeping only 2 lines in memory.

        shouldPrint: when true, print each row as soon as it is computed.
        Returns the maximum value of the last computed row (0 when S is empty).
        '''
        nbColumns = len(self.T) + 1

        # Start from clean rows: a previous call leaves its last row in
        # matrix[0], which would otherwise be reused as the initial row and
        # change the result of any later call on the same instance.
        self.matrix = [[0] * nbColumns for _ in range(2)]

        for count, letter in enumerate(self.S):
            previous, current = self.matrix

            # Compute the row of the current letter of S
            for j in range(1, nbColumns):
                delete = previous[j] + self.gap
                insert = current[j-1] + self.gap
                match = previous[j-1] + self.score(letter, self.T[j-1])

                # Scores never go below 0
                current[j] = max(delete, insert, match, 0)

            if shouldPrint:
                self.printLine(count)

            # The computed row becomes the previous row; restart from zeros
            self.matrix[0] = current
            self.matrix[1] = [0] * nbColumns

        return max(self.matrix[0])
                
            

In [18]:
dml = DMLinearMem("GATAG","AATGAATCAAT", +2, -1, -1)

In [19]:
score = dml.getBestScore(True)
print("score : ",score)

           A   A   T   G   A   A   T   C   A   A   T
       0   0   0   0   0   0   0   0   0   0   0   0
   G   0   0   0   0   2   1   0   0   0   0   0   0
   A   0   2   2   1   1   4   3   2   1   2   2   1
   T   0   1   1   4   3   3   3   5   4   3   2   4
   A   0   2   3   3   3   5   5   4   4   6   5   4
   G   0   1   2   2   5   4   4   4   3   5   5   4
score :  5


## AlignSemiGlobal

In [20]:
def alignSemiGlobal(read,index,posSeedInRef,posSeedInRead,k,dmax):
    """Score *read* against the reference window surrounding one seed hit.

    read: the read sequence.
    index: object exposing the reference sequence as ``index.text``.
    posSeedInRef: position of the seed occurrence in the reference.
    posSeedInRead: offset of the seed inside the read.
    k: seed length (not used by this function).
    dmax: extra margin kept on each side of the window.

    Returns the best score computed by DMLinearMem with match=+2,
    mismatch=-1 and gap=-1.
    """
    lastRefIndex = len(index.text) - 1

    # Reference position facing the first character of the read
    readStartInRef = posSeedInRef - posSeedInRead

    # Window = read span extended by dmax on both sides, clamped to the reference
    windowStart = max(readStartInRef - dmax, 0)
    windowEnd = min(readStartInRef + dmax + len(read), lastRefIndex)

    window = index.text[windowStart:windowEnd + 1]
    return DMLinearMem(read, window, +2, -1, -1).getBestScore()
    
    

In [21]:
# Load the reference: first line is the FASTA header, second line the sequence.
# The file is closed as soon as both lines have been read.
with open("test1/reference.fasta", "r") as f:
    print(f.readline())
    line = f.readline().strip()
print(line)

# Build the index (suffix array and BWT) on the reference sequence
ref = Reference()
ref.createIndex(line)

>random sequence 1 consisting of 1000 bases.

CGAGCTGGTCCTAACCCGGAGACCGCAGGCTGCGCGCGTATCGCAGCATCTGGCATTACGCCGCATCGAGTGCATGCACGAGAGAAGGAAGGGCACTGTTCGCTACCAGTCTACCCTACATAAGATTATACACATCTTTGAGTTTTTTCGTCCATCAAGTAGCGAAACGGATGTAGCGCTTCCCGACGGACTTCATAGGCGATTCACTCACGGTCGATTGAGCCGGGCGGAGCATGCTACACGTGTAAATGTTCTCGGTTAACTATTATGGTTTGGATGATTGGTGCCAGTGTTGCTTGCGTCTGACGAGTACACACCCTATAGAGAAAGAATACCTCATGTTTGCGTAACGAGCGTTCAATTTCCTCCTGTTTGTACCTTACCCCGAGGGTTATCGAACCTTGCGGGCTGGGTCGGAAAACTTGTCTTAGAGGCCTGCGACCGTGATTACATTGCTACAGATTGTCCCCATTGTTCCGCGGAGGCATTTTCGCAGGACGTCTGATGTAATGCGGTTTCCCTTGAAGGATAGGCATAATTTGATTGATCACTTCACTGCGATCTAGCTATGTTTAGAGTAATAGTTTCCAACCCACTGAGGCACTCTCGTTCTGTGAAACAGTTAGGCCGGTTGCCGTGCGCAGCAACATGGTGTGGACACATCTCCCAGCCTGTTGAATGACGGCCTAGTGTCAGGAATTAGAGAGCCTTAACCCTATCAGGGTTGTCCCGACAGTTGACATCGCCCGAGATGGCTCTTTTGAAGGGCCCCAAGATCGGCTGCATCTACTTGGCACAACGGCTTTGCCTGGCTCGTTAAAATCCTGTCACATACGCGAGTTCCCGAAGTTGGCCGATTGCCCCTATCACCGTGTTGGAACCCATGTGTTAGCACAGACCTGAAGACTAATCCTCATTCCCTGTGTCACCGCAATTTCAGCCAAGCCAGCCACG

In [22]:
def getBestSemiGlobalAlg(read,index,k,dmax):
    """Find the best-scoring alignment of *read* on the reference by seed-and-extend.

    The read is cut into consecutive, non-overlapping seeds of length k (the
    last one may be shorter). Each exact occurrence of a seed in the reference
    is scored with alignSemiGlobal and the highest score is kept.

    read: the read sequence.
    index: reference index given to ExactPatternIdentification and alignSemiGlobal.
    k: seed length.
    dmax: margin forwarded to alignSemiGlobal.

    Returns (bestScore, bestPos), or (-1, -1) when no seed yields an alignment.
    NOTE(review): bestPos is the reference position of the seed hit that gave
    the best score, not the position of the start of the read -- confirm this
    is what the output format expects.
    """
    epi = ExactPatternIdentification(index)
    bestScore = -1
    bestPos = -1

    # Two hits with the same (position - offset in read) select the same
    # reference window in alignSemiGlobal, hence the same score: align each
    # such diagonal only once.
    alignedDiagonals = set()

    for posSeedInRead in range(0, len(read), k):
        seed = read[posSeedInRead:posSeedInRead + k]
        positions = epi.search(seed) # positions of the seed in the reference
        if positions == -1:
            # Seed not in the reference (the -1 convention is the one this code
            # already relied on). A read with errors still has other seeds that
            # may match, so skip this seed instead of giving up on the read.
            continue

        for position in positions:
            diagonal = position - posSeedInRead
            if diagonal in alignedDiagonals:
                continue
            alignedDiagonals.add(diagonal)

            score = alignSemiGlobal(read,index,position,posSeedInRead,k,dmax)
            if score > bestScore:
                bestScore = score
                bestPos = position

    return bestScore,bestPos

In [23]:
getBestSemiGlobalAlg("CGAGCTGGTCCTAACCCGGAGACCGCAGGCTGCGCGCGTATCGCAGCATCTGGCATTACGCCGCATCGAGTGCATGCACGAGAGAAGGAAGGGCACTGTT",ref,19,5)

(200, 0)

In [24]:
getBestSemiGlobalAlg("AGTC",ref,2,5)

(8, 107)

## Output Write

In [25]:
def appendResults(outputStream,readName:str,bestPos : int,isRevCompl:bool,bestScore:int):
    """Write one result line for a read to *outputStream*.

    The line holds, each followed by a tab: the read name, the best position,
    the strand ("-" when isRevCompl is true, "+" otherwise) and the best
    score; it ends with a newline.
    """
    strand = "-" if isRevCompl else "+"
    fields = (readName, bestPos, strand, bestScore)
    outputStream.write("".join(f"{field}\t" for field in fields) + "\n")

## Main

In [26]:
def main (index : Reference,readsFilename : str,outFilename : str, k: int, dmax : int):
    """
    Find the best alignment of every read on a reference and write the
    results to a text file.

    index: reference index forwarded to getBestSemiGlobalAlg.
    readsFilename: path of the reads file, read with openReads / getNextRead.
    outFilename: path of the result file (overwritten).
    k: seed length forwarded to getBestSemiGlobalAlg.
    dmax: margin forwarded to getBestSemiGlobalAlg.

    One line per read is written with appendResults. The name of each read is
    printed as it is processed.
    """
    # Both files are closed on exit, including when an alignment raises
    with openReads(readsFilename) as inputStream, open(outFilename, "w") as outputStream:
        readName,readContent = getNextRead(inputStream)

        # getNextRead returns -1 as content when there is no read left
        while readContent != -1:
            print(readName)
            bestScore,bestPos = getBestSemiGlobalAlg(readContent,index,k,dmax) # (score, pos)
            isRevCompl = False

            # Found a result
            if bestScore != -1:
                # Report the gap between the score and 2 * read length, i.e.
                # how far the alignment is from a score of 2 per base of the read
                bestScore = 2*len(readContent)-bestScore

            appendResults(outputStream,readName,bestPos,isRevCompl,bestScore) # with tabs
            readName,readContent = getNextRead(inputStream)



In [27]:
main(ref,"test1/reads.fasta","test1/res_reference_debug1.txt",20,4)

read1
read2
read3
read4
read5
read6
read7
read8
read9
read10
read11
read12
read13
read14
read15
read16
read17
read18
read19
read20
read21
read22
