# 1 - MODULES AND CONSTANTS

### MODULES AND LIBRARIES

In [12]:
import sys

### CONSTANTS

In [13]:
# Default input files; being relative paths, they are resolved against the current working directory.
REFERENCE_SEQUENCE_INPUT_PATH = 'referenceSequence.txt'  # default `path` of readSequence
QUERY_DATA_INPUT_PATH =  'queryData.txt'  # default `path` of readQueryData

# 2 - LOAD DATA

### Function: readSequence
reads the given sequence to be evaluated and returns it in a string format <br> 
Raises a ValueError if the sequence contains characters other than A, C, G, T, -, X

In [14]:
def readSequence(path:str=REFERENCE_SEQUENCE_INPUT_PATH)->str:
    """Load the reference sequence stored in a text file.

    Each line is stripped of surrounding whitespace and upper-cased, then all
    lines are concatenated into a single string.

    Args:
        path (str, optional): location of the txt file to read. Defaults to REFERENCE_SEQUENCE_INPUT_PATH.

    Raises:
        ValueError: if the sequence contains any character other than A, C, G, T, -, X

    Returns:
        sequence (str): the concatenated, upper-cased sequence
    """
    allowedSymbols = {'A', 'C', 'G', 'T', '-', 'X'}

    with open(path, 'r', encoding='UTF-8') as handle:
        sequence = ''.join(row.strip().upper() for row in handle)

    # Guard clause: reject the input as soon as a foreign symbol is present
    if not set(sequence) <= allowedSymbols:
        raise ValueError('Input format not correct')

    return sequence

### Function: readQueryData 
reads the given query sequences and returns it as a list of string
<br> 
Raises a ValueError if any of the sequences contains characters other than A, C, G, T, -, X

In [15]:
def readQueryData(path:str=QUERY_DATA_INPUT_PATH)->list[str]:
    """Read the query sequences to be aligned, one sequence per line.

    Each line is stripped of surrounding whitespace and upper-cased. Lines that
    are empty after stripping are skipped, so blank or trailing empty lines in
    the file do not produce empty query sequences.

    Args:
        path (str, optional): The path to the file containing the data to be aligned. Defaults to QUERY_DATA_INPUT_PATH.

    Raises:
        ValueError: if any of the sequences contains characters other than A, C, G, T, -, X

    Returns:
        sequences (list[str]): the list of sequences to be matched, in file order
    """
    allowedSymbols = {'A', 'C', 'G', 'T', '-', 'X'}

    with open(path, 'r', encoding='UTF-8') as fp:
        cleaned = (line.strip().upper() for line in fp)
        # An empty string would pass validation and later be "aligned" as a
        # zero-length query, so blank lines are dropped here.
        sequences = [item for item in cleaned if item]

    for item in sequences:
        if not allowedSymbols.issuperset(item):
            raise ValueError('Input format not correct')

    return sequences

# 3 - SCORE ALIGNMENT

### Function: score_alignment 
takes in two strings and returns the alignment score of the two strings. <br>
The alignment score is calculated by summing the number of matching characters and subtracting the number of
mismatching characters. <br>
For example, the alignment of "ACGGT" and "ACGGC" would have a score of 3 (4 matching characters and 1 mismatching character).

In [16]:
def score_alignment(seq1:str, seq2:str)->int:
    """Compute the alignment score of two equally long sequences.

    Positions are compared pairwise: every match adds 1 to the score and
    every mismatch subtracts 1.

    Args:
        seq1 (str): the first sequence
        seq2 (str): the second sequence

    Raises:
        ValueError: if the two sequences do not have the same length

    Returns:
        score (int): number of matching positions minus number of mismatching positions
    """
    if len(seq1) != len(seq2):
        raise ValueError('Lengths of the two sequences should match')

    total = 0
    for left, right in zip(seq1, seq2):
        total += 1 if left == right else -1

    return total

# 4 - FIND BEST ALIGNMENT

### Function find_best_alignment 
takes in a reference string, a query string, and a scoring function <br>
returns the starting position and the score of the best alignment of the query string within the reference string. 

In [17]:
def find_best_alignment(reference:str, query:str, scoringFunction=score_alignment)->tuple[int, int|float]:
    """Find where a query sequence aligns best inside a reference sequence.

    Every window of the reference with the same length as the query is scored
    against the query with ``scoringFunction``; the window with the highest
    score wins. When several windows share the highest score, the one with the
    lowest starting position is returned.

    Args:
        reference (str): the reference sequence
        query (str): the sub sequence to be aligned to the reference sequence
        scoringFunction (function, optional): the function used to score a window, it must accept two strings as input (in the order reference window, query) and must return an integer or float. Defaults to score_alignment.

    Raises:
        ValueError: if the reference sequence is shorter than or as long as the query sequence

    Returns:
        result (tuple): a ``(position, best score)`` pair where
            position (int): the starting position in the reference sequence for which the best alignment score was obtained
            best score (int | float): the best alignment score obtained by the query
    """
    windowLength = len(query)

    if len(reference) <= windowLength:
        raise ValueError('reference sequence length should be higher than the query sequence length')

    maximumScore = float('-inf')

    # The windows are scanned from the last one to the first one: together with
    # `>=`, a tie overwrites the previously stored (higher) position, so the
    # leftmost best-scoring window is the one finally kept.
    for i in range(len(reference) - windowLength, -1, -1):
        score = scoringFunction(reference[i:i + windowLength], query)

        if score >= maximumScore:
            maximumScore = score
            pos = i

    return pos, maximumScore

# 5 - ALIGN READS

### Function: align_reads 
takes in a reference genome and a list of query strings
<br> returns the alignments of the query strings to the reference genome.

In [18]:
def align_reads(reference:str, queries:list[str]|set[str], alignmentFunction=score_alignment)->list[list[str|int|float]]:
    """Evaluate the best possible alignment of every query against the reference.

    Args:
        reference (str): the reference sequence
        queries (list[str] | set[str]): the list (or set) of sub sequences to be queried
        alignmentFunction (function, optional): the scoring function forwarded to find_best_alignment. Defaults to score_alignment.

    Raises:
        ValueError: propagated from find_best_alignment when the reference is not longer than a query

    Returns:
        results (list): a list of lists containing the results for each query, in iteration order.
            Each query will have the following results:
                reference (str): the portion of the reference string that creates the best match
                query (str): the sequence that was queried
                position (int): the starting position in the reference sequence for the best scoring
                score (int | float): the best score obtained
    """
    alignment = []

    for data in queries:
        pos, score = find_best_alignment(reference, data, scoringFunction=alignmentFunction)
        # Keep the matched slice of the reference next to the query so the
        # two can be displayed side by side.
        alignment.append([reference[pos:pos + len(data)], data, pos, score])

    return alignment

# 6 - PRETTY PRINT 

prints to screen (or to file) the results of the queries

In [22]:
def prettyPrint(results:list[list[str, str, int, int]], sequence:str, outputFilePath:str=None)->None:
    if outputFilePath:
        outputFilePath = open(outputFilePath, 'w', encoding='UTF-8')
    else:
        outputFilePath = sys.stdout
    
    print(f"Reference sequence : {sequence}", file=outputFilePath, end='\n'*2)
        
    for data in results:
        print(f"Portion of the reference sequence : {data[0]}", file=outputFilePath)
        print(f"Sequence queried : {data[1]}", file=outputFilePath)
        print(f"Position for the best alignment in the reference sequence : {data[2]}", file=outputFilePath)
        print(f"best scoring obtained : {data[3]}", file=outputFilePath, end='\n'*3)
        
    
    if outputFilePath != sys.stdout:
        outputFilePath.close()

# TESTING

Test the implementation of each function

In [20]:
# Read and validate the reference once, then reuse it for both the alignment
# and the report header instead of loading the same file twice.
referenceSequence = readSequence(path=REFERENCE_SEQUENCE_INPUT_PATH)
queryData = readQueryData(path=QUERY_DATA_INPUT_PATH)
prettyPrint(align_reads(referenceSequence, queryData), referenceSequence)

Reference sequence : GATCGTGGCTCTAGA

Portion of the reference sequence : GATC
Sequence queried : GATC
Position for the best alignment in the reference sequence : 0
best scoring obtained : 4


Portion of the reference sequence : GGCT
Sequence queried : GGCT
Position for the best alignment in the reference sequence : 6
best scoring obtained : 4


Portion of the reference sequence : CTAG
Sequence queried : CTAG
Position for the best alignment in the reference sequence : 10
best scoring obtained : 4


