# Pattern Counting
`pattern count` 
Input: Strings Text and Pattern.
Output: Count(Text, Pattern).

In [19]:
def pattern_count(text: str, pattern: str) -> int:
    """Count the occurrences of ``pattern`` in ``text``, overlaps included.

    Every start index at which a full-length window still fits in ``text``
    is checked, so overlapping occurrences are each counted once.

    Args:
        text: String to search in.
        pattern: Substring to look for.

    Returns:
        Number of start positions at which ``pattern`` occurs.
    """
    window_starts = range(len(text) - len(pattern) + 1)
    return sum(1 for start in window_starts if text.startswith(pattern, start))

In [49]:
def scattered_pattern_count(text: str, pattern: str, overlap: int) -> int:
    """Count occurrences of ``pattern`` whose mutual overlap is limited.

    The text is scanned left to right. After a match the scan resumes
    ``len(pattern) - overlap`` positions later, so two counted matches share
    at most ``overlap`` characters. After a miss the scan advances by one.

    Args:
        text: String to search in.
        pattern: Substring to look for.
        overlap: Maximum number of characters two counted matches may share.

    Returns:
        The number of counted matches, or 0 whenever
        ``len(pattern) <= overlap`` (the scan could not advance after a match).
    """
    width = len(pattern)
    if width <= overlap:
        # A match would not move the scan forward; the result is 0 in this case.
        return 0

    stride = width - overlap
    last_start = len(text) - width
    hits = 0
    pos = 0
    while pos <= last_start:
        if text.startswith(pattern, pos):
            hits += 1
            pos += stride
        else:
            pos += 1
    return hits

In [None]:
# Scratch tuple of (text, pattern, overlap) values; nothing is called here.
# NOTE(review): presumably intended as arguments for scattered_pattern_count - confirm.
'GATATA','ATA',3

In [50]:
# overlap (6) equals len('ATATAT'), so scattered_pattern_count returns 0.
scattered_pattern_count('GATATATATCATATATATATATG','ATATAT',6)

0

### Frequent words problem
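The most frequent k-mer in a text is the pattern with the highest `pattern_count(text, pattern)`.
Naive approach: call `pattern_count` for every k-mer of the text and store the counts in an array — O(|Text|^2 * k).
Frequency-table approach: build a hash map from each k-mer to its count.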

FrequencyTable(Text, k)
    freqMap ← empty map
    n ← |Text|
    for i ← 0 to n − k
        Pattern ← Text(i, k)
        if freqMap[Pattern] doesn't exist
            freqMap[Pattern]← 1
        else
            freqMap[Pattern] ← freqMap[Pattern] + 1
    return freqMap

In [6]:
def frequency_table(text: str, k: int) -> dict[str, int]:
    """Map every k-mer of ``text`` to the number of times it occurs.

    Args:
        text: String to scan.
        k: Length of the substrings to count.

    Returns:
        Dictionary keyed by k-mer, in order of first appearance, with the
        occurrence count as value. Empty when ``k`` exceeds ``len(text)``.
    """
    counts: dict[str, int] = {}
    for start in range(len(text) - k + 1):
        kmer = text[start:start + k]
        counts[kmer] = counts.get(kmer, 0) + 1
    return counts

def MaxMap(freqMap: dict[str,int]) -> int:
    """Return the largest value stored in ``freqMap``.

    Raises:
        ValueError: If ``freqMap`` is empty (propagated from ``max``).
    """
    return max(freqMap.values())

def frequent_words(text: str, k: int) -> list[str]:
    """Find the most frequent k-mers in a given text.

    Args:
        text: String to scan.
        k: Length of the substrings to consider.

    Returns:
        Every k-mer whose count equals the highest count, in order of first
        appearance. An empty list when ``text`` contains no k-mer at all
        (for example when ``k`` is larger than ``len(text)``).
    """
    freqMap = frequency_table(text, k)
    if not freqMap:
        # Nothing was counted; asking MaxMap for the maximum of an empty
        # map would raise ValueError.
        return []
    max_count = MaxMap(freqMap)
    return [kmer for kmer, count in freqMap.items() if count == max_count]
     

Reverse Complement of string

In [20]:
def reverse_complement(pattern: str) -> str:
    """Return the reverse complement of ``pattern``.

    Each character is complemented with ``reverse`` and the characters are
    emitted in reverse order, e.g. ``'ATG'`` becomes ``'CAT'``.

    Args:
        pattern: Nucleotide string.

    Returns:
        The complemented string, read back to front.
    """
    # Build the result with a single join rather than prepending one
    # character at a time, which copies the growing string on every step.
    return "".join(reverse(base) for base in reversed(pattern))
def reverse(char: str) -> str:
    """Return the complementary nucleotide of ``char``.

    Despite its name this helper complements a single base
    (A <-> T, C <-> G); it does not reverse anything.

    Args:
        char: One of ``'A'``, ``'C'``, ``'G'``, ``'T'``.

    Returns:
        The complementary base.

    Raises:
        ValueError: If ``char`` is not an upper-case A, C, G or T. Such
            input used to be silently mapped to ``'G'``.
    """
    complements = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    try:
        return complements[char]
    except KeyError:
        raise ValueError(f"not a DNA nucleotide: {char!r}") from None

In [2]:
# 'ATG' read backwards is 'GTA'; complementing each base gives 'CAT'.
reverse_complement('ATG')

'CAT'

### Pattern Matching
Find all occurrences of a pattern in a genome.

In [None]:
def pattern_matching(pattern: str, genome: str) -> list[int]: #1.3
    """Find all occurrences of a pattern in a genome.

    Args:
        pattern: Substring to look for.
        genome: String to search in.

    Returns:
        The 0-based start indices, in increasing order, at which ``pattern``
        occurs in ``genome``. Overlapping occurrences are all reported.
    """
    last_start = len(genome) - len(pattern)
    return [
        start
        for start in range(last_start + 1)
        if genome.startswith(pattern, start)
    ]

### Clump Finding Problem
We defined a k-mer as a "clump" if it appears many times within a short interval of the genome. More formally, given integers L and t, a k-mer Pattern forms an (L, t)-clump inside a (longer) string Genome if there is an interval of Genome of length L in which this k-mer appears at least t times. 

FindClumps(Text, k, L, t)
    Patterns ← an array of strings of length 0
    n ← |Text|
    for every integer i between 0 and n − L
        Window ← Text(i, L)
        freqMap ← FrequencyTable(Window, k)
        for every key s in freqMap
            if freqMap[s] ≥ t
                append s to Patterns
    remove duplicates from Patterns
    return Patterns

In [11]:
def find_clumps(text: str, k: int, L: int, t: int) -> list[str]: #1.4
    """Find every k-mer that forms an (L, t)-clump in ``text``.

    A k-mer forms an (L, t)-clump if some window of length ``L`` of ``text``
    contains it at least ``t`` times.

    The k-mer counts of the first window are computed once. Each later
    window is obtained by sliding one position to the right, which removes
    one k-mer on the left and adds one on the right. Only the added k-mer
    can newly reach the threshold, so only it is re-checked.

    Args:
        text: Genome string to scan.
        k: Length of the k-mers.
        L: Length of the window.
        t: Minimum number of occurrences inside one window.

    Returns:
        The distinct clump-forming k-mers in the order in which they first
        qualify while the window moves left to right. An empty list when no
        window exists or no window can hold a k-mer (``len(text) < L``,
        ``L < k``) or when ``k`` is negative.
    """
    n = len(text)
    if k < 0 or L < k or n < L:
        return []

    counts: dict[str, int] = {}
    # dict used as an insertion-ordered set: O(1) membership, stable order.
    clumps: dict[str, None] = {}

    # Count the k-mers of the first window.
    for start in range(L - k + 1):
        kmer = text[start:start + k]
        counts[kmer] = counts.get(kmer, 0) + 1
    for kmer, occurrences in counts.items():
        if occurrences >= t:
            clumps[kmer] = None

    # Slide the window: drop the k-mer leaving on the left, add the one
    # entering on the right.
    for window_start in range(1, n - L + 1):
        outgoing = text[window_start - 1:window_start - 1 + k]
        counts[outgoing] -= 1
        incoming = text[window_start + L - k:window_start + L]
        counts[incoming] = counts.get(incoming, 0) + 1
        if counts[incoming] >= t:
            clumps[incoming] = None

    return list(clumps)

In [12]:
# 5-mers that occur at least 4 times inside some window of length 50.
find_clumps('CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA',5,50,4)

['CGACA', 'GAAGA']

### DNA Replication
Leading strand: no stopping 5'to3' (template is 3' to 5') (reverse half strand - high C%)\
Lagging strand: stopping (template is 5' to 3') (forward half strand- high G%)\
\
Difference in nucleotides because of deamination
Lagging strand is more likely to have mutations (C->T) > when making 3' to 5' new strand more As decreasing normal amount of Gs > if less Gs new 5' to 3' strand has lesser Cs > G-C decreases\
\
Thus, the ori is located where the skew (#G - #C) reaches its minimum value.

In [17]:
def minimum_skew(genome: str) -> list[int]:
    """Return the positions at which the G-C skew of ``genome`` is minimal.

    The skew starts at 0 for the empty prefix and changes by +1 for every
    ``"G"`` and by -1 for every ``"C"``; other characters leave it unchanged.
    Position ``p`` refers to the skew after the first ``p`` characters, so
    position 0 (the empty prefix) is a candidate as well.

    Args:
        genome: Nucleotide string to scan.

    Returns:
        All positions, in increasing order, whose skew equals the minimum.
    """
    # Only the running value is needed; the full skew history is not kept.
    skew = 0
    min_skew_value = 0
    min_pos = [0]

    for position, nucleotide in enumerate(genome, start=1):
        if nucleotide == "C":
            skew -= 1
        elif nucleotide == "G":
            skew += 1

        if skew < min_skew_value:
            # New minimum: forget the positions recorded so far.
            min_skew_value = skew
            min_pos = [position]
        elif skew == min_skew_value:
            min_pos.append(position)

    return min_pos

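Example: 'TAACG'
skew after each nucleotide: [0, 0, 0, -1, 0]
minimum_skew('TAACG') returns [4] (the minimum is reached after the 4th nucleotide, i.e. 0-based index 3)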

In [18]:
# Positions (number of nucleotides read so far) at which the running G-C skew is lowest.
minimum_skew('TAAAGACTGCCGAGAGGCCAACACGAGTGCTAGAACGAGGGGCGTAAACGCGGGTCCGAT')

[11, 24]

## Freq Word with Mismatch (within ham_dist)

Count a k-mer toward a pattern's frequency if its Hamming distance from the pattern is <= d.
### Mismatch (Hamming Distance)

'AAT' and 'CAT' ham_dis=1

In [None]:
def hamming_distance(p: str, q: str) -> int: #1.8.1
    """Return the number of positions at which ``p`` and ``q`` differ.

    Args:
        p: First string.
        q: Second string.

    Returns:
        The mismatch count, or -1 when the strings have different lengths.
        Callers that compare the result with ``<= d`` should keep in mind
        that -1 satisfies that comparison for any non-negative ``d``.
    """
    if len(p) == len(q):
        return sum(1 for a, b in zip(p, q) if a != b)
    return -1

### Approx Pattern Match/Count 
compares kmers to 1 given pattern

In [None]:
def approximate_pattern_matching(pattern: str, text: str, d: int) -> list[int]:
    """Find where ``pattern`` occurs in ``text`` with at most ``d`` mismatches.

    Every window of ``text`` as long as ``pattern`` is compared with
    ``pattern`` using ``hamming_distance``.

    Args:
        pattern: String to look for.
        text: String to search in.
        d: Maximum Hamming distance accepted for a window.

    Returns:
        The 0-based start indices, in increasing order, of the windows whose
        Hamming distance to ``pattern`` is at most ``d``.
    """
    k = len(pattern)
    return [
        start
        for start in range(len(text) - k + 1)
        if hamming_distance(text[start:start + k], pattern) <= d
    ]

In [None]:
def approximate_pattern_count(text: str, pattern: str, d: int) -> int:
    """Count the windows of ``text`` within ``d`` mismatches of ``pattern``.

    Performs the same scan as ``approximate_pattern_matching`` but keeps a
    count instead of the list of start indices.

    Args:
        text: String to search in.
        pattern: String to look for.
        d: Maximum Hamming distance accepted for a window.

    Returns:
        Number of windows of length ``len(pattern)`` whose Hamming distance
        to ``pattern`` is at most ``d``.
    """
    k = len(pattern)
    return sum(
        1
        for start in range(len(text) - k + 1)
        if hamming_distance(text[start:start + k], pattern) <= d
    )

### Freq Table with Mismatch


In [None]:
def neighbors(pattern: str,d: int) -> list[str]:
    """Generate the strings that differ from ``pattern`` in at most ``d`` places.

    Works recursively on the suffix ``pattern[1:]``. A suffix neighbour that
    is still fewer than ``d`` mismatches away from the suffix may take any
    first nucleotide; one that already uses all ``d`` mismatches must keep
    the first character of ``pattern``.

    Args:
        pattern: String over the alphabet A, C, G, T.
        d: Maximum number of mismatches allowed.

    Returns:
        The neighbours of ``pattern`` as a list. For ``d == 0`` or an empty
        ``pattern`` the list holds ``pattern`` itself.
    """
    if d == 0 or not pattern:
        # An empty pattern has no suffix to recurse on; without this guard
        # the call would recurse on "" forever when d > 0.
        return [pattern]
    if len(pattern) == 1:
        return ["A", "C", "G", "T"]

    suffix = pattern[1:]
    neighborhood = []
    for text in neighbors(suffix, d):
        if hamming_distance(suffix, text) < d:
            # Mismatch budget left: the first position is free.
            neighborhood.extend(
                nucleotide + text for nucleotide in ("A", "T", "G", "C")
            )
        else:
            # Budget used up by the suffix: the first position must match.
            neighborhood.append(pattern[0] + text)
    return neighborhood

In [None]:
def frequent_words_with_mismatches(text: str, k: int, d: int) -> list[str]:
    """Find the most frequent k-mers of ``text`` allowing up to ``d`` mismatches.

    Every k-mer of ``text`` adds one to the count of each of its neighbours
    (as produced by ``neighbors``), so a candidate's count is the number of
    k-mers of ``text`` that lie in its neighbourhood.

    Args:
        text: String to scan.
        k: Length of the k-mers.
        d: Maximum number of mismatches allowed.

    Returns:
        Every candidate whose count equals the highest count, in order of
        first appearance. An empty list when ``text`` contains no k-mer at
        all (for example when ``k`` is larger than ``len(text)``).
    """
    freqMap = {}
    for i in range(len(text) - k + 1):  # every k-mer of text
        for neighbor in neighbors(text[i:i + k], d):
            freqMap[neighbor] = freqMap.get(neighbor, 0) + 1

    if not freqMap:
        # Nothing was counted; asking MaxMap for the maximum of an empty
        # map would raise ValueError.
        return []

    m = MaxMap(freqMap)  # highest count reached by any candidate
    return [key for key, count in freqMap.items() if count == m]

## Additional problems

In [None]:
def pattern_count_wo_overlap(text:str,pattern:str)->int:
    """Count the non-overlapping occurrences of ``pattern`` in ``text``.

    Occurrences are taken greedily from left to right: once a match is
    counted, the search resumes right after it.

    Args:
        text: String to search in.
        pattern: Substring to look for.

    Returns:
        Number of non-overlapping occurrences. For an empty ``pattern`` this
        is ``len(text) + 1``; the previous hand-written loop never
        terminated on that input.
    """
    # str.count performs exactly this left-to-right, non-overlapping scan.
    return text.count(pattern)