### Getting the number of each nucleotide in *Vibrio cholerae*:

In [32]:
def nucleotide_count(seq):
    nuc_counts = {'A':seq.count('A'), 'C':seq.count('C'), 'G':seq.count('G'), 'T':seq.count('T')}
    return nuc_counts


file_path = r'C:\Users\ryanr\OneDrive\Desktop\Coursera\Bioinformatics UCSD\Vibrio_cholerae_genome_seq.txt'

with open(file_path, 'r') as f:
    seq = f.readline()
    
print(nucleotide_count(seq))

{'A': 293942, 'C': 263573, 'G': 256024, 'T': 294711}


### Counting pattern occurrences in a sequence:

### **Answer:**

In [12]:
# Count the occurrences of a given pattern in a sequence
def pattern_count(seq, pattern):
    # Store 1 if pattern is encountered, and sum total
    return sum([1 for nuc in range(len(seq) - len(pattern)) if seq[nuc:nuc + len(pattern)] == pattern])


file = r'C:\Users\ryanr\OneDrive\Desktop\Coursera\Bioinformatics UCSD\dataset_30272_6.txt'
with open(file, 'r') as f:
    # Read the first line and assign it to the variable 'Seq'
    Seq = f.readline().strip()
    # Read the second line and assign it to the variable 'pattern'
    Pattern = f.readline().strip()
    
print("Seq:", Seq)
print("Pattern:", Pattern)
print(f"Occurrences of 'TTATCCATT' in Vibrio cholerae: {pattern_count(Seq, Pattern)}")

Seq: TAGAGATTATCCAATTATCCACCGTTATCCAATTTATCCACATTATCCATTTATCCAGTTATCCACTTATCCAGTTATCCATTGATTATCCATTATCCAGTCTTATCCATTTATCCATTATCCATTATCCATTATCCATTATCCATTATCCATTATCCAATTTATCCATTATCCAGGAACACCGGCAATGCCAGTTATCCAGATTATCCATTATCCATTATCCAACCGATTATCCAGCTTATCCACTGCCGTACATGATTATCCATTATCCAAATACAGTTATCCATAATTATCCAGGGTATTTATCCATTATCCAACTTATCCATTTATCCATTATCCATATTATCCAGGCACGCGTTATCCAGAGTGACTTGTTATCCATTTTATCCACATTATCCACAATAGTATTTATCCATTATCCAAAGCGCGTTATCCAAGTTATCCATTATCCATTATCCACCAACATTATCCAGATTATCCATGTGTTATCCAGTTATCCAGAGTTATCCAACACTTATCCACTTATCCATTTATCCATTATCCACCCATTATCCATTATCCACTGTCCTTATCCATGTTATCCACATTATCCATTTATCCATTATCCAGCTTTATCCACATCTTTATCCATTATCCAGACCATTATCCATGCTTATCCAAGGATCTTATCCACGAAGTTATCCATATACATTCAACGTTATCCACATAATTTATCCATTATCCATTATCCAACTTATCCAGTTATCCACTTATCCAGGCTTATCCAACCTTTATCCATTATCCAAGTCTTATCCATACTTATCCATTATCCACTTATCCATATTATCCACCCGTTATCCACGGTTATCCAGTTATCCACTTATCCAATTATCCATTATCCATTTATCCATTCTTTTATCCACTTTATCCACATGAGTATTTTATCCACGTTATCCAAGTTATCCATTTTATCCATTATCCATTATCCATTATCCACTCTTATCCACCGATTATCC

### Counting the most frequent pattern occurrences (k-mers of length 2, 3, etc.) in a sequence:

### **Answer:**

In [21]:
import itertools

# Create a frequency map of k-mer pattern occurrences as a dictionary
def freq_map(seq, k):
    freqMap = {}
    # For the whole sequence - k
    for i in range(len(seq) - int(k) + 1):
        # Scanning window of letters (of length k)
        pattern = seq[i:i + int(k)]
        # For each new k-length segment, it sets its value to 1 in dictionary
        if pattern not in freqMap:
            freqMap[pattern] = 1
        # If k-length segment is already in dictionary, it increments its value (initially 1)
        freqMap[pattern] += 1
    return freqMap

# Retrieve the most frequent patterns in frequency table
def most_freq_patterns(seq, k):
    freqPatterns = []
    # Get freqMap dictionary
    freqMap = freq_map(seq, int(k))
    # For each pattern (key), count (value) item in dictionary
    for pattern, count in freqMap.items():
        # Find patterns with max count values
        if count == max(freqMap.values()):
            # Append freqPatterns list with those patterns
            freqPatterns.append(pattern)
    return freqPatterns


file_path = r'C:\Users\ryanr\OneDrive\Desktop\Coursera\Bioinformatics UCSD\dataset_30272_13.txt'
with open(file_path, 'r') as file:
    Seq = file.readline().strip()
    K = file.readline().strip()

# Get the most frequent patterns
freqPatterns = most_freq_patterns(Seq, K)
print("Most Frequent Patterns:", freqPatterns)

# Get the frequency map
freqMap = freq_map(Seq, K)

print(f"Frequency Map: ")
[print(f"{key}:{value}") for key, value in list(freqMap.items())[:10]]
print(".\n.\n.")

Most Frequent Patterns: ['TACGTTTTGGATAG', 'ACGTTTTGGATAGT']
Frequency Map: 
ATTTGGCATTTGGC:8
TTTGGCATTTGGCA:4
TTGGCATTTGGCAT:4
TGGCATTTGGCATT:4
GGCATTTGGCATTT:4
GCATTTGGCATTTG:4
CATTTGGCATTTGG:4
TTTGGCATTTGGCT:3
TTGGCATTTGGCTA:2
TGGCATTTGGCTAC:2
.
.
.


### Finding the compliment of a sequence and reversing it:

In [14]:
# Obtaining the reverse compliment of a DNA sequence
def reverse_compliment(seq):
    # Assign complimentary letters
    comp_dict = {'A':'T', 'T':'A', 'C':'G', 'G':'C'}
    # Replace each nucleotide with its complementary letter
    comp_seq = ''.join(comp_dict[n] for n in seq)
    # Reverse the entire string
    rev_comp_seq = comp_seq[::-1]
    return rev_comp_seq



file_path = r'C:\Users\ryanr\OneDrive\Desktop\Coursera\Bioinformatics UCSD\dataset_30273_2.txt'
with open(file_path, 'r') as file:
    Seq = file.readline().strip()

rev_comp_seq = reverse_compliment(Seq)
print("Original Sequence: ", Seq[:10], "...", Seq[-10:], sep='')
print("\nReversed Complementary Sequence: ", rev_comp_seq[:10], "...", rev_comp_seq[-10:], sep='')

Original Sequence: ACCTCAGGTT...GATAACCTTC

Reversed Complementary Sequence: GAAGGTTATC...AACCTGAGGT


### Finding all locations/occurrences of a pattern within a sequence:

In [15]:
# Find locations of a k-mer pattern within a DNA sequence
def find_pattern_indices(seq, pattern):
    occurrences = []
    k = len(pattern)
    # For whole sequence - k
    for i in range(len(seq) - k + 1):
        # Checking for our pattern using a scanning window of size k 
        if seq[i:i + k] == pattern:
            # If our pattern is found, append the index to occurences list
            occurrences.append(i)
    occurrences_nocommas = ' '.join(map(str, occurrences))
    return occurrences_nocommas



file_path = r'C:\Users\ryanr\OneDrive\Desktop\Coursera\Bioinformatics UCSD\dataset_30273_5.txt'
with open(file_path, 'r') as file:
    Pattern = file.readline().strip()
    Seq = file.readline().strip()

result = find_pattern_indices(Seq, Pattern)
print(f"The pattern '{Pattern}' occurs at indices:\n{result}")

The pattern 'ATTATCAAT' occurs at indices:
11 141 148 155 172 278 285 354 428 468 566 573 593 765 772 779 798 805 853 882 889 904 973 980 987 994 1074 1081 1088 1138 1171 1201 1208 1261 1347 1362 1386 1393 1421 1465 1481 1507 1607 1737 1744 1759 1766 1814 1824 1881 1964 1982 2017 2040 2142 2186 2193 2223 2230 2257 2285 2301 2325 2332 2359 2422 2429 2476 2483 2499 2537 2615 2634 2643 2650 2657 2664 2717 2724 2740 2747 2757 2788 2833 2881 2926 3008 3053 3060 3091 3106 3135 3151 3195 3233 3240 3277 3323 3330 3345 3352 3418 3456 3463 3577 3584 3591 3598 3660 3676 3731 3738 3798 3851 3858 3865 3983 4018 4198 4205 4212 4219 4259 4274 4322 4354 4382 4463 4487 4541 4556 4563 4659 4696 4738 4807 4833 4840 4857 4864 4919 4954 4961 4986 4993 5076 5091 5116 5136 5233 5302 5309 5364 5374 5405 5412 5428 5498 5518 5525 5567 5585 5602 5649 5716 5827 5950 6053 6117 6124 6133 6140 6147 6218 6271 6288 6295 6323 6330 6395 6462 6560 6567 6602 6627 6642 6659 6738 6745 6756 6786 6801 6820 6843 6850 6882 6982

### Finding where 'CTTGATCAT' occurs in the *Vibrio cholerae* genome:

In [16]:
file_path = r'C:\Users\ryanr\OneDrive\Desktop\Coursera\Bioinformatics UCSD\Vibrio_cholerae.txt'
with open(file_path, 'r') as genome:
    Seq = genome.readline().strip()
    
Pattern = 'CTTGATCAT'

result = find_pattern_indices(Seq, Pattern)
print(f"The pattern '{Pattern}' occurs at indices:\n{result}")

The pattern 'CTTGATCAT' occurs at indices:
60039 98409 129189 152283 152354 152411 163207 197028 200160 357976 376771 392723 532935 600085 622755 1065555


### Find a k-mer pattern forming a 'clump' in a sequence:
 - ### *Clump(L, t)* denotes a pattern being found *t* times within *L* nucleotides
 - ### e.g. (50,4)-clump: find all k-mers occuring *t* = 4 times within an *L* = 50 nucleotide section

In [17]:
# Find patterns that form (L,t)-clumps i.e., occurring t times within an L-nucleotide stretch
def find_clumps(seq, k, L, t):
    patterns = []
    for i in range(len(seq) - int(L) + 1):
        # Scanning window of length L
        window = seq[i:i + int(L)]
        # Get freqTable dictionaries of k-mers within L-length windows of sequence
        freqMap = freq_table(window, k)
        # For each item in dictionary
        for j in freqMap:
            # If value of key (counts) is t or higher, add to Patterns list
            if freqMap[j] >= int(t) and j not in patterns:
                patterns.append(j)
    patterns_nocommas = ' '.join(map(str, patterns))
    return patterns_nocommas



file_path = r'C:\Users\ryanr\OneDrive\Desktop\Coursera\Bioinformatics UCSD\dataset_30274_5.txt'
with open(file_path, 'r') as file:
    Seq = file.readline().strip()
    k, L, t = file.readline().strip().split()

result = find_clumps(Seq, k, L, t)
print(f"{k}-mers occurring at least {t} times in {L}-long segments: {result}")

10-mers occurring at least 4 times in 100-long segments: GAACCTGGGT AACCTGGGTT CCCCATTGCG CCCCCATTGC CTTGTTGGTG CGCGAGCGGT GCGAGCGGTG TACCTAAAGT TCCAGATACC GTCCAGATAC TGTTACTGCA ACACCCAAAC GCACGGGAGG CACGGGAGGC


### How many different 9-mers form (500,3)-clumps in the E. coli genome?

In [30]:
from collections import defaultdict

# Faster/more efficient code for longer genomes
def find_clumps_V2(seq, k, L, t):
    patterns = set()
    # Make one freqMap dictionary, default key values = 0
    freqMap = defaultdict(int)
    # Add each pattern to freqMap for first window
    for nuc in range(L):
        pattern = seq[nuc:nuc + k]
        freqMap[pattern] += 1
        if freqMap[pattern] == t:
            patterns.add(pattern)
    # Slide the window and update the frequency table incrementally
    for nuc in range(1, len(seq) - L + 1):
        old_pattern = seq[nuc - 1:nuc - 1 + k]
        # Decrement the k-mer that exited to avoid double counts in the current window
        freqMap[old_pattern] -= 1
        new_pattern = seq[nuc + L - k:nuc + L]
        # Increment the new k-mer that entered window
        freqMap[new_pattern] += 1
        if freqMap[new_pattern] == t:
            patterns.add(new_pattern)
    return list(patterns)



file_path = r'C:\Users\ryanr\OneDrive\Desktop\Coursera\Bioinformatics UCSD\E_coli.txt'
with open(file_path, 'r') as file:
    Seq = file.readline().strip()
    
k = 9
L = 500
t = 3

result = find_clumps_V2(Seq, k, L, t)
print(f"Total number of {k}-mers that form (500, 3)-Clumps: {len(result)}")
print(f"{k}-mers occurring in (500, 3)-Clumps in E-coli:\n{result[:10]}...")

Total number of 9-mers that form (500, 3)-Clumps: 1904
9-mers occurring in (500, 3)-Clumps in E-coli:
['CCGGATGCG', 'CCCTGCGGG', 'GCCTGCGGC', 'CAGCAGGCC', 'GTCGGGGCT', 'CGTCGAAGC', 'TCGGATAAG', 'GATGGTGGT', 'AGCCGCTAT', 'GCCGACCAG']...


## Using Object Oriented Programming to Create a DNASequence Class 

In [31]:
from DNASequenceClass import DNASequence 
import random_sequence

s1 = random_sequence.random_sequence(100)
s2 = random_sequence.random_sequence(500)

seq1 = DNASequence(s1)
seq2 = DNASequence(s2)

print(f"Sequence: {s1}")
print(f"Length: {len(seq1)}")
print(f"Nucleotide counts: {seq1.nuc_count()}")
print(f"Reverse compliment sequence: {seq1.rev_comp()}")
print(f"\nSequence: {s2}")
print(f"Length: {len(seq2)}")
print(f"Nucleotide counts: {seq2.nuc_count()}")
print(f"Reverse compliment sequence: {seq2.rev_comp()}")

Sequence: AACATCCATGACAGCCTTCGGTTAGTGGTTAAGGTACCAGTCCGCATAAGCCGTAGAAGATAGCCAAAGTGGCCTCGCCGGAGTTCGTTCCGACGTGTTT
Length: 100
Nucleotide counts: {'A': 24, 'C': 25, 'G': 27, 'T': 24}
Reverse compliment sequence: AAACACGTCGGAACGAACTCCGGCGAGGCCACTTTGGCTATCTTCTACGGCTTATGCGGACTGGTACCTTAACCACTAACCGAAGGCTGTCATGGATGTT

Sequence: TCTAAGAGTAATAACAGCGGCCGACCAATACAAGCCCACTTCACACTACGGCCGTTGCCTCGTCATGGAAATGTCCTGCCCGCCTTCCCACGCCCGGGGAATTCTAGTATAAAAGCGTCCCATAAGTCAGTTACTAGAAAAGTTATGGGTAACTGCACGAAATAATATGCTGCGCGTGAAACATTCTATAGCGCCATTTCAAGGAGGTTTAGACAAAGCGACGAAACCAGCCTTTATCCTGGAGGCGCTAATTAGTTCCTTGCCGGAAAGAGTCGCCTGAGTCCATTGATAGCGCGGCACGAAGATGGTAGCCCGCTTCTGAATATATGTGACTACCATCTGCTGAATACAAGCTGCTGCAGACGTTCCCCACGGCTTGATCCGTAGGAGTTTCTAACAATGTTACCGTAGGAAGCAACTCCTTATCGCTGTTCGTATCTCATTGTGTGTCCTTTGGTGCAGCGCGAGAGTACGCCAAGCGCTGATCCACTCGGTTTCAA
Length: 500
Nucleotide counts: {'A': 130, 'C': 130, 'G': 117, 'T': 123}
Reverse compliment sequence: TTGAAACCGAGTGGATCAGCGCTTGGCGTACTCTCGCGCTGCACCAAAGGACACACAATGAGATACGAACAGCGATAA