<a href="https://colab.research.google.com/github/Tycour/crisanti-toolshed/blob/main/docs/lessons/Finding_Motifs_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
'''
Motifs
- A commonly shared interval in DNA
- In bioinformatics terms: searching a string for a given substring.
'''

# A string can be looped over one character at a time, just like a list of characters, so that:
seq_1 = 'GCTA'
for nt in seq_1:
  print(nt)

# prints exactly the same lines as:
seq_2 = ['G','C','T','A']
for nt in seq_2:
  print(nt)

# But Python still treats them as different types (a str and a list), so they do not compare equal.
print('Are both of these the same to you, Oh Mighty Python?', seq_1 == seq_2)

G
C
T
A
G
C
T
A
Are both of these the same to you, Oh Mighty Python? False


In [None]:
# Python uses 0-based indexing, which means that you subtract 1 from our 1-based notion of position:
# e.g. In the sequence 'GATTACA', the 1-based positions of the 'A's are 2, 5, and 7.
# But in Python those indices are 1, 4, and 6 respectively.

# Version 1: keep track of the index by hand with a counter, 'i'.
i = 0
for nt in 'GATTACA':
  if nt == 'A':
    print(i)
  i += 1

# Version 2: enumerate() hands back (index, element) pairs, so no manual counter is needed.
for i, nt in enumerate('GATTACA'):
  if nt == 'A':
    print(i)

# 'x += y' means adding y to x. For numbers and strings it gives the same result as writing 'x = x + y'.

1
4
6
1
4
6


In [None]:
# A bit of revision on extracting elements from a string, which you already know:
seq = 'GATTACA'

# What do these return?
print(seq[0])    # a single index
print(seq[:3])   # a slice with only a stop index
print(seq[3:])   # a slice with only a start index
print(seq[::-1]) # a slice with a step of -1

G
GAT
TACA
ACATTAG


In [None]:
# A substring of a string can thus be represented as string[x:y]
# where x gives the starting index, and y the index of the character following the end of the substring
# i.e. string[x:y] will return a substring from string[x] up to, but not including, string[y]

seq = 'GATTACA'
print(seq[1:4]) # Prints the substring covering indices 1 to 3 (index 4 is not included)
print(seq[1]) # Prints the single character substring of index 1
print(seq[4]) # Prints the single character substring of index 4, which seq[1:4] left out

ATT
A
A


In [None]:
# Given a string, 'seq', and a substring, 'motif':
seq = 'GATATATGCATATACTT'
motif = 'ATAT'

# One way to check for the presence/absence of a substring in a given string:
print(motif in seq) # Returns a boolean value, True in this case
print('A' in 'TCG') # -> False

# When learning, you can always make the code spell everything out for you.
# Each printed line shows: index --> nucleotide at that index --> the slice seq[i:i+4] starting there.
# Slices that would run past the end of seq are simply cut short; they do not raise an error.
for i, nt in enumerate(seq):
  print(i, '-->', nt, '-->', seq[i:i+4])

# Which are the indices representing the starting positions of motif in seq?

# Capturing indices for later use:
indices = []
for i, nt in enumerate(seq):
  if seq[i:i+4] == motif: # compare the slice starting at index i with motif
    indices.append(i) # remember this starting index

print(indices)

True
False
0 --> G --> GATA
1 --> A --> ATAT
2 --> T --> TATA
3 --> A --> ATAT
4 --> T --> TATG
5 --> A --> ATGC
6 --> T --> TGCA
7 --> G --> GCAT
8 --> C --> CATA
9 --> A --> ATAT
10 --> T --> TATA
11 --> A --> ATAC
12 --> T --> TACT
13 --> A --> ACTT
14 --> C --> CTT
15 --> T --> TT
16 --> T --> T
[1, 3, 9]


In [1]:
# The same sequence and motif as in the previous cell.
seq = 'GATATATGCATATACTT'
motif = 'ATAT'

# You could write a function to automate this process further:
def find_motif(seq, motif):
  """Collect the 0-based start index of every 4-character window of seq that equals motif.

  Args:
    seq: The sequence to scan.
    motif: The substring each window is compared against.

  Returns:
    A list of the matching start indices, in ascending order.
  """
  hits = []
  for start in range(len(seq)):
    window = seq[start:start+4] # windows near the end of seq come out shorter than 4
    if window == motif:
      hits.append(start) # .append() adds an element to the end of a list
  return hits

# Try the function on the sequence and motif defined at the top of this cell...
print(find_motif(seq, motif))

# ...and on a different sequence with a different motif.
print(find_motif('GATTACA', 'TT'))

# Can anyone figure out why this doesn't work?
# What quick fix could you make to this code so that it works for any motif?

[1, 3, 9]
[]


In [4]:
# Use 'len(motif)' to allow any motif length to be used

def find_motif(seq, motif):
  """Return the 0-based start indices of every occurrence of motif in seq.

  Overlapping occurrences are all reported, e.g. 'AA' occurs in 'AAAA'
  at indices 0, 1 and 2.

  Args:
    seq: The sequence to search (e.g. a DNA string).
    motif: The subsequence to look for; it may be of any length.

  Returns:
    A list of start indices in ascending order. The list is empty when
    motif is empty, is longer than seq, or does not occur in seq.
  """
  motif_len = len(motif) # worked out once rather than on every pass of the loop
  if motif_len == 0:
    # An empty slice equals an empty motif at every index, which would
    # report every position as a match; treat "no motif" as "no matches".
    return []

  indices = []
  # The last window that can still hold the whole motif starts at
  # len(seq) - motif_len; any later window is too short to match.
  for i in range(len(seq) - motif_len + 1):
    if seq[i:i+motif_len] == motif:
      indices.append(i) # .append() adds an element to a list
  return indices

print(find_motif('GATTACA', 'TT')) # 'TT' starts at index 2 of 'GATTACA'


[2]
