In [1]:
# import functions and libraries
import statistics as stats
from functions import *

# Data

## Peaks1 : Fasta File

In [2]:
# Read the peak sequences from the FASTA file. readFasta is not defined in this
# notebook; presumably it comes from the `functions` star import — confirm.
peaks = readFasta("Sequence_by_Peaks_1.fasta")
# Upper-case every sequence so that all of them use a single letter case.
peaks = [seq.upper() for seq in peaks]

Information about Peaks 1

In [3]:
# Quick summary of the data set: how many sequences there are and how long they are.
print(f'number of sequences : {len(peaks)}')
seq_len = [len(seq) for seq in peaks]  # length of every sequence, in characters
# The mean is rounded to 3 decimals for display only.
print(f'average length of sequences : {round(stats.mean(seq_len),3)} nt')
print(f'min length : {min(seq_len)} nt')
print(f'max length : {max(seq_len)} nt')
print(f'\ndata structure of peaks : {type(peaks)}')

number of sequences : 115
average length of sequences : 277.087 nt
min length : 151 nt
max length : 2051 nt

data structure of peaks : <class 'list'>


## Binding Site of the Transcription Factor `AFT1`

In [4]:
# Reference motif: used below to check the filter thresholds, to evaluate the
# candidates (eval/compare) and as the query of the suffix-tree search.
aft1 = "GGGTGCA"

### Checking the thresholds

To make sure that the filters do not accidentally eliminate the `aft1` site itself, we first check that it passes each filter with the threshold values we intend to use.

In [5]:
removeLowComplexeHomo({aft1:1},5)

['GGGTGCA']

In [6]:
removeLowComplexeHetero({aft1:1},2)

['GGGTGCA']

In [7]:
removeTARich({aft1:1},0.3)

['GGGTGCA']

# Preparing the candidate kmers

In [8]:
# Parameters shared by the k-mer search and the three filters below.
k = 7    # motif size passed to searchMotifs; same length as aft1 ("GGGTGCA")
m = 5    # threshold passed to removeLowComplexeHomo
n = 2    # threshold passed to removeLowComplexeHetero
p = 0.3  # threshold passed to removeTARich (presumably a T/A fraction — TODO confirm)

Search for all motifs of size k across all the sequences, together with their frequencies.

In [9]:
# Collect the k-mers of size k found in the peak sequences. The result is a dict
# (see the printed type), presumably mapping k-mer -> frequency — TODO confirm.
peaks_kmers = searchMotifs(k,peaks)

print(f'data structure of peaks_kmers : {type(peaks_kmers)}')
print(f'number of kmers : {len(peaks_kmers)}')

data structure of peaks_kmers : <class 'dict'>
number of kmers : 11724


Not all of these k-mers are informative, so we filter the candidates with several successive conditions.

In [10]:
# First filter: removeLowComplexeHomo with threshold m. The input is the dict of
# k-mers; the result is a list (see the printed type).
rem_homo = removeLowComplexeHomo(peaks_kmers,m)

# NOTE(review): the label below says "peaks_kmers", but the value printed is the
# type of rem_homo.
print(f'data structure of peaks_kmers : {type(rem_homo)}')
print(f'number of kmers : {len(rem_homo)}')
# Number of k-mers removed by this filter.
print(f'number of kmers reduced : {len(peaks_kmers)-len(rem_homo)}')

data structure of peaks_kmers : <class 'list'>
number of kmers : 11028
number of kmers reduced : 696


In [11]:
# Second filter: removeLowComplexeHetero with threshold n, applied to the output
# of the previous filter.
rem_het = removeLowComplexeHetero(rem_homo,n)

# NOTE(review): the label below says "peaks_kmers", but the value printed is the
# type of rem_het.
print(f'data structure of peaks_kmers : {type(rem_het)}')
print(f'number of kmers : {len(rem_het)}')
# Number of k-mers removed by this filter.
print(f'number of kmers reduced : {len(rem_homo)-len(rem_het)}')

data structure of peaks_kmers : <class 'list'>
number of kmers : 6860
number of kmers reduced : 4168


In [12]:
# Third filter: removeTARich with threshold p, applied to the output of the
# previous filter.
rem_ta = removeTARich(rem_het,p)

# NOTE(review): the label below says "peaks_kmers", but the value printed is the
# type of rem_ta.
print(f'data structure of peaks_kmers : {type(rem_ta)}')
print(f'number of kmers : {len(rem_ta)}')
# Number of k-mers removed by this filter.
print(f'number of kmers reduced : {len(rem_het)-len(rem_ta)}')

data structure of peaks_kmers : <class 'list'>
number of kmers : 506
number of kmers reduced : 6354


In [13]:
kmers = rem_ta

# Evaluating the resulting candidate kmers

In [14]:
def compare(cand, ref, min_overlap=3):
    """Find the suffix of `ref` that best matches the beginning of `cand`.

    The reference is shifted to the left one character at a time: at offset
    ``i`` the suffix ``ref[i:]`` is laid over the first characters of `cand`
    and the number of mismatching positions is counted. Suffixes shorter than
    `min_overlap` characters are not tried.

    Only this one alignment direction, on the given strand, is examined; the
    reverse complement of `ref` is not considered.

    Parameters
    ----------
    cand : str
        Candidate k-mer.
    ref : str
        Reference motif.
    min_overlap : int, optional
        Length of the shortest reference suffix that is still compared.
        The default (3) reproduces the original hard-coded ``len(ref) - 2``
        iterations.

    Returns
    -------
    tuple
        ``(suffix, (distance, offset))`` for the alignment with the fewest
        mismatches; ties are resolved in favour of the smallest offset.

    Raises
    ------
    ValueError
        If `min_overlap` is smaller than 1 or `ref` is shorter than
        `min_overlap` (no alignment can be evaluated).
    """
    if min_overlap < 1:
        raise ValueError("min_overlap must be at least 1")
    if len(ref) < min_overlap:
        raise ValueError(
            f"reference must contain at least {min_overlap} characters, got {len(ref)}"
        )

    def score(offset):
        suffix = ref[offset:]
        # zip stops at the shorter string, so a candidate shorter than the
        # suffix no longer raises IndexError ...
        mismatches = sum(a != b for a, b in zip(suffix, cand))
        # ... and every suffix position without a counterpart in the candidate
        # is counted as a mismatch, so short candidates are not favoured.
        mismatches += max(0, len(suffix) - len(cand))
        return suffix, (mismatches, offset)

    alignments = (score(offset) for offset in range(len(ref) - min_overlap + 1))
    # Offsets are unique, so ordering by (distance, offset) has no ties.
    return min(alignments, key=lambda item: item[1])

In [15]:
def eval(kmers, ref):
    """Run `compare` on every candidate in `kmers` against `ref`.

    NOTE: this function shadows Python's built-in ``eval`` for the rest of the
    notebook; the name is kept because later cells call it.

    Parameters
    ----------
    kmers : iterable of str
        Candidate k-mers. When a dict is given, its keys are used.
    ref : str
        Reference motif.

    Returns
    -------
    dict
        Maps each candidate to the value returned by ``compare(cand, ref)``.
    """
    results = {}
    for candidate in kmers:
        results[candidate] = compare(candidate, ref)
    return results

# Calculations

## HashTable

In [30]:
# Score the filtered candidate k-mers against the peaks with hashTable
# (presumably occurrence counts — TODO confirm against functions.py).
ht = hashTable(peaks,kmers)
# Keep 5 entries. The third argument is True here and False for the median-string
# scores; presumably it selects descending order — TODO confirm.
ht_top = getTopMotifs(ht,5,True)
print("Top 5 candidates found by HashTable")
print(ht_top)

Top 5 candidates found by HashTable
{'TGCACCC': 21, 'GGGTGCA': 17, 'CTGCACC': 10, 'GTGCACC': 7, 'GCACCCT': 6}


In [17]:
eval(ht_top,aft1)

{'TGCACCC': ('TGCA', (0, 3)),
 'GGGTGCA': ('GGGTGCA', (0, 0)),
 'CTGCACC': ('GTGCA', (1, 2)),
 'GTGCACC': ('GTGCA', (0, 2)),
 'GCACCCT': ('GCA', (0, 4))}

## Median String

In [18]:
ms = medianStringSearch(peaks,kmers)

In [31]:
# Keep 5 entries of the median-string result. The third argument is False here,
# whereas it is True for the other two methods; the printed values increase from
# 191 to 203, so presumably False means "smallest first" — TODO confirm.
ms_top = getTopMotifs(ms,5,False)
print("Top 5 candidates found by Median String")
print(ms_top)

Top 5 candidates found by Median String
{'TGCACCC': 191, 'GGGTGCA': 195, 'GGTGCAC': 199, 'GGTGCAG': 200, 'CTGCACC': 203}


In [20]:
eval(ms_top,aft1)

{'TGCACCC': ('TGCA', (0, 3)),
 'GGGTGCA': ('GGGTGCA', (0, 0)),
 'GGTGCAC': ('GGTGCA', (0, 1)),
 'GGTGCAG': ('GGTGCA', (0, 1)),
 'CTGCACC': ('GTGCA', (1, 2))}

## Suffix Trees

In [32]:
# Join all peaks into one string, separated by single spaces, and build the tree on it.
peaks_concat = " ".join(peaks)
stree = constructTree(peaks_concat)
# Search for aft1 only; the last argument is 1 (presumably the maximum number of
# mismatches allowed — TODO confirm). Two results are returned; the second one is
# indexed by the motif ("GGGTGCA") in a later cell.
motif_occur_var, motif_occur_var_seqs = inexactMatch([aft1],peaks_concat,stree,1)
st_top = getTopMotifs(motif_occur_var,5,True)
print("Top 5 candidates found by Suffix Trees")
print(st_top)

Top 5 candidates found by Suffix Trees
{'GGGTGCA': 17, 'GGGTGTA': 9, 'GGGTGAA': 5, 'GGGTGCC': 4, 'GGGTGCT': 3}


In [26]:
eval(st_top,aft1)

{'GGGTGCA': ('GGGTGCA', (0, 0)),
 'GGGTGTA': ('GGGTGCA', (1, 0)),
 'GGGTGAA': ('GGGTGCA', (1, 0)),
 'GGGTGCC': ('GGGTGCA', (1, 0)),
 'GGGTGCT': ('GGGTGCA', (1, 0))}

In [28]:
# For every top suffix-tree motif other than the reference itself, report whether
# it appears under the "GGGTGCA" entry of motif_occur_var_seqs.
print("Is it a variation of GGGTGCA?")
for x in st_top :
    if x != "GGGTGCA" : print(f'{x} : {x in motif_occur_var_seqs["GGGTGCA"]}')

Is it a variation of GGGTGCA?
GGGTGTA : True
GGGTGAA : True
GGGTGCC : True
GGGTGCT : True


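# Conclusion

For each method, the tables below list the top 5 candidates together with the result of `compare`: the suffix of `GGGTGCA` that best aligns with the start of the candidate (match), the number of mismatches in that alignment (distance) and the offset of that suffix within `GGGTGCA` (displacement). Note that `TGCACCC` is the exact reverse complement of `GGGTGCA`; because `compare` only looks at one strand, it is reported as a partial match (`TGCA`, displacement 3).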

## HashTable
| motif | match | distance | displacement |
| :-: | :-: | :-: | :-: |
| TGCACCC | TGCA | 0 | 3 |
| GGGTGCA | GGGTGCA | 0 | 0 |
| CTGCACC | GTGCA | 1 | 2 |
| GTGCACC | GTGCA | 0 | 2 |
| GCACCCT | GCA | 0 | 4 |

## Median String
| motif | match | distance | displacement |
| :-: | :-: | :-: | :-: |
| TGCACCC | TGCA | 0 | 3 |
| GGGTGCA | GGGTGCA | 0 | 0 |
| GGTGCAC | GGTGCA | 0 | 1 |
| GGTGCAG | GGTGCA | 0 | 1 |
| CTGCACC | GTGCA | 1 | 2 |

## Suffix Trees
| motif | match | distance | displacement | variation |
| :-: | :-: | :-: | :-: | :-: |
| GGGTGCA | GGGTGCA | 0 | 0 | original |
| GGGTGTA | GGGTGCA | 1 | 0 | 1 |
| GGGTGAA | GGGTGCA | 1 | 0 | 1 |
| GGGTGCC | GGGTGCA | 1 | 0 | 1 |
| GGGTGCT | GGGTGCA | 1 | 0 | 1 |