In [181]:
# import functions and libraries
import statistics as stats
from collections import Counter

import numpy as np
import pandas as pd

from functions import *

# Data

## Peaks1 : Fasta File

In [182]:
# Read the peak sequences and normalise every one to upper case in a single pass.
peaks = [record.upper() for record in readFasta("Sequence_by_Peaks_1.fasta")]

Information about Peaks 1

In [183]:
# Report basic facts about the loaded peaks: count, length statistics, container type.
print(f'number of sequences : {len(peaks)}')
seq_len = list(map(len, peaks))  # one length per sequence, in input order
print(f'average length of sequences : {round(stats.mean(seq_len),3)} nt')
print(f'min length : {min(seq_len)} nt')
print(f'max length : {max(seq_len)} nt')
print(f'\ndata structure of peaks : {type(peaks)}')

number of sequences : 115
average length of sequences : 277.087 nt
min length : 151 nt
max length : 2051 nt

data structure of peaks : <class 'list'>


## Binding Site of the Transcription Factor `AFT1`

In [184]:
aft1 = "GGGTGCA"

### Checking the thresholds

To make sure the filters below do not accidentally eliminate `aft1`, we first check that it survives each filter with the threshold values we plan to use.

In [185]:
removeLowComplexeHomo({aft1:1},5)

['GGGTGCA']

In [186]:
removeLowComplexeHetero({aft1:1},2)

['GGGTGCA']

In [187]:
removeTARich({aft1:1},0.3)

['GGGTGCA']

# Preparing the candidate kmers

In [188]:
# Parameters shared by the candidate-preparation cells below.
# The values 5, 2 and 0.3 are the same ones used in the aft1 checks above.
k = 7    # motif size passed to searchMotifs (and later to findMotifData / getMotifs)
m = 5    # threshold passed to removeLowComplexeHomo
n = 2    # threshold passed to removeLowComplexeHetero
p = 0.3  # threshold passed to removeTARich

Search for all motifs of size k in all the sequences, together with their frequencies.

In [189]:
# Collect the motifs of size k found in the peak sequences.
# The captured output shows the result is a dict; presumably motif -> frequency,
# per the description above — confirm against searchMotifs.
peaks_kmers = searchMotifs(k,peaks)

# Report the container type and how many distinct candidates were found.
print(f'data structure of peaks_kmers : {type(peaks_kmers)}')
print(f'number of kmers : {len(peaks_kmers)}')

data structure of peaks_kmers : <class 'dict'>
number of kmers : 11724


Not all of these k-mers are informative, so we filter the candidates using several criteria.

In [190]:
# First filter: apply removeLowComplexeHomo with threshold m to all candidates.
# The captured output shows the result is a list of the surviving k-mers.
rem_homo = removeLowComplexeHomo(peaks_kmers,m)

# The label names the variable actually inspected (rem_homo, not peaks_kmers).
print(f'data structure of rem_homo : {type(rem_homo)}')
print(f'number of kmers : {len(rem_homo)}')
# Candidates removed by this filter step.
print(f'number of kmers reduced : {len(peaks_kmers)-len(rem_homo)}')

data structure of peaks_kmers : <class 'list'>
number of kmers : 11028
number of kmers reduced : 696


In [191]:
# Second filter: apply removeLowComplexeHetero with threshold n to the survivors
# of the previous step.
rem_het = removeLowComplexeHetero(rem_homo,n)

# The label names the variable actually inspected (rem_het, not peaks_kmers).
print(f'data structure of rem_het : {type(rem_het)}')
print(f'number of kmers : {len(rem_het)}')
# Candidates removed by this filter step.
print(f'number of kmers reduced : {len(rem_homo)-len(rem_het)}')

data structure of peaks_kmers : <class 'list'>
number of kmers : 6860
number of kmers reduced : 4168


In [192]:
# Third filter: apply removeTARich with threshold p to the survivors of the
# previous step.
rem_ta = removeTARich(rem_het,p)

# The label names the variable actually inspected (rem_ta, not peaks_kmers).
print(f'data structure of rem_ta : {type(rem_ta)}')
print(f'number of kmers : {len(rem_ta)}')
# Candidates removed by this filter step.
print(f'number of kmers reduced : {len(rem_het)-len(rem_ta)}')

data structure of peaks_kmers : <class 'list'>
number of kmers : 506
number of kmers reduced : 6354


In [193]:
kmers = rem_ta

# Evaluating the resulting candidate kmers

In [194]:
def compare(cand,ref) :
    """Find the suffix of `ref` that best matches the start of `cand`.

    Every suffix ``ref[shift:]`` for ``shift`` in ``0 .. len(ref)-3`` (so each
    suffix has at least 3 characters) is compared position by position with
    the prefix of `cand` of the same length, counting mismatches.

    Returns:
        ``(suffix, (mismatches, shift))`` for the suffix whose
        ``(mismatches, shift)`` pair is smallest.

    Raises:
        IndexError: if `cand` is shorter than a suffix it is compared with.
        ValueError: if `ref` has fewer than 3 characters (nothing to compare).
    """
    scored = {}
    for shift in range(len(ref) - 2):
        suffix = ref[shift:]
        # Index into cand (rather than zip) so a too-short cand still raises.
        mismatches = sum(1 for pos in range(len(suffix)) if suffix[pos] != cand[pos])
        scored[suffix] = (mismatches, shift)  # distance, displacement
    return min(scored.items(), key=lambda entry: entry[1])

In [195]:
def eval(kmers,ref) :
    """Score every candidate in `kmers` against `ref` with `compare`.

    Returns a dict mapping each candidate to the value `compare(cand, ref)`
    returns for it. Note that this name shadows the builtin `eval` for the
    rest of the notebook.
    """
    scores = {}
    for kmer in kmers:
        scores[kmer] = compare(kmer, ref)
    return scores

In [210]:
def convert_df_eval(res) :
    """Turn a dict of `compare` results into a DataFrame.

    `res` maps each motif to ``(matched_substring, (distance, displacement))``.
    The returned frame has one row per motif and the columns
    'motif', 'match', 'distance' and 'displacement', in the dict's order.
    """
    matches, distances, shifts = [], [], []
    for matched, (n_mismatch, shift) in res.values():
        matches.append(matched)
        distances.append(n_mismatch)
        shifts.append(shift)
    columns = {'motif': res.keys(), 'match': matches, 'distance': distances, 'displacement': shifts}
    return pd.DataFrame(columns)


# Calculations

## HashTable

In [197]:
# Score the filtered candidate k-mers against the peaks with hashTable.
ht = hashTable(peaks,kmers)
# Keep 5 entries; with True the captured output is ordered from highest to
# lowest value — presumably a "descending" flag, confirm against getTopMotifs.
ht_top = getTopMotifs(ht,5,True)
print("Top 5 candidates found by HashTable")
print(ht_top)

Top 5 candidates found by HashTable
{'TGCACCC': 21, 'GGGTGCA': 17, 'CTGCACC': 10, 'GTGCACC': 7, 'GCACCCT': 6}


In [211]:
res_ht = convert_df_eval(eval(ht_top,aft1))
display(res_ht)

Unnamed: 0,motif,match,distance,displacement
0,TGCACCC,TGCA,0,3
1,GGGTGCA,GGGTGCA,0,0
2,CTGCACC,GTGCA,1,2
3,GTGCACC,GTGCA,0,2
4,GCACCCT,GCA,0,4


## Median String

In [202]:
# Score the filtered candidate k-mers against the peaks with medianStringSearch.
ms = medianStringSearch(peaks,kmers)
# Keep 5 entries; with False the captured output is ordered from lowest to
# highest value — presumably lower is better here, confirm against getTopMotifs.
ms_top = getTopMotifs(ms,5,False)
print("Top 5 candidates found by Median String")
print(ms_top)

Top 5 candidates found by Median String
{'TGCACCC': 191, 'GGGTGCA': 195, 'GGTGCAC': 199, 'GGTGCAG': 200, 'CTGCACC': 203}


In [212]:
res_ms = convert_df_eval(eval(ms_top,aft1))
display(res_ms)

Unnamed: 0,motif,match,distance,displacement
0,TGCACCC,TGCA,0,3
1,GGGTGCA,GGGTGCA,0,0
2,GGTGCAC,GGTGCA,0,1
3,GGTGCAG,GGTGCA,0,1
4,CTGCACC,GTGCA,1,2


## Suffix Trees

In [204]:
# Join all peaks into one string, separated by single spaces, and build the tree on it.
peaks_concat = " ".join(peaks)
stree = constructTree(peaks_concat)
# Search for aft1 with the final argument set to 1.
# NOTE(review): presumably the maximum number of mismatches (every reported
# motif differs from aft1 by at most one base) — confirm against inexactMatch.
motif_occur_var, motif_occur_var_seqs = inexactMatch([aft1],peaks_concat,stree,1)
# Keep 5 entries, with the same True flag as the HashTable cell.
st_top = getTopMotifs(motif_occur_var,5,True)
print("Top 5 candidates found by Suffix Trees")
print(st_top)

Top 5 candidates found by Suffix Trees
{'GGGTGCA': 17, 'GGGTGTA': 9, 'GGGTGAA': 5, 'GGGTGCC': 4, 'GGGTGCT': 3}


In [213]:
res_st = convert_df_eval(eval(st_top,aft1))
display(res_st)

Unnamed: 0,motif,match,distance,displacement
0,GGGTGCA,GGGTGCA,0,0
1,GGGTGTA,GGGTGCA,1,0
2,GGGTGAA,GGGTGCA,1,0
3,GGGTGCC,GGGTGCA,1,0
4,GGGTGCT,GGGTGCA,1,0


In [23]:
# For each top suffix-tree motif other than the reference itself, report
# whether it is listed under "GGGTGCA" in motif_occur_var_seqs.
print("Is it a variation of GGGTGCA?")
for x in st_top :
    if x == "GGGTGCA" :
        continue
    print(f'{x} : {x in motif_occur_var_seqs["GGGTGCA"]}')

Is it a variation of GGGTGCA?
GGGTGTA : True
GGGTGAA : True
GGGTGCC : True
GGGTGCT : True


# Summary of Results

In [216]:
print("HashTable")
display(res_ht)
print("\nMedian String")
display(res_ms)
print("\nSuffix Trees")
display(res_st)

HashTable


Unnamed: 0,motif,match,distance,displacement
0,TGCACCC,TGCA,0,3
1,GGGTGCA,GGGTGCA,0,0
2,CTGCACC,GTGCA,1,2
3,GTGCACC,GTGCA,0,2
4,GCACCCT,GCA,0,4



Median String


Unnamed: 0,motif,match,distance,displacement
0,TGCACCC,TGCA,0,3
1,GGGTGCA,GGGTGCA,0,0
2,GGTGCAC,GGTGCA,0,1
3,GGTGCAG,GGTGCA,0,1
4,CTGCACC,GTGCA,1,2



Suffix Trees


Unnamed: 0,motif,match,distance,displacement
0,GGGTGCA,GGGTGCA,0,0
1,GGGTGTA,GGGTGCA,1,0
2,GGGTGAA,GGGTGCA,1,0
3,GGGTGCC,GGGTGCA,1,0
4,GGGTGCT,GGGTGCA,1,0


# Locating the sites

There are two algorithms we can use to locate the binding site.

## Indexing motifs of a fixed length

In [24]:
# Lower-case the sequences and the reference motif before calling findMotifData.
# NOTE(review): this rebinds `peaks` and `aft1`, so any earlier cell re-run after
# this one will see lower-case data instead of the upper-case data it was
# written for — confirm this is intended or use separate names.
peaks = [p.lower() for p in peaks]
aft1 = aft1.lower()
maxVar = 2  # passed as the last argument of findMotifData in the next cell

In [25]:
# aft1_2 is the reverse complement of aft1 (gggtgca -> tgcaccc), i.e. the same
# site read on the opposite strand. It is derived here because no earlier cell
# defines it, which made this cell fail on a fresh kernel.
aft1_2 = aft1[::-1].translate(str.maketrans("acgtACGT", "tgcaTGCA"))

# Locate aft1 in the peaks; len(pos_seqs1) is reported as the number of sequences hit.
pos_seqs1 = findMotifData(peaks,aft1,k,maxVar)
print(f'positions of {aft1}\n {pos_seqs1}')
print(f'found in {len(pos_seqs1)} sequences out of {len(peaks)}')

# Same search for the reverse complement; the label now names the motif
# actually searched instead of repeating "gggtgca".
pos_seqs2 = findMotifData(peaks,aft1_2,k,maxVar)
print(f'\npositions of {aft1_2}\n {pos_seqs2}')
print(f'found in {len(pos_seqs2)} sequences out of {len(peaks)}')

positions of gggtgca
 {0: {'gggtgca': [347]}, 1: {'gggtgca': [307]}, 4: {'gggtgca': [162]}, 11: {'gggtgca': [267]}, 13: {'gggtgca': [74]}, 29: {'gggtgca': [117]}, 38: {'gggtgca': [349]}, 40: {'gggtgca': [791]}, 46: {'gggtgca': [288]}, 53: {'gggtgca': [704]}, 55: {'gggtgca': [246]}, 77: {'gggtgca': [164]}, 89: {'gggtgca': [92]}, 93: {'gggtgca': [261]}, 101: {'gggtgca': [73]}, 102: {'gggtgca': [193]}, 112: {'gggtgca': [34]}}
found in 17 sequences out of 115

positions of gggtgca
 {0: {'tgcaccc': [404, 447]}, 1: {'tgcaccc': [949, 997]}, 10: {'tgcaccc': [93]}, 12: {'tgcaccc': [73]}, 28: {'tgcaccc': [197]}, 31: {'tgcaccc': [151]}, 39: {'tgcaccc': [206, 238]}, 46: {'tgcaccc': [236]}, 47: {'tgcaccc': [40]}, 48: {'tgcaccc': [184]}, 49: {'tgcaccc': [54]}, 53: {'tgcaccc': [358]}, 54: {'tgcaccc': [159]}, 57: {'tgcaccc': [65]}, 58: {'tgcaccc': [184]}, 67: {'tgcaccc': [182]}, 68: {'tgcaccc': [169]}, 72: {'tgcaccc': [148]}}
found in 18 sequences out of 115


## Frequency Matrices

`matrix ID : MA0269.2`

Collection : Core

Taxon : Fungi

Species : Saccharomyces cerevisiae

Data Type : PBM

Validation : 18842628

Uniprot ID : P22149

link : https://jaspar.elixir.no/matrix/MA0269.2/

In [50]:
# Load the JASPAR matrix MA0269.2; the captured output shows 4 rows of 10 integers.
mat = readJaspar("MA0269.2.jaspar")
print("Frequence table provided by jaspar.elixir.no\n")
# Print the table one row per line.
for line in mat : print(line)
# From here on `mat` is a NumPy array rather than the object readJaspar returned.
mat = np.array(mat)

Frequence table provided by jaspar.elixir.no

[136, 716, 145, 34, 160, 10, 961, 8, 25, 19]
[78, 25, 31, 47, 10, 966, 9, 972, 955, 740]
[142, 31, 288, 39, 812, 11, 15, 9, 2, 13]
[641, 226, 535, 878, 16, 10, 14, 10, 16, 225]


### My Functions to Calculate and Evaluate

In [119]:
def getMotifs(sequence,pos_lst,k) :
    """Return the slice ``sequence[p:p+k]`` for every start position in `pos_lst`.

    The result is a list in the same order as `pos_lst`. A slice that runs
    past the end of `sequence` is simply shorter than `k`.
    """
    windows = []
    for start in pos_lst:
        windows.append(sequence[start:start + k])
    return windows

In [120]:
def reduct_compare(cand,ref) :
    """Return the best ``(mismatches, shift)`` of `ref` suffixes against `cand`.

    For every ``shift`` in ``0 .. len(ref)-3`` the suffix ``ref[shift:]`` is
    compared position by position with the prefix of `cand` of the same
    length. The smallest ``(mismatches, shift)`` tuple is returned; unlike
    `compare`, the matched suffix itself is not part of the result.

    Raises:
        IndexError: if `cand` is shorter than a suffix it is compared with.
        ValueError: if `ref` has fewer than 3 characters (nothing to compare).
    """
    def mismatches_at(shift):
        # Index into cand (rather than zip) so a too-short cand still raises.
        return sum(ref[shift + off] != cand[off] for off in range(len(ref) - shift))

    # distance, displacement
    return min((mismatches_at(shift), shift) for shift in range(len(ref) - 2))

In [121]:
def reduct_eval(kmers,ref) :
    """Score every candidate in `kmers` against `ref` with `reduct_compare`.

    Returns a dict mapping each distinct candidate to its
    ``(mismatches, shift)`` tuple.
    """
    scores = {}
    for kmer in kmers:
        scores[kmer] = reduct_compare(kmer, ref)
    return scores

In [122]:
def getTopEval(stats,n) :
    """Return the first `n` entries of `stats` ordered by their score tuple.

    `stats` maps a motif to a ``(distance, displacement)`` pair. The result is
    a list of ``(motif, (distance, displacement))`` tuples sorted ascending by
    the pair; entries with equal pairs keep the dict's order.
    """
    ranked = sorted(
        ((motif, (dist, displace)) for motif, (dist, displace) in stats.items()),
        key=lambda entry: entry[1],
    )
    return ranked[:n]

### Calculation

In [52]:
# Build the PWM from the JASPAR matrix, then derive `f` from it; both use k.
PWM = computing_pwm(mat,k)
f = f0_calcule(PWM,k)
# Search the peaks with the PWM; `pos` is later iterated as {sequence index: positions}.
# NOTE(review): the literal 7 equals k and presumably should be k — confirm
# against searchPWMOptmiseMotifs before changing k.
pos = searchPWMOptmiseMotifs(peaks,7,PWM,f)

In [158]:
# For every sequence index in `pos`, extract the k-mers at the reported
# positions, rank the distinct k-mers by their (distance, displacement) to
# aft1, and record how often each occurs in that sequence.
res = dict()

for i,lst in pos.items() :
    seq = peaks[i]

    motifs = getMotifs(seq,lst,k)
    motifs_freq = Counter(motifs)
    # Named `motif_scores` rather than `stats` so this cell no longer overwrites
    # the `statistics as stats` module alias imported at the top of the notebook.
    motif_scores = reduct_eval(motifs,aft1)
    # Asking for len(motif_scores) entries returns all of them, sorted.
    top = getTopEval(motif_scores,len(motif_scores))

    # (motif, (distance, displacement), occurrences in this sequence)
    res[i] = [(m,s,motifs_freq[m]) for m,s in top]

res

{0: [('gggtgca', (0, 0), 1),
  ('gtgtgca', (1, 0), 1),
  ('gagtgaa', (2, 0), 1),
  ('gactgca', (2, 0), 1),
  ('tggggca', (2, 0), 1),
  ('taaagca', (2, 3), 1),
  ('aagtaca', (3, 0), 1),
  ('aattgca', (3, 0), 2),
  ('aactgca', (3, 0), 1),
  ('tagtgaa', (3, 0), 1),
  ('tagtaga', (3, 2), 1),
  ('tatttca', (3, 3), 1),
  ('ttttgct', (3, 3), 2),
  ('aattgaa', (3, 4), 1),
  ('aattgta', (3, 4), 1)],
 1: [('gggtgca', (0, 0), 1),
  ('aggtgca', (1, 0), 1),
  ('gaaagca', (1, 4), 1),
  ('cgttgca', (2, 0), 2),
  ('gtttgca', (2, 0), 1),
  ('gagttca', (2, 0), 1),
  ('ttgcgca', (2, 2), 1),
  ('taatgta', (2, 4), 1),
  ('tcttgca', (2, 4), 1),
  ('taaggca', (2, 4), 1),
  ('aaatgca', (2, 4), 1),
  ('aagggca', (3, 0), 1),
  ('ttttgca', (3, 0), 1),
  ('tagtgga', (3, 0), 1),
  ('tagtgta', (3, 0), 1),
  ('ctttgca', (3, 0), 2),
  ('tagtaga', (3, 2), 1),
  ('atttaca', (3, 2), 1),
  ('aattgaa', (3, 4), 1)],
 2: [('aggtgca', (1, 0), 1),
  ('gtttaca', (2, 2), 1),
  ('tactgca', (2, 3), 1),
  ('ttaagca', (2, 3), 1),
 

In [159]:
# Sum the per-sequence occurrence counts of each motif over all sequences in `res`.
all_freq = dict()

for i,lst in res.items() :
    # The count is unpacked as `count`, not `f`, so this cell no longer
    # overwrites the global `f` computed by f0_calcule and passed to
    # searchPWMOptmiseMotifs.
    for (x,_,count) in lst :
        all_freq[x] = all_freq.get(x, 0) + count

In [160]:
final_top = getTopMotifs(all_freq,10,True)
final_top

{'gggtgca': 17,
 'tatttca': 14,
 'tattgaa': 13,
 'aattgaa': 9,
 'gaaagca': 9,
 'atttaca': 9,
 'aaataca': 9,
 'aaaagca': 9,
 'aggtgca': 8,
 'aactgca': 7}

In [219]:
res_freq_mat = convert_df_eval(eval(final_top,aft1.lower()))
display(res_freq_mat)

Unnamed: 0,motif,match,distance,displacement
0,gggtgca,gggtgca,0,0
1,tatttca,tgca,3,3
2,tattgaa,tgca,3,3
3,aattgaa,gca,3,4
4,gaaagca,gca,1,4
5,atttaca,gtgca,3,2
6,aaataca,gca,2,4
7,aaaagca,gca,2,4
8,aggtgca,gggtgca,1,0
9,aactgca,gggtgca,3,0


HashTable
| motif | match | distance | displacement |
| :-: | :-: | :-: | :-: |
| TGCACCC | TGCA | 0 | 3 |
| GGGTGCA | GGGTGCA | 0 | 0 |
| CTGCACC | GTGCA | 1 | 2 |
| GTGCACC | GTGCA | 0 | 2 |
| GCACCCT | GCA | 0 | 4 |

Median String
| motif | match | distance | displacement |
| :-: | :-: | :-: | :-: |
| TGCACCC | TGCA | 0 | 3 |
| GGGTGCA | GGGTGCA | 0 | 0 |
| GGTGCAC | GGTGCA | 0 | 1 |
| GGTGCAG | GGTGCA | 0 | 1 |
| CTGCACC | GTGCA | 1 | 2 |

Suffix Trees
| motif | match | distance | displacement | variation |
| :-: | :-: | :-: | :-: | :-: |
| GGGTGCA | GGGTGCA | 0 | 0 | original |
| GGGTGTA | GGGTGCA | 1 | 0 | 1 |
| GGGTGAA | GGGTGCA | 1 | 0 | 1 |
| GGGTGCC | GGGTGCA | 1 | 0 | 1 |
| GGGTGCT | GGGTGCA | 1 | 0 | 1