In [53]:
from collections import defaultdict

"""
defaultdict allows adding a value to the dictionary without first checking
whether the key is new. If the key does not exist, it is initialized with a
new instance of the default type (e.g. list or set) as its value.
"""

def parse_genomes_input(input_file):

    """
    Parse the genome file into a mapping from genome number to its sequences.

    The genome number is taken from the second '#'-separated field of each
    line. Everything after the first tab is concatenated into one string
    (tabs and the trailing newline removed), which is then cut at every 'X'
    so that each piece becomes a separate sequence of that genome.

    Args:
        input_file: iterable of text lines (e.g. an open file object).

    Returns:
        defaultdict(list) mapping genome number -> list of sequence strings.
    """

    genomes = defaultdict(list)
    for line in input_file:
        # Blank lines (typically a trailing newline at the end of the file)
        # have no '#' field and would otherwise raise IndexError below.
        if not line.strip():
            continue
        genome_number = line.split('#')[1]
        sequence = ''.join(line.split('\t')[1:]).replace('\n', '')
        # split() returns [sequence] when there is no 'X', so one extend()
        # covers both the split and the unsplit case.
        genomes[genome_number].extend(sequence.split('X'))
    return genomes

def find_all_words(d, genomes):

    """
    Collect every word made of d consecutive 4-character units.

    Each sequence is scanned with a window of 4 * d characters that advances
    4 characters at a time. The result maps each window (word) to the set of
    genome numbers in which it was seen.

    Args:
        d: number of 4-character units per word.
        genomes: mapping of genome number -> iterable of sequence strings.

    Returns:
        defaultdict(set) mapping word -> set of genome numbers.
    """

    word_width = 4 * d
    words = defaultdict(set)
    for genome_number, genome_sequences in genomes.items():
        for sequence in genome_sequences:
            # Last index at which a full-width window still fits.
            last_start = len(sequence) - word_width
            for start in range(0, last_start + 1, 4):
                words[sequence[start:start + word_width]].add(genome_number)
    return words

def find_consecutive_cogs(words):

    """
    Find words in which the same 4-character cog appears three times in a row.

    Only words that occur in at least 10 genomes are kept. The result maps
    the repeated cog to the set of qualifying words that contain the run.

    Args:
        words: mapping of word -> collection of genomes containing it.

    Returns:
        dict mapping cog -> set of words.
    """
    consecutive_cogs_with_words = {}
    for word, genomes in words.items():
        # Slide over cog boundaries; stop where three cogs no longer fit.
        for start in range(0, len(word) - 11, 4):
            first = word[start:start + 4]
            second = word[start + 4:start + 8]
            third = word[start + 8:start + 12]
            if first == second == third and len(genomes) >= 10:
                consecutive_cogs_with_words.setdefault(first, set()).add(word)
    return consecutive_cogs_with_words



In [54]:
# Number of cogs per word. Defined once so the printed message and the
# actual search can never disagree.
WORD_LENGTH = 10

with open('cog_words_bac.txt') as f:
  genomes = parse_genomes_input(f)
# Not used by the cells shown here; kept in case another cell relies on it.
output_to_decipher = {}
print(f'The length of the words is {WORD_LENGTH}')
words = find_all_words(d=WORD_LENGTH, genomes=genomes)
consecutive_cogs_with_words = find_consecutive_cogs(words)
print(consecutive_cogs_with_words)
    

The length of the words is 10
{'2165': {'2804145921652165216547953156329731491989', '3031145028041459216521652165479531563297', '1450280414592165216521654795315632973149'}, '1463': {'0244022211270767076714631463146314631463', '0222112707670767146314631463146314631463', '1141102807670767146314631463146314631463'}, '3539': {'3691018312502062353931883121353935393539', '0008061708013539312131883539353935393539', '1734000806170801353931213188353935393539', '0617080135393121318835393539353935390413', '0801353931213188353935393539353904130414'}, '1681': {'1681168116813352335133513353335428740630'}, '1344': {'1749478747862063170617051256134413441344', '4786206317061705125613441344134413341345', '4787478620631706170512561344134413441334'}}


In [55]:
def make_spaces_between_cogs(output_to_decipher):

  """
  Re-spell every word as 4-character chunks separated by single spaces.

  Entries whose collection of words is empty are left out of the result.
  Returns a dict mapping each cog to a list of the spaced words, in the
  iteration order of the input collection.
  """

  output_with_spaces = {}
  for cog, words_with_cog in output_to_decipher.items():
    if words_with_cog:
      output_with_spaces[cog] = [
        ' '.join(word[start:start + 4] for start in range(0, len(word), 4))
        for word in words_with_cog
      ]
  return output_with_spaces

In [56]:
# Re-format every word as space-separated 4-character cogs for display.
output_with_spaces = make_spaces_between_cogs(consecutive_cogs_with_words)
print(output_with_spaces)

{'2165': ['2804 1459 2165 2165 2165 4795 3156 3297 3149 1989', '3031 1450 2804 1459 2165 2165 2165 4795 3156 3297', '1450 2804 1459 2165 2165 2165 4795 3156 3297 3149'], '1463': ['0244 0222 1127 0767 0767 1463 1463 1463 1463 1463', '0222 1127 0767 0767 1463 1463 1463 1463 1463 1463', '1141 1028 0767 0767 1463 1463 1463 1463 1463 1463'], '3539': ['3691 0183 1250 2062 3539 3188 3121 3539 3539 3539', '0008 0617 0801 3539 3121 3188 3539 3539 3539 3539', '1734 0008 0617 0801 3539 3121 3188 3539 3539 3539', '0617 0801 3539 3121 3188 3539 3539 3539 3539 0413', '0801 3539 3121 3188 3539 3539 3539 3539 0413 0414'], '1681': ['1681 1681 1681 3352 3351 3351 3353 3354 2874 0630'], '1344': ['1749 4787 4786 2063 1706 1705 1256 1344 1344 1344', '4786 2063 1706 1705 1256 1344 1344 1344 1334 1345', '4787 4786 2063 1706 1705 1256 1344 1344 1344 1334']}


In [57]:
def parse_info_input(file):

  """
  Build a lookup table from the ';'-separated info file.

  The key is the first field with its first three characters removed
  (presumably a "COG" prefix -- confirm against the file). The value is the
  list of fields between the first and the last one. When a key occurs on
  several lines, only its first occurrence is kept.

  Returns a defaultdict(list), so looking up an unknown key yields [].
  """

  info = defaultdict(list)
  for line in file:
    fields = line.split(';')
    # setdefault keeps the entry from the first line that mentions the key.
    info.setdefault(fields[0][3:], fields[1:-1])
  return info

In [58]:
# Load the info table; the file is decoded as Windows-1252.
with open('COG_INFO_TABLE.txt', encoding='Windows-1252') as file:
    info = parse_info_input(file)

In [59]:
def find_number_of_occurences_in_row(cog, word):
  """
  Return the length of the longest run of `cog` in a space-separated word.

  The word is read in steps of 5 characters (a 4-character cog plus one
  separator), comparing the first 4 characters of each step with `cog`.
  """
  longest_run = 0
  current_run = 0
  for start in range(0, len(word) - 3, 5):
    if word[start:start + 4] == cog:
      current_run += 1
      # Track the best run as it grows, so a run that reaches the end of
      # the word is counted without a separate final check.
      longest_run = max(longest_run, current_run)
    else:
      current_run = 0
  return longest_run

5

In [86]:
def print_output(output_with_spaces):

  """
  Print a report for every cog and each of the words it was found in.

  For each word this prints the genomes it appears in (looked up in the
  global `words` after removing the spaces), the longest run of the cog
  inside the word, and the entry of the global `info` table for every cog
  that makes up the word.
  """

  for cog, words_with_cog in output_with_spaces.items():
    print(f'The cog is {cog}\n')
    for word_number, word in enumerate(words_with_cog, start=1):
      # `words` is keyed by the un-spaced form of the word.
      genoms = words[word.replace(" ", "")]
      number_of_genoms = len(genoms)
      print(f'word number {word_number}:')
      print(f'The word is {word} and it appears in the {number_of_genoms} following genoms:')
      print(genoms)
      number_of_occurences = find_number_of_occurences_in_row(cog, word)
      print(f'The cog apears in this word {number_of_occurences} times in a row.')
      print('The word Functionality is:')
      for cog_in_word in word.split(' '):
        print(f'cog: {cog_in_word}:')
        print(info[cog_in_word])
      print("\n")

In [87]:
# Print the report for every cog in output_with_spaces.
print_output(output_with_spaces)

The cog is 2165

word number 1:
The word is 2804 1459 2165 2165 2165 4795 3156 3297 3149 1989 and it appears in the 13 following genoms:
{'NC_009792', 'NC_013421', 'NC_009800', 'NC_015061', 'NC_014618', 'NC_009832', 'NC_011742', 'NC_004431', 'NC_007946', 'NC_010473', 'NC_014121', 'NC_013592', 'NC_010468'}
The cog apears in this word 3 times in a row.
The word Functionality is:
cog: 2804:
['NUW', 'CELLULAR PROCESSES AND SIGNALING', 'Cell motility', 'CELLULAR PROCESSES AND SIGNALING', 'Intracellular trafficking, secretion, and vesicular transport', 'CELLULAR PROCESSES AND SIGNALING', 'Extracellular structures', 'Type II secretory pathway ATPase GspE/PulE or T4P pilus assembly pathway ATPase PilB']
cog: 1459:
['NUW', 'CELLULAR PROCESSES AND SIGNALING', 'Cell motility', 'CELLULAR PROCESSES AND SIGNALING', 'Intracellular trafficking, secretion, and vesicular transport', 'CELLULAR PROCESSES AND SIGNALING', 'Extracellular structures', 'Type II secretory pathway, component PulF']
cog: 2165:
['