In [1]:
!pip install biopython

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biopython
  Downloading biopython-1.81-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81


In [2]:
from Bio.Seq import MutableSeq, Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import pandas as pd
import random

In [3]:
final = pd.read_csv('./final.csv')
final.head()

Unnamed: 0,gene_symbol,gene_id,acession,version,interval_from,interval_to,strand,gene,prot_seq,gr_acid_num,num_transcripts,missense_mutation_amount
0,NLRP3,114548,NC_000001,11,247416077,247448817,plus,TCTCTGCCTCTGCTCTGATGTAAGTGGAGACCACATCCTTCCTGCC...,MHTFKHTHIHRPPSPLLSPSLVSNVGAEVV*TGHREGPLFYLPYMS...,4800,20,27
1,MEFV,4210,NC_000016,10,3242027,3256633,minus,AGATTTTCTAATTTTTTTATTCATGAGGCCCAGTCAATTCTCTTAA...,MVFGLSTELGEDESWYRFWGSLVDDRWDLLDHGILKLFKFKFDVLW...,3506,2,24
2,NOD2,64127,NC_000016,10,50693606,50733075,plus,GAGCCGGGAGTCGTGGCCCGGAGTGGGCCTTGGAGTCGGCGCGCAG...,MENTLRGRRRKREQPTL*IQV*VGKGLSGVVLDAPV*GVDKREDTK...,4675,18,15
3,PSTPIP1,9051,NC_000015,10,76994680,77037475,plus,CCTTGCCTCTGTGTGCTCACAGCCCCCCAGAGCACAGCTGTGTCTG...,MSDPFSKDRGSESER*IYYHYFIFSCVVDFVT*SFP*VP*FRGSWN...,2217,18,6


In [5]:
# For gene MEFV translation starts from index 1, for true silence mutation only index 0 remains.
# It causes an endless loop, that's why I drop it from condition.


genes = final.gene.to_list()
prot_seq = final.prot_seq.to_list()
# dict for genes storage

mute_dict = {'NLRP3':{},'MEFV': {}, 'NOD2':{},'PSTPIP1':{}}

while True:
  if len(mute_dict['NLRP3']) == 4  and len(mute_dict['NOD2']) == 4  and len(mute_dict['PSTPIP1']) == 4:
    # and len(mute_dict['MEFV']) == 4 
    break
  mutations = []
  for ind, gene in enumerate(genes):

    n_bases = ['A', 'T', 'G', 'C']
    stop = ['UAA', 'UAG', 'UGA']

    # random index, where we change nucleotide
    num = random.randint(0, len(gene) - 1)
    start = gene[num]
    
    # delete this nucleotide from bases

    for base in n_bases:
      if gene[num] == base:
        n_bases.remove(base)
    # randomly choose another nucleotide
    insertion = random.choice(n_bases)
    gene = list(gene)
    gene.pop(num)
    gene.insert(num, insertion)
    gene = ''.join(gene)
    # set sequence type with Seq function
    gene = Seq(gene)      
      
    # Collect mutations

    # 1. true silence mutation
    if final.strand.loc[ind] == 'plus':
      compl = gene.complement()
      m_rna = compl.transcribe()
      

      index = m_rna.find('AUG')
      if num < index:
        print(f'This mutation {start}>{gene[num]} in gene {final.gene_symbol.loc[ind]} located in {num} will not affect protein\
        translation starts from {index}')
        mutations.append({f'True silence mutation in {final.gene_symbol.loc[ind]}':f'{start}>{gene[num]}, index: {num}'})
        continue

      rna = m_rna[index:]
      prot = rna.translate()

    else:
      # 2. silence mutation
      reversed = gene.reverse_complement() 
      compl = reversed.complement()
      m_rna = compl.transcribe()

      index = m_rna.find('AUG')
      if num < index:
        print(f'This mutation {start}>{gene[num]} in gene {final.gene_symbol.loc[ind]} located in {num} will not affect protein\
        translation starts from {index}')
        mutations.append({f'True silence mutation in {final.gene_symbol.loc[ind]}':f'{start}>{gene[num]}, index: {num}'})
        continue

      rna = m_rna[index:]
      prot = rna.translate() 
    # looking for a codon
    if num > 2:
      cut = num - index
      pos = cut % 3
      if pos == 0:
        codon = rna[cut:cut+3]
      elif pos == 1:
        codon = rna[cut-1:cut+2] 
      else:
        codon = rna[cut-2:cut+1]   

    
    if codon in stop:
      
      # 3. nonsence mutation
      print(f'This mutation  {start}>{gene[num]} in gene {final.gene_symbol.loc[ind]} located in {num} will stop translation\
      codon is {codon}')
      mutations.append({f'Nonsence mutation in {final.gene_symbol.loc[ind]}':f'{start}>{gene[num]}, index: {num}'}) 
      continue 
    else: 
      # If proteins are equal, it's Silence
        if prot == prot_seq[ind]:
          print(f'This mutation {start}>{gene[num]} in gene {final.gene_symbol.loc[ind]} located in {num} will not affect amino acid')
          mutations.append({f'Silence mutation in {final.gene_symbol.loc[ind]}':f'{start}>{gene[num]}, index: {num}'})
        else:
          # 4. missence mutation
          if len(prot) != len(prot_seq[ind]):
            continue  

          # Looking for an amino acid and it's location in a protein       
          id, difference = [(x, i) for x, i in enumerate(prot) if i != prot_seq[ind][x]][0]
          
          print(f'This mutation {start}>{gene[num]} in gene {final.gene_symbol.loc[ind]} located in {num} will change amino acid from {prot_seq[ind][id]} to {difference}')
          mutations.append({f'Missence mutation in {final.gene_symbol.loc[ind]}':f'{start}>{gene[num]}, index: {num}'})
  # fill our dict
  for k, value in mute_dict.items():
    for mutation in mutations: 
      key_str = [i for i in mutation.keys()][0]
      
      star = ' '.join(key_str.split()[:-1])
      star = star + ' ' + str(k)     
      if k in key_str and star not in value.keys():   
        mute_dict[k].update(mutation)




This mutation C>A in gene NLRP3 located in 22987 will change amino acid from V to L
This mutation C>G in gene MEFV located in 3861 will change amino acid from Q to E
This mutation A>T in gene NOD2 located in 22787 will change amino acid from S to T
This mutation C>A in gene PSTPIP1 located in 41272 will not affect amino acid
This mutation T>A in gene NLRP3 located in 20707 will change amino acid from S to C
This mutation T>C in gene MEFV located in 4608 will change amino acid from F to L
This mutation A>G in gene NOD2 located in 39360 will change amino acid from I to T
This mutation C>T in gene PSTPIP1 located in 8969 will change amino acid from A to T
This mutation G>C in gene NLRP3 located in 15050 will change amino acid from T to S
This mutation A>G in gene MEFV located in 9950 will change amino acid from E to G
This mutation C>T in gene NOD2 located in 30259 will change amino acid from M to I
This mutation A>G in gene PSTPIP1 located in 11667 will change amino acid from I to T
This



This mutation G>T in gene NOD2 located in 31003 will not affect amino acid
This mutation A>T in gene PSTPIP1 located in 21735 will change amino acid from V to D
This mutation  C>T in gene NLRP3 located in 10296 will stop translation      codon is UGA
This mutation C>G in gene MEFV located in 3819 will change amino acid from R to G
This mutation G>A in gene NOD2 located in 27466 will not affect amino acid
This mutation A>G in gene PSTPIP1 located in 3318 will change amino acid from L to P
This mutation T>A in gene NLRP3 located in 2257 will change amino acid from T to S
This mutation C>A in gene MEFV located in 10362 will change amino acid from L to I
This mutation T>C in gene NOD2 located in 24050 will change amino acid from N to D
This mutation T>A in gene PSTPIP1 located in 19557 will change amino acid from D to V
This mutation A>T in gene NLRP3 located in 30392 will change amino acid from V to D
This mutation A>G in gene MEFV located in 14499 will change amino acid from K to E
This 

In [6]:
# The execution time is always different, because it is random. But it seems to work
mute_dict

{'NLRP3': {'Missence mutation in NLRP3': 'C>A, index: 22987',
  'Nonsence mutation in NLRP3': 'C>T, index: 10296',
  'Silence mutation in NLRP3': 'T>C, index: 26763',
  'True silence mutation in NLRP3': 'A>C, index: 173'},
 'MEFV': {'Missence mutation in MEFV': 'C>G, index: 3861',
  'Silence mutation in MEFV': 'C>G, index: 12190',
  'Nonsence mutation in MEFV': 'T>C, index: 945'},
 'NOD2': {'Missence mutation in NOD2': 'A>T, index: 22787',
  'Silence mutation in NOD2': 'G>T, index: 31003',
  'Nonsence mutation in NOD2': 'C>A, index: 32246',
  'True silence mutation in NOD2': 'C>G, index: 181'},
 'PSTPIP1': {'Silence mutation in PSTPIP1': 'C>A, index: 41272',
  'Missence mutation in PSTPIP1': 'C>T, index: 8969',
  'True silence mutation in PSTPIP1': 'G>T, index: 460',
  'Nonsence mutation in PSTPIP1': 'G>A, index: 13160'}}

In [7]:
mute_dict['NOD2']

{'Missence mutation in NOD2': 'A>T, index: 22787',
 'Silence mutation in NOD2': 'G>T, index: 31003',
 'Nonsence mutation in NOD2': 'C>A, index: 32246',
 'True silence mutation in NOD2': 'C>G, index: 181'}