In [25]:
# INSTALLING LIBRARIES
# Following code taken from Google Magenta's open source code, link below:
# https://colab.research.google.com/notebooks/magenta/hello_magenta/hello_magenta.ipynb 
# @test {"output": "ignore"}

print('Installing dependencies...')
!apt-get update -qq && apt-get install -qq libfluidsynth1 fluid-soundfont-gm build-essential libasound2-dev libjack-dev
!pip install pyfluidsynth pretty_midi

!pip install -qU magenta

# Hack to allow python to pick up the newly-installed fluidsynth lib. 
# This is only needed for the hosted Colab environment.
import ctypes.util

# Keep a handle on the stock lookup so every other library still resolves normally.
orig_ctypes_util_find_library = ctypes.util.find_library


def proxy_find_library(lib):
  """Resolve 'fluidsynth' to a fixed soname; defer to the stock lookup otherwise."""
  if lib != 'fluidsynth':
    return orig_ctypes_util_find_library(lib)
  return 'libfluidsynth.so.1'


# Install the proxy so later calls to ctypes.util.find_library go through it.
ctypes.util.find_library = proxy_find_library

print('Importing libraries and defining some helper functions...')
# Colab-only module. NOTE(review): `files` is not referenced by any cell
# visible in this notebook -- confirm it is still needed.
from google.colab import files

# `mm` is the alias the later cells use for plot_sequence / play_sequence.
import magenta.music as mm
import magenta

print('🎉 Done!')
# Record the installed Magenta version in the cell output.
print(magenta.__version__)

Installing dependencies...
Importing libraries and defining some helper functions...
🎉 Done!
1.3.1


In [26]:
# CODE FOR SINGLE NOTE, AUDIO FILES IN SLIDE 1
from magenta.music.protobuf import music_pb2

# One single-note sequence per nucleotide. The pitch / instrument / program of
# each sample follows the mapping used by makeMusic, so the individual samples
# sound like the notes heard in the full-strain music.
RNA_T = music_pb2.NoteSequence()
RNA_G = music_pb2.NoteSequence()
RNA_A = music_pb2.NoteSequence()
RNA_C = music_pb2.NoteSequence()


# RNA T (U) -> note C (MIDI 60), bassoon; held for 2s, the others for 1s
RNA_T.notes.add(pitch=60, start_time=0, end_time=2, instrument=1, program=70, velocity=60)
# RNA A -> note E (MIDI 64), string pizzicato
RNA_A.notes.add(pitch=64, start_time=0, end_time=1, instrument=1, program=45, velocity=60)
# RNA G -> note D (MIDI 62), taiko drum
# (was pitch 54, which is F#, contradicting both the comment and makeMusic)
RNA_G.notes.add(pitch=62, start_time=0, end_time=1, instrument=3, program=116, velocity=60)
# RNA C -> note G (MIDI 67), marimba
# (was pitch 64, i.e. the same note as A, unlike makeMusic's 67)
RNA_C.notes.add(pitch=67, start_time=0, end_time=1, instrument=4, program=12, velocity=60)

# producing individual note sounds, in the order T, A, G, C
for single_note in (RNA_T, RNA_A, RNA_G, RNA_C):
  mm.play_sequence(single_note, synth=mm.fluidsynth)

In [0]:
# CLEANING DATA, BY ACCESSION ID
# Raw sequence text copied from GenBank: position numbers and whitespace are
# still mixed in with the nucleotides and are stripped further down.

LC522350 =  """1 tcgcaccgta gctggtgtct ctatctgtag tactatgacc aatagacagt ttcatcaaaa 
61 attattgaaa tcaatagccg ccactagagg agctactgta gtaattggaa caagcaaatt
      121 ctatggtggt tggcacaaca tgttaaaaac tgtttatagt gatgtagaaa accctcacct
      181 ta"""

MN970003 = """        1 taaacacctc ataccactta tgtacaaagg acttccttgg aatgtagtgc gtataaagat
       61 tgtacaaatg ttaagtgaca cacttaaaaa tctctctgac agagtcgtat ttgtcttatg
      121 ggcacatggc tttgagttga catctatgaa gtattttgtg aaaataggac ctgagcgcac
      181 ctgttgtcta tgtgatagac gtgccacatg cttttccact gcttcagaca cttatgcctg
      241 ttggcatcat tctattggat ttgattacgt ctataatccg tttatgattg
"""

MN975264 = """1 tgagttatga ggatcaagat gcacttttcg catatacaaa acgtaatgtc atccctacta
       61 taactcaaat gaatcttaag tatgccatta gtgcaaagaa tagagctcgc accgtagctg
      121 gtgtctctat ctgtagtact atgaccaata gacagtttca tcaaaaatta ttgaaatcaa
      181 tagccgccac tagaggagct actgtagtaa ttggaacaag caaattctat ggtggttggc
      241 acaacatgtt aaaaactgtt tatagtgatg tagaaaaccc tcacctt

"""
# Raw texts and their accession ids, paired by position.
to_clean = [ MN970003, LC522350, MN975264 ]
ids = ["MN970003", "LC522350", "MN975264"]

# Dictionary where key = accession id, value = bare string of nucleotides.
data = {}
for accession_id, raw_text in zip(ids, to_clean):
  # collapse all whitespace first, then drop the position digits
  unspaced = "".join(raw_text.split())
  data[accession_id] = "".join(ch for ch in unspaced if not ch.isdigit())

In [0]:
# CODE FOR AUDIO FILES IN SLIDE 4
# defining a function that generates music for a given strain.
from magenta.music.protobuf import music_pb2

def makeMusic(string):
  """Turn a nucleotide string into music, then plot and play it.

  Every nucleotide becomes one note of fixed length; notes are laid end to
  end in sequence order:

    t / u -> C (pitch 60), bassoon          (instrument 1, program 70)
    a     -> E (pitch 64), string pizzicato (instrument 1, program 45)
    g     -> D (pitch 62), taiko drum       (instrument 3, program 116)
    c     -> G (pitch 67), marimba          (instrument 4, program 12)

  Matching is case-insensitive. Any other symbol is rendered as a rest of one
  note length instead of being played as if it were cytosine.

  Args:
    string: iterable of single-character nucleotide symbols.

  Returns:
    None. The sequence is plotted with mm.plot_sequence and played with
    mm.play_sequence as a side effect.
  """
  # nucleotide -> (pitch, instrument, program)
  voices = {
      't': (60, 1, 70),
      'u': (60, 1, 70),  # uracil shares thymine's voice (t is assumed to be uracil)
      'a': (64, 1, 45),
      'g': (62, 3, 116),
      'c': (67, 4, 12),
  }
  note_len = 0.35
  velocity = 60

  # creating a "note sequence", a container to add/remove notes from
  seq = music_pb2.NoteSequence()
  time = 0.0

  for letter in string:
    # Reusing `end` as the next start keeps consecutive notes exactly adjacent.
    end = time + note_len
    voice = voices.get(letter.lower())
    if voice is not None:
      pitch, instrument, program = voice
      seq.notes.add(pitch=pitch, start_time=time, end_time=end,
                    instrument=instrument, program=program, velocity=velocity)
    time = end

  seq.tempos.add(qpm=60)

  # plotting sequence
  mm.plot_sequence(seq)
  # playing sequence
  mm.play_sequence(seq, synth=mm.fluidsynth)


In [29]:
# Sonify each strain in turn, printing its accession id as a label first
for accession_id, nucleotides in data.items():
  print(accession_id)
  makeMusic(nucleotides)

MN970003


LC522350


MN975264


In [41]:
# SINGULAR STRAND LC522350 ENCODED TO MUSIC
# Distinguishing between nucleotides

from magenta.music.protobuf import music_pb2
import bokeh.plotting

# The strain this cell sonifies. It is looked up by accession id: taking the
# first entry of `data` would pick MN970003 (the first key inserted), not the
# LC522350 strain this cell is about.
strain_id = "LC522350"

# nucleotide -> (pitch, velocity); the velocity differs per nucleotide so the
# four can be told apart by loudness as well as by pitch.
note_for = {
    't': (60, 1),    # RNA T/U -> note C
    'g': (62, 80),   # RNA G   -> note D
    'a': (64, 40),   # RNA A   -> note E
}
fallback_note = (66, 120)  # RNA C (and any other symbol) -> note F#

first_string = music_pb2.NoteSequence()
time = 0.0
note_len = 0.3
for letter in data[strain_id]:
  pitch, velocity = note_for.get(letter, fallback_note)
  first_string.notes.add(pitch=pitch, start_time=time, end_time=time + note_len, velocity=velocity)
  time = time + note_len

first_string.tempos.add(qpm=20);

# This is a colab utility method that visualizes a NoteSequence.
fig = mm.plot_sequence(first_string, show_figure=False)

fig.plot_width = 1000
fig.plot_height = 200
bokeh.plotting.output_notebook()
bokeh.plotting.show(fig)

# This is a colab utility method that plays a NoteSequence.
mm.play_sequence(first_string,synth=mm.fluidsynth)

In [31]:
# LAYERED ARRANGEMENT IN SLIDE 5
from magenta.music.protobuf import music_pb2
import bokeh.plotting

first_string = music_pb2.NoteSequence()
note_len = 0.3
v = 30  # base velocity; a note that agrees with an earlier strain gets v + 50

# nucleotide -> pitch
pitch_for = {
    't': 60,  # RNA T -> note C
    'g': 62,  # RNA G -> note D
    'a': 64,  # RNA A -> note E
}
default_pitch = 66  # RNA C (and any other symbol) -> note F#

# time-step index -> pitches already placed at that step by earlier strains.
# Keeping this explicitly avoids indexing into first_string.notes with a
# computed offset and using a bare `except` as control flow, which silently
# dropped every note whose pitch differed from the compared one.
pitches_at_step = {}

for string in data:
  # going through each strain and layering it on top of the previous ones
  time = 0.0
  for step, letter in enumerate(data[string]):
    pitch = pitch_for.get(letter, default_pitch)
    earlier_pitches = pitches_at_step.setdefault(step, set())

    # if an earlier strain has the same note at the same time, amplify the
    # sound; otherwise keep the lower volume
    velocity = v + 50 if pitch in earlier_pitches else v
    first_string.notes.add(pitch=pitch, start_time=time, end_time=time + note_len,
                           instrument=1, program=12, velocity=velocity)

    earlier_pitches.add(pitch)
    time = time + note_len


first_string.tempos.add(qpm=80);

# This is a colab utility method that visualizes a NoteSequence.
fig = mm.plot_sequence(first_string, show_figure=False)
fig.plot_width = 1000
fig.plot_height = 200
bokeh.plotting.output_notebook()
bokeh.plotting.show(fig)

# This is a colab utility method that plays a NoteSequence.
mm.play_sequence(first_string,  synth=mm.fluidsynth)


In [0]:
# CODE FOR VISUALIZATIONS IN TABLEAU
# Visualizations linked in slides
# Getting strain data to create visualization of 
# nucleotides / corresponding amino acids

# defining a function to retrieve corresponding amino acid
# Amino acid (three-letter code, or "stop") -> the DNA-alphabet codons that
# encode it. Built once at definition time rather than on every call.
_AMINO_ACID_CODONS = {
    "phe": ["ttt", "ttc"],
    "leu": ["tta", "ttg", "ctt", "ctc", "cta", "ctg"],
    "ser": ["tct", "tcc", "tca", "tcg", "agt", "agc"],
    "tyr": ["tat", "tac"],
    "cys": ["tgt", "tgc"],
    "trp": ["tgg"],
    "pro": ["cct", "ccc", "cca", "ccg"],
    "his": ["cat", "cac"],
    "gln": ["caa", "cag"],  # glutamine; was misspelled "gin"
    "arg": ["cgt", "cgc", "cga", "cgg", "aga", "agg"],
    "ile": ["att", "atc", "ata"],
    "met": ["atg"],
    "thr": ["act", "acc", "aca", "acg"],
    "asn": ["aat", "aac"],
    "lys": ["aaa", "aag"],
    "val": ["gtt", "gtc", "gta", "gtg"],
    "ala": ["gct", "gcc", "gca", "gcg"],
    "asp": ["gat", "gac"],
    "glu": ["gaa", "gag"],
    "gly": ["ggt", "ggc", "gga", "ggg"],
    "stop": ["taa", "tag", "tga"],
}

# Inverted view (codon -> amino acid) so a lookup is a single dict access.
_CODON_TO_AMINO_ACID = {
    codon: amino_acid
    for amino_acid, codons in _AMINO_ACID_CODONS.items()
    for codon in codons
}


def getAminoAcid(triplet):
  """Return the amino acid encoded by a three-nucleotide codon.

  Args:
    triplet: codon written with the letters a/c/g/t, in either case.

  Returns:
    The lowercase three-letter amino acid code, "stop" for a stop codon, or
    the string "amino acid not found" when the input is not a known codon.
  """
  if not isinstance(triplet, str):
    return "amino acid not found"
  return _CODON_TO_AMINO_ACID.get(triplet.lower(), "amino acid not found")

In [0]:
# defining function that organizes + downloads sequence info into dataframe
import pandas as pd

def downloadData(name):
  """Write one strain's nucleotides and their amino acids to "<name>.csv".

  The CSV has one row per nucleotide with two columns: "nucleotide" and
  "amino_acid". All three nucleotides of a codon carry that codon's amino
  acid. The sequence is read in frame from position 0 and cut off at the
  largest multiple of three (proof of concept), so up to two trailing
  nucleotides keep an empty "amino_acid" value.

  Despite the name, this only saves the file in the working directory (to be
  uploaded to Tableau); it does not trigger a browser download.

  Args:
    name: accession id; must be a key of the module-level `data` dict.

  Raises:
    KeyError: if `name` is not in `data`.
  """
  s = data[name]

  # Translate each whole codon once and fan the result out to its three rows.
  end = (len(s) // 3) * 3
  amino_acids = [""] * len(s)
  for start in range(0, end, 3):
    amino_acids[start:start + 3] = [getAminoAcid(s[start:start + 3])] * 3

  # Build the frame in one go: growing it row by row is quadratic, and the
  # chained `df.iloc[i]["col"] = ...` form may assign into a temporary copy.
  strain_df = pd.DataFrame(
      {"nucleotide": list(s), "amino_acid": amino_acids},
      columns=["nucleotide", "amino_acid"],
  )

  # saving dataframe as a csv file, to be uploaded to Tableau
  filename = name + ".csv"
  strain_df.to_csv(filename)


In [0]:
# writing one csv file per strain, named after its accession id
for accession_id in data:
  downloadData(accession_id)