In [1]:
import random
import os

In [2]:
!git clone https://github.com/PauliusMilmantas/ProteinFolding

Cloning into 'ProteinFolding'...
remote: Enumerating objects: 43, done.[K
remote: Counting objects: 100% (43/43), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 43 (delta 10), reused 37 (delta 7), pack-reused 0[K
Unpacking objects: 100% (43/43), 556.45 KiB | 2.80 MiB/s, done.


In [3]:
# Numeric value per one-letter residue code, listed in ascending order of
# value. Presumably a hydrophobicity scale -- TODO cite the source of the
# numbers. The gap symbol "-" and the placeholder "X" are mapped to 0.
hydrophobic = {
    "K": 5.72, "N": 6.17, "D": 6.18, "E": 6.38,
    "P": 6.64, "Q": 6.67, "R": 6.81, "S": 6.93,
    "T": 7.08, "G": 7.31, "A": 7.62, "H": 7.85,
    "W": 8.41, "Y": 8.53, "F": 8.99, "L": 9.37,
    "M": 9.83, "I": 9.99, "V": 10.38, "C": 10.93,
    "-": 0, "X": 0,
}

def transformHydro(seq, scale=None):
  """Add up the per-symbol values of a sequence.

  Args:
    seq: Iterable of single-character symbols, typically a sequence string.
      A trailing newline is harmless because unknown symbols are skipped.
    scale: Optional mapping from symbol to numeric value. When omitted, the
      notebook-level ``hydrophobic`` table is used.

  Returns:
    The total of the values of every symbol found in the table. The result
    is the integer 0 when no symbol matched, otherwise a float.
  """
  if scale is None:
    scale = hydrophobic

  # NOTE(review): the lookup is case-sensitive, so lowercase symbols are
  # skipped unless the table contains them -- confirm this is intended for
  # a3m input.
  total = 0  # deliberately not called `sum`, which would shadow the builtin
  for symbol in seq:
    value = scale.get(symbol)
    if value is not None:
      total += float(value)

  return total

In [4]:
class Sequence:
  """Pairs a sequence string with the description string that labels it."""

  def __init__(self, seq, desc):
    # Both values are stored exactly as passed in (no stripping/validation).
    self.desc = desc
    self.seq = seq

In [5]:
with open('/content/ProteinFolding/TestCases/test_fb11d.a3m') as a3mFile:
  lines = a3mFile.readlines()

# The first line of the file is kept separately as the header.
header = lines[0]
sequences = []

# Every following line is consumed in pairs: a description line, then the
# sequence line that belongs to it. Line endings are kept as read. If the
# number of remaining lines is odd, the final unpaired description is dropped.
desc = ''
for line in lines[1:]:
  if desc == '':
    desc = line
    continue
  sequences.append(Sequence(line, desc))
  desc = ''

print('Loaded {} sequences out of {} lines'.format(len(sequences), len(lines) - 1))

Loaded 111 sequences out of 222 lines


In [6]:
# Description lines of the rows that are put back at fixed positions later
# (criticalSequencesHead / criticalSequencesTail). They are removed from the
# pool here so that they are not shuffled as well.
# '>102\n' has to be listed: rows with that description are appended again
# with the fixed tail rows, so leaving them in the pool would duplicate them
# in every generated file. (It was previously spelled '>10\n'.)
sequencesImportant = ['>101\t102\t103\n', '>101\n', '>102\n', '>103\n']

# Re-running this cell is harmless: filtering an already filtered list
# yields the same list.
sequences = [seq for seq in sequences if seq.desc not in sequencesImportant]

In [7]:
# Rows that are placed at fixed positions around the shuffled sequences.
# Each distinct row text is written once and reused where it occurs twice.
_row_101 = 'PIAQIHILEGRSDEQKETLIREVSEAISRSLDAPLTSVRVIITEMAKGHFGIGGELASK---------------------------------------------------------------------------\n'
_row_101_102_103 = 'PIAQIHILEGRSDEQKETLIREVSEAISRSLDAPLTSVRVIITEMAKGHFGIGGELASKAIAQIHICGRSDEQLCGRSDEQKETLIREGHFGIGGELASKAIAQIHILCGRSDEQKETLIREGHFGIGGELASK\n'
_row_103 = '----------------------------------------------------------------------------------------------------AIAQIHILCGRSDEQKETLIREGHFGIGGELASK\n'
_row_102 = '-----------------------------------------------------------AIAQIHICGRSDEQLCGRSDEQKETLIREGHFGIGGELASK----------------------------------\n'

# NOTE: the sampling cell places the head rows in the reverse of the order
# listed here, so the last entry below ends up as the first row.
criticalSequencesHead = [
    Sequence(_row_101, '>101\n'),
    Sequence(_row_101, '>101\n'),
    Sequence(_row_101_102_103, '>101\t102\t103\n'),
]

# Tail rows are appended after the shuffled sequences in the order listed.
criticalSequencesTail = [
    Sequence(_row_103, '>103\n'),
    Sequence(_row_103, '>103\n'),
    Sequence(_row_102, '>102\n'),
    Sequence(_row_102, '>102\n'),
]

In [8]:
number_of_samples = 10

for epoch in range(number_of_samples):
  # Fresh random permutation of the non-critical sequences for this sample.
  shuffledSequences = random.sample(sequences, len(sequences))

  # Fixed rows around the shuffled ones. The head rows go in the REVERSE of
  # their list order (the last entry of criticalSequencesHead becomes row 0);
  # the tail rows keep their list order.
  reorderedSequences = (
      list(reversed(criticalSequencesHead))
      + shuffledSequences
      + list(criticalSequencesTail)
  )

  # One output directory per sample; exist_ok lets the cell be re-run
  # without failing on directories left over from a previous run.
  outputDir = '/content/stats/{}'.format(epoch)
  os.makedirs(outputDir, exist_ok=True)

  # Statistics: row index; description; sequence; summed table value.
  # The `with` block closes (and therefore flushes) the file before the
  # next step, so later cells always see complete files.
  with open('{}/stats.csv'.format(outputDir), 'w') as statisticsFile:
    for idx, s in enumerate(reorderedSequences):
      statisticsFile.write('{};{};{};{};\n'.format(
          idx,
          s.desc.replace('\n', ''),
          s.seq.replace('\n', ''),
          str(transformHydro(s.seq))
      ))

  # A3M output: the original header line followed by description/sequence
  # pairs, written with the line endings they were loaded with.
  with open('{}/test_fb11d.a3m'.format(outputDir), 'w') as resultFile:
    resultFile.write(header)
    for seq in reorderedSequences:
      resultFile.write(seq.desc)
      resultFile.write(seq.seq)

In [9]:
!zip -r /content/stats.zip /content/stats

  adding: content/stats/ (stored 0%)
  adding: content/stats/4/ (stored 0%)
  adding: content/stats/4/stats.csv (deflated 70%)
  adding: content/stats/4/test_fb11d.a3m (deflated 71%)
  adding: content/stats/6/ (stored 0%)
  adding: content/stats/6/stats.csv (deflated 70%)
  adding: content/stats/6/test_fb11d.a3m (deflated 71%)
  adding: content/stats/9/ (stored 0%)
  adding: content/stats/9/stats.csv (deflated 69%)
  adding: content/stats/9/test_fb11d.a3m (deflated 69%)
  adding: content/stats/2/ (stored 0%)
  adding: content/stats/2/stats.csv (deflated 70%)
  adding: content/stats/2/test_fb11d.a3m (deflated 71%)
  adding: content/stats/8/ (stored 0%)
  adding: content/stats/8/stats.csv (deflated 70%)
  adding: content/stats/8/test_fb11d.a3m (deflated 71%)
  adding: content/stats/7/ (stored 0%)
  adding: content/stats/7/stats.csv (deflated 70%)
  adding: content/stats/7/test_fb11d.a3m (deflated 71%)
  adding: content/stats/3/ (stored 0%)
  adding: content/stats/3/stats.csv (deflated 70