# Downloading & reading config settings

In [1]:
!git clone https://github.com/PauliusMilmantas/ProteinFolding

Cloning into 'ProteinFolding'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 12 (delta 0), reused 9 (delta 0), pack-reused 0[K
Unpacking objects: 100% (12/12), done.


In [2]:
from configparser import ConfigParser
configPath = '/content/ProteinFolding/config.ini'
config = ConfigParser()

config.read(configPath)

['/content/ProteinFolding/config.ini']

# Global settings for current environment

In [3]:
testCaseName = 'case1' #@param {type:"string"}
parameterSize = 136 #@param {type:"number"}

# Installing dependencies

In [4]:
!pip install Bio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Bio
  Downloading bio-1.4.0-py3-none-any.whl (270 kB)
[K     |████████████████████████████████| 270 kB 5.6 MB/s 
Collecting mygene
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting biopython>=1.79
  Downloading biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 28.8 MB/s 
Collecting biothings-client>=0.2.6
  Downloading biothings_client-0.2.6-py2.py3-none-any.whl (37 kB)
Installing collected packages: biothings-client, mygene, biopython, Bio
Successfully installed Bio-1.4.0 biopython-1.79 biothings-client-0.2.6 mygene-3.2.2


In [5]:
from google.colab import files
import numpy as np
from Bio.pairwise2 import format_alignment
from Bio import pairwise2
import re

# Parse MSA file

In [6]:
class MsaRow:
  def __init__(self, name, seq):
    self.name = name
    self.seq = seq

In [7]:
msaFilePath = config.get('testCases', testCaseName)
MSA = []

seqNameRegex = r'^[>].*';
seqRegex = r'^(?![#>]).*';

fullMSAPath = '/content/ProteinFolding/' + msaFilePath
with open(fullMSAPath) as file:
  print('Opened MSA in: {}'.format(fullMSAPath))

  for line in file:
    seqNameRegexMatch = re.match(seqNameRegex, line)
    seqRegexMatch = re.match(seqRegex, line)

    #If line is a sequence name
    if(seqNameRegexMatch is not None):
      seqName = line
    #If line is a sequence
    elif(seqRegexMatch is not None):
      MSA.append(
          MsaRow(seqName, line)
      )
    else:
      print(f'Line is ignored: "{line}"')

print("Found sequences in MSA file: {}".format(len(MSA)))

Opened MSA in: /content/ProteinFolding/TestCases/test_fb11f.a3m
Line is ignored: "#59,41,34	1,1,1
"
Found sequences in MSA file: 7710


In [8]:
sum = 0
for index, i in enumerate(MSA):
  sum += len(i.seq)

print(f'Average lenghts of sequences: {sum / len(MSA)}')

Average lenghts of sequences: 136.1201037613489


In [9]:
#@title Calculating distances - Dynamic programming
#@markdown Penalty if residues are not equal
penalty_notEqual = -2 #@param (type: "number")

#@markdown Penalty if there is a space between residues
penalty_space = -8 #@param (type: "number")

# Find sequence to which other will be compared with
comparableSeq = ''
for index, i in enumerate(MSA):
  if(len(i.seq) == parameterSize):
    comparableSeq = i.seq
    del MSA[index]

comparableSeq = comparableSeq.replace('\n', '')

print('Sequence for comparison: {}'.format(comparableSeq))

# Calculating distance matrices
distances = []
for seq in MSA:
  alignments = pairwise2.align.globalms(seq.seq, comparableSeq, 2, penalty_notEqual, penalty_space, penalty_space)
  distances.append(alignments)

Sequence for comparison: PVFHVHIGENQfTGDEKRNLADALNLALHEAMETPMDDRFIIISEHKEDEFFI----------------------------------------------------------------------------------


# Generating DTW array

In [10]:
DTW = [comparableSeq]
for distance in distances:
  distance = distance[0]

  if(len(distance.seqA) == parameterSize):
    seqA = list(distance.seqA)
    for i in range(parameterSize):
      if(distance.seqA[i] == '-'):
        seqA[i] = distance.seqB[i]

    toAppend = ''.join(seqA)
    DTW.append(toAppend.replace('\n', ''))

print('DTW size: {}'.format(len(DTW)))

DTW size: 4112


Writing DTW to file

In [11]:
f = open("DTW.txt", "w")

for line in DTW:
  f.write(line + ',')

f.close()

files.download('/content/DTW.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
DTW

['PVFHVHIGENQfTGDEKRNLADALNLALHEAMETPMDDRFIIISEHKEDEFFI----------------------------------------------------------------------------------',
 'PIAQIHILEGRSDEQEKETLIREVSEAISRSLDAPLTSVRVIITEMAKGHFGIGGELASKAIAQIHICGRSDEQLCGRSDEQKETLIREGHFGIGGELASKAIAQIHILCGRSDEQKETLIREGHFGIGGELASK',
 'PIAQIHILEGRSDEQEKETLIREVSEAISRSLDAPLTSVRVIITEMAKGHFGIGGELASK---------------------------------------------------------------------------',
 'PVVTIEMWEGRfTPEQKKALVEAVTSAVAGAIGCPPEAVEVIIHEVPKVNWGIGGQIASE---------------------------------------------------------------------------',
 'PIYHIEMMEGRfTPEQKRKLVEAVTRVSVDILGGSPEAVHVLIHEIPRDNWATGGQLWSE---------------------------------------------------------------------------',
 'PMINVSMFPGRfTAEQKQALVREVTDAFVRTCGGNPEGVWVTINEIPAEHWASGGTLFSE---------------------------------------------------------------------------',
 'PVVTVEMWEGRfTDEQKRKLAELVTNAVCEAIGCPREAVEVIMREVPRKNWAIGGKLASE---------------------------------------------------------------------------',
 'PTINVQLFEGRfTPEQKR