In [1]:
import pandas as pd

Average masses taken from http://proteomicsresource.washington.edu/protocols06/masses.php.

In [2]:
# Define dictionary keys and values

molecular_mass_dict = {
    'A':71.0779, 
    'R':156.18568, 
    'G':57.05132, 
    'S':87.0773, 
    'P':97.11518, 
    'V':99.13106, 
    'T':101.10388, 
    'C':103.1429, 
    'L':113.15764, 
    'I':113.15764, 
    'N':114.10264, 
    'D':115.0874, 
    'Q':128.12922, 
    'K':128.17228, 
    'E':129.11398, 
    'M':131.19606, 
    'H':137.13928, 
    'F':147.17386, 
    'U':150.3079, 
    'Y':163.17326, 
    'W':186.2099, 
    'O':237.29816
}

In [None]:
# Convert seq column to list
# May have to change df and column name depending on use

seq_list = list(test_df['AASeq'].tolist())
seq_list

In [None]:
# Calculate approx mass in same order as seq_list

total_mass = []

for seq in seq_list:
    mass_list = []
    for amino_acid, mw in molecular_mass_dict.items():
        aa_weight = seq.count(amino_acid)*mw
        mass_list.append(aa_weight)
    total_mass.append(sum(mass_list))
    

print(total_mass)

### Below code tests above code
Random sequences generated using http://molbiotools.com/randomsequencegenerator.html.

In [3]:
# Generating test dataframe with random sequences of variable length

test_data = {
    'Gene': ['RealNiceProtein', 
             'EvenCoolerProtein', 
             'SuperBigProtein'],
    'AASeq': ['MPFMVNNIYVSFCEIKEIVCAGGSTTKYADVLQENNEQGRTVKLQ', 
             'MIDVVEDAKANALNNVRCNMVGSGLQAAGAIMGLAERESFFQAMEEARSAKGECEWKIDE', 
             'MSVMKVCCFYAPKQRLNSSLQMPQCVLMNDVALIVRQSDEGIIGGFRLMHKYVKIVYGPKTSESYKRVGPNERLGIDTAKDGIAKAWSLPILEIRCLENVLPYDLGKMHP']
}

test_df = pd.DataFrame(test_data, columns = ['Gene', 'AASeq'])

test_df

Unnamed: 0,Gene,AASeq
0,RealNiceProtein,MPFMVNNIYVSFCEIKEIVCAGGSTTKYADVLQENNEQGRTVKLQ
1,EvenCoolerProtein,MIDVVEDAKANALNNVRCNMVGSGLQAAGAIMGLAERESFFQAMEE...
2,SuperBigProtein,MSVMKVCCFYAPKQRLNSSLQMPQCVLMNDVALIVRQSDEGIIGGF...


In [4]:
# Convert seq column to list
# May have to change df and column name depending on use

seq_list = list(test_df['AASeq'].tolist())
seq_list

['MPFMVNNIYVSFCEIKEIVCAGGSTTKYADVLQENNEQGRTVKLQ',
 'MIDVVEDAKANALNNVRCNMVGSGLQAAGAIMGLAERESFFQAMEEARSAKGECEWKIDE',
 'MSVMKVCCFYAPKQRLNSSLQMPQCVLMNDVALIVRQSDEGIIGGFRLMHKYVKIVYGPKTSESYKRVGPNERLGIDTAKDGIAKAWSLPILEIRCLENVLPYDLGKMHP']

In [5]:
# Calculate approx mass in same order as seq_list

total_mass = []

for seq in seq_list:
    mass_list = []
    for amino_acid, mw in molecular_mass_dict.items():
        aa_weight = seq.count(amino_acid)*mw
        mass_list.append(aa_weight)
    total_mass.append(sum(mass_list))
    

print(total_mass)

[5051.7509, 6488.28216, 12396.57926]


In [6]:
# Append new list to specific location in original dataframe

test_df.insert(loc=2, # May have to change location depending on dataframe shape
               column='Mass',
               value = total_mass
              )

test_df

Unnamed: 0,Gene,AASeq,Mass
0,RealNiceProtein,MPFMVNNIYVSFCEIKEIVCAGGSTTKYADVLQENNEQGRTVKLQ,5051.7509
1,EvenCoolerProtein,MIDVVEDAKANALNNVRCNMVGSGLQAAGAIMGLAERESFFQAMEE...,6488.28216
2,SuperBigProtein,MSVMKVCCFYAPKQRLNSSLQMPQCVLMNDVALIVRQSDEGIIGGF...,12396.57926


# Below cells contain function version of above code

In [86]:
def get_prot_mass(AASeq):
    
    '''Tabulate approximate total mass of input amino acid string based on the sum of
    the average masses for every amino acid in the string.'''
    
    # Create dict with amino acid sybols as keys and associated average masses as values
    # Average masses taken from http://proteomicsresource.washington.edu/protocols06/masses.php
    molecular_mass_dict = {
    'A':71.0779, 
    'R':156.18568, 
    'G':57.05132, 
    'S':87.0773, 
    'P':97.11518, 
    'V':99.13106, 
    'T':101.10388, 
    'C':103.1429, 
    'L':113.15764, 
    'I':113.15764, 
    'N':114.10264, 
    'D':115.0874, 
    'Q':128.12922, 
    'K':128.17228, 
    'E':129.11398, 
    'M':131.19606, 
    'H':137.13928, 
    'F':147.17386, 
    'U':150.3079, 
    'Y':163.17326, 
    'W':186.2099, 
    'O':237.29816
    }
    
    # Calculate approx mass for amino acid sequence
    mass_list = []
    
    # Capitalize all letters in amino acid string prior to taking mass
    AASeq = AASeq.upper()
    
    # Raise error if amino acid sequence contains characters not in the dict
    acceptable_amino_acids_list = list(molecular_mass_dict.keys())

    for char in AASeq:
        if char not in acceptable_amino_acids_list:
            if char == ' ':
                continue
            else:
                raise ValueError('AASeq contains unacceptable characters')
    
    for amino_acid, mw in molecular_mass_dict.items():
        aa_weight = AASeq.count(amino_acid)*mw
        mass_list.append(aa_weight)
    total_mass = sum(mass_list)
    
    return total_mass

### Unit Tests

In [88]:
import unittest

from mcalc import get_prot_mass

class MolecularMassTestCase(unittest.TestCase):

    def test_lower(self):
        result_lower = get_prot_mass('mpfmvnniyvsfceikeivcaggsttkyadvlqenneqgrtvklq')
        self.assertEqual(results_lower, 5051.7509)
        
    def test_gaps(self):
        result_gaps = get_prot_mass('MPFMVNNIYVSF  CEIKEIV CAGGSTTKYADVLQEN NEQGRTVKLQ')
        self.assertEqual(result_gaps, 5051.7509)

    def test_numbers(self):
        # check that the function fails when fed values other than strings
        with self.assertRaises(AttributeError):
            get_prot_mass(123)

    def test_amino_acids(self):
        # check that the function fails when fed values other than strings containing real amino acid symbols
        with self.assertRaises(ValueError):
            get_prot_mass('MPFMVNNIYVSF528ZZZ')

if __name__ == '__main__':
    unittest.main()

ModuleNotFoundError: No module named 'mcalc'