In [None]:
import sqlite3
import pandas as pd
import numpy as np

# --- SQL used to pull the reference tables out of the Unimod database -------

# One row per specificity entry, joined to its modification, position and
# classification. Only modifications posted by the 'unimod' user or flagged
# as approved are kept.
mod_query = """
SELECT
    s.one_letter,
    p.position,
    m.record_id AS unimod_id,
    m.mono_mass,
    m.full_name,
    m.code_name,
    m.composition,
    c.classification
FROM specificity AS s
JOIN modifications AS m ON s.mod_key = m.record_id
JOIN positions AS p ON s.position_key = p.record_id
JOIN classifications AS c ON s.classifications_key = c.record_id
WHERE
    m.username_of_poster = 'unimod' OR m.approved = 1
"""

# Amino-acid labels plus their per-element count columns (num_H ... num_Se).
# The row whose one-letter code is '-' is excluded.
aa_query = """
SELECT
    a.one_letter,
    a.three_letter,
    a.full_name,
    a.num_H,
    a.num_O,
    a.num_C,
    a.num_N,
    a.num_S,
    a.num_Se

FROM amino_acids AS a
WHERE
    a.one_letter != '-'
"""

# Element symbol, full name and mono_mass for every element in the database.
elements_query = """
SELECT
    e.element,
    e.full_name,
    e.mono_mass
FROM elements AS e
"""

# --- Load the three tables ---------------------------------------------------

# NOTE: the path is relative to the kernel's working directory.
connection = sqlite3.connect('unimod.db')
try:
    mods = pd.read_sql_query(mod_query, connection)
    amino_acids = pd.read_sql_query(aa_query, connection)
    elements = pd.read_sql_query(elements_query, connection)
finally:
    # Close the connection even when one of the queries raises; otherwise the
    # handle stays open in the kernel until it is restarted.
    connection.close()




In [None]:



# Element symbols in the order their masses are stacked into `element_vector`,
# and the matching count columns of `amino_acids` in that same order.
element_symbols = ['H', 'O', 'C', 'N', 'S', 'Se']
element_count_columns = ['num_' + symbol for symbol in element_symbols]

element_vector = elements.set_index('element').loc[element_symbols, 'mono_mass'].values

# mono_mass per row = sum over elements of (count * element mono_mass).
# The count columns are selected by name rather than as "every column that is
# not a label": this keeps the cell re-runnable once `mono_mass` has been added
# to the frame, and removes the silent dependence on the SQL column order.
amino_acids['mono_mass'] = amino_acids[element_count_columns].dot(element_vector).values

# Replace Ile and Leu with Xle: the 'L' row is relabelled to 'J', then the 'I'
# row is dropped ('L' no longer matches anything after the relabel).
amino_acids.loc[amino_acids['one_letter'] == 'L', ['one_letter', 'three_letter', 'full_name']] = ['J', 'Xle', 'Isoleucine/Leucine']
# .copy() so later column assignments act on an independent frame, not a slice.
amino_acids = amino_acids.loc[~amino_acids['one_letter'].isin(['I', 'L'])].copy()

# Pairwise differences: entry [row, col] = mono_mass(row) - mono_mass(col).
aa_vector = amino_acids.set_index('one_letter')['mono_mass']
aa_subs_pairwise = pd.DataFrame(
    aa_vector.values[:, np.newaxis] - aa_vector.values,
    index=aa_vector.index.values,
    columns=aa_vector.index.values,
)

In [None]:

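# NOTE: disabled draft, kept for reference only (nothing below is executed).
# It filters `mods` to the 'AA substitution' classification and derives, for
# each row, the residue named after '->' in `code_name`, mapped from its
# three-letter to its one-letter code.
# TODO before re-enabling: take `.copy()` of the filtered frame before adding
# columns, and note that `str.replace('2', '->')` rewrites every '2' in the name.
# aa_three_to_one_letter = {three_letter : one_letter for three_letter, one_letter in 
#                           zip(amino_acids['three_letter'], amino_acids['one_letter'])}
# aa_subs = mods[mods['classification'] == 'AA substitution']
# aa_subs['code_name'] = aa_subs['code_name'].str.replace('2', '->')
# aa_subs['sub_aa'] = aa_subs['code_name'].str.split(' ').str[0].str.split('->').str[1]
# aa_subs['sub_aa'] = aa_subs['sub_aa'].map(aa_three_to_one_letter)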

In [None]:
def calculate_aa_substitution_matrix(processed_amino_acids_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the square table of pairwise mass differences between amino acids.

    Parameters
    ----------
    processed_amino_acids_df : pd.DataFrame
        Frame with a 'one_letter' column (used as labels) and a 'mono_mass'
        column (used as values). Other columns are ignored.

    Returns
    -------
    pd.DataFrame
        Frame whose rows and columns are both labelled by 'one_letter'.
        Entry [row, col] is mono_mass(row) - mono_mass(col).
    """
    masses = processed_amino_acids_df.set_index('one_letter')['mono_mass']
    labels = masses.index.values
    mass_array = masses.values

    # Column vector minus row vector broadcasts to the full n x n table.
    differences = mass_array[:, None] - mass_array[None, :]

    return pd.DataFrame(differences, index=labels, columns=labels)