In [2]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import json
import glob

In [3]:
# Location of the experiment on disk.
experiment_base_dir = '/data2/experiments'

# The experiment, and the single run within it, analysed by this notebook.
experiment_name = 'P3856'
run_name = 'P3856_YHE211_1_Slot1-1_1_5104'

# Directory for this experiment: <experiment_base_dir>/<experiment_name>
EXPERIMENT_DIR = "{}/{}".format(experiment_base_dir, experiment_name)

In [4]:
# Gather the features detected for the run named by run_name.
# Only this one run is processed here; experiment_features_l is a list so that
# frames for further runs can be appended before the experiment-wide concat.
experiment_features_l = []

print("consolidating the features found in run {}".format(run_name))
features_dir = '{}/features/{}'.format(EXPERIMENT_DIR, run_name)

# consolidate the features found in this run.
# glob returns matches in whatever order the file system yields them, so sort the
# list: the row order of the concatenated frame (and hence the index labels handed
# out when the index is reset later) is then the same on every execution.
run_feature_files = sorted(glob.glob("{}/exp-{}-run-{}-features-precursor-*.pkl".format(features_dir, experiment_name, run_name)))
run_features_l = []
print("found {} feature files for the run {}".format(len(run_feature_files), run_name))
if len(run_feature_files) == 0:
    # pd.concat on an empty list fails with an unhelpful "No objects to concatenate";
    # raise the same exception type with a message that says what is actually missing
    raise ValueError("no feature files found in {} for run {}".format(features_dir, run_name))

# NOTE: unpickling can execute arbitrary code - only load files written by a trusted pipeline
for file in run_feature_files:
    df = pd.read_pickle(file)
    run_features_l.append(df)
# make a single df from the list of dfs
run_features_df = pd.concat(run_features_l, axis=0, sort=False)
# empty the list in place so the per-file frames are no longer referenced from it
del run_features_l[:]

experiment_features_l.append(run_features_df)

consolidating the features found in run P3856_YHE211_1_Slot1-1_1_5104
found 52411 feature files for the run P3856_YHE211_1_Slot1-1_1_5104


In [5]:
# Combine the per-run feature tables into one experiment-wide table and save it.
EXPERIMENT_FEATURES_NAME = '~/{}'.format('experiment-features.pkl')

# Stack every run's frame, then discard the per-file row labels in favour of a
# fresh 0..n-1 index.
experiment_features_df = pd.concat(experiment_features_l, axis=0, sort=False).reset_index(drop=True)

print("saving {} experiment features to {}".format(len(experiment_features_df), EXPERIMENT_FEATURES_NAME))
experiment_features_df.to_pickle(EXPERIMENT_FEATURES_NAME)

saving 234877 experiment features to ~/experiment-features.pkl


In [6]:
# Show which columns the consolidated feature table carries.
experiment_features_df.columns

Index(['monoisotopic_mz', 'charge', 'intensity', 'intensity_full_rt_extent',
       'scan_apex', 'scan_curve_fit', 'scan_lower', 'scan_upper', 'rt_apex',
       'rt_curve_fit', 'rt_lower', 'rt_upper', 'precursor_id', 'envelope',
       'feature_id', 'candidate_phr_error', 'mono_adjusted',
       'original_phr_error', 'original_phr', 'monoisotopic_mass'],
      dtype='object')

In [7]:
# Fraction of all features whose mono_adjusted flag is set (sum of the flags over the row count).
experiment_features_df['mono_adjusted'].sum() / len(experiment_features_df)

0.04048501981888392

In [8]:
# Display three randomly drawn features whose mono_adjusted flag is True.
# No random_state is given, so a different trio is shown on each execution.
experiment_features_df.loc[experiment_features_df['mono_adjusted'] == True].sample(n=3)

Unnamed: 0,monoisotopic_mz,charge,intensity,intensity_full_rt_extent,scan_apex,scan_curve_fit,scan_lower,scan_upper,rt_apex,rt_curve_fit,rt_lower,rt_upper,precursor_id,envelope,feature_id,candidate_phr_error,mono_adjusted,original_phr_error,original_phr,monoisotopic_mass
171429,987.678453,2,1562.420054,488,230.42,False,195.0,270.0,1735.69,False,1725.87,1745.87,13987,"[(987.6854194589077, 265.420054200542), (988.1...",1398704,0.0425128,True,1.03494,2.169492,1973.342306
63064,900.115944,1,2509.833333,1346,365.65,True,323.48,407.82,2285.41,False,2275.76,2296.18,49060,"[(900.1176710083373, 1540.8333333333335), (901...",4906006,-0.00227102,True,-0.359912,0.308108,899.108644
101064,652.829723,2,103644.746722,35610,744.35,True,713.13,775.56,1762.63,True,1758.9,1766.36,15611,"[(652.8376552642403, 42137.746722394884), (653...",1561102,0.0238647,True,0.476262,1.034467,1303.644846


In [9]:
# Lookup tables for peak_ratio(), built once when this cell runs instead of on every call.

# Polynomial coefficients (beta0..beta4), keyed by (number_of_sulphur, peak_number).
_PEAK_RATIO_COEFFICIENTS = {
    # no sulphur atoms
    (0, 1): np.array([-0.00142320578040, 0.53158267080224, 0.00572776591574, -0.00040226083326, -0.00007968737684]),
    (0, 2): np.array([0.06258138406507, 0.24252967352808, 0.01729736525102, -0.00427641490976, 0.00038011211412]),
    (0, 3): np.array([0.03092092306220, 0.22353930450345, -0.02630395501009, 0.00728183023772, -0.00073155573939]),
    (0, 4): np.array([-0.02490747037406, 0.26363266501679, -0.07330346656184, 0.01876886839392, -0.00176688757979]),
    (0, 5): np.array([-0.19423148776489, 0.45952477474223, -0.18163820209523, 0.04173579115885, -0.00355426505742]),
    (0, 6): np.array([0.04574408690798, -0.05092121193598, 0.13874539944789, -0.04344815868749, 0.00449747222180]),
    # one sulphur atom
    (1, 1): np.array([-0.01040584267474, 0.53121149663696, 0.00576913817747, -0.00039325152252, -0.00007954180489]),
    (1, 2): np.array([0.37339166598255, -0.15814640001919, 0.24085046064819, -0.06068695741919, 0.00563606634601]),
    (1, 3): np.array([0.06969331604484, 0.28154425636993, -0.08121643989151, 0.02372741957255, -0.00238998426027]),
    (1, 4): np.array([0.04462649178239, 0.23204790123388, -0.06083969521863, 0.01564282892512, -0.00145145206815]),
    (1, 5): np.array([-0.20727547407753, 0.53536509500863, -0.22521649838170, 0.05180965157326, -0.00439750995163]),
    (1, 6): np.array([0.27169670700251, -0.37192045082925, 0.31939855191976, -0.08668833166842, 0.00822975581940]),
    # two sulphur atoms
    (2, 1): np.array([-0.01937823810470, 0.53084210514216, 0.00580573751882, -0.00038281138203, -0.00007958217070]),
    (2, 2): np.array([0.68496829280011, -0.54558176102022, 0.44926662609767, -0.11154849560657, 0.01023294598884]),
    (2, 3): np.array([0.04215807391059, 0.40434195078925, -0.15884974959493, 0.04319968814535, -0.00413693825139]),
    (2, 4): np.array([0.14015578207913, 0.14407679007180, -0.01310480312503, 0.00362292256563, -0.00034189078786]),
    (2, 5): np.array([-0.02549241716294, 0.32153542852101, -0.11409513283836, 0.02617210469576, -0.00221816103608]),
    (2, 6): np.array([-0.14490868030324, 0.33629928307361, -0.08223564735018, 0.01023410734015, -0.00027717589598]),
}

# Smallest monoisotopic mass (inclusive) for which a ratio is predicted,
# keyed by (number_of_sulphur, peak_number).
_PEAK_RATIO_MIN_MASS = {
    (0, 1): 498, (0, 2): 498, (0, 3): 498, (0, 4): 907, (0, 5): 1219, (0, 6): 1559,
    (1, 1): 530, (1, 2): 530, (1, 3): 530, (1, 4): 939, (1, 5): 1251, (1, 6): 1591,
    (2, 1): 562, (2, 2): 562, (2, 3): 562, (2, 4): 971, (2, 5): 1283, (2, 6): 1623,
}

# Largest monoisotopic mass (inclusive) for which a ratio is predicted, keyed by number_of_sulphur.
_PEAK_RATIO_MAX_MASS = {0: 3915, 1: 3947, 2: 3978}


def peak_ratio(monoisotopic_mass, peak_number, number_of_sulphur):
    """Predict the ratio H(peak_number) / H(peak_number - 1) of consecutive isotope peaks.

    The ratio is a fourth-order polynomial in (monoisotopic_mass / 1000), with one set
    of coefficients per combination of peak number and sulphur count.

    Parameters
    ----------
    monoisotopic_mass : scalar number
        Monoisotopic mass of the molecule.
    peak_number : int
        Isotope peak whose height forms the numerator, 1..6. Peak 0 is the
        monoisotopic peak, so peak_number=1 gives H(1)/H(0).
    number_of_sulphur : int
        Number of sulphur atoms in the molecule; 0, 1 or 2 are supported.

    Returns
    -------
    numpy.float64 or None
        The predicted ratio, or None when peak_number / number_of_sulphur is not one
        of the supported combinations or monoisotopic_mass lies outside the inclusive
        mass range for that combination.
    """
    model_key = (number_of_sulphur, peak_number)

    # an unsupported peak / sulphur combination has no entry in the bounds table
    lowest_mass = _PEAK_RATIO_MIN_MASS.get(model_key)
    if lowest_mass is None:
        return None

    # each model only applies inside its own inclusive mass window
    if not (lowest_mass <= monoisotopic_mass <= _PEAK_RATIO_MAX_MASS[number_of_sulphur]):
        return None

    beta0, beta1, beta2, beta3, beta4 = _PEAK_RATIO_COEFFICIENTS[model_key]
    # the polynomial is evaluated on the mass expressed in thousands
    scaled_m = monoisotopic_mass / 1000.0
    return beta0 + (beta1*scaled_m) + beta2*(scaled_m**2) + beta3*(scaled_m**3) + beta4*(scaled_m**4)


In [10]:
# Inspect one feature by its index label (58961 is a hard-coded example).
# NOTE(review): labels come from the reset index, so which feature this is depends on
# the order in which the per-precursor files were concatenated.
experiment_features_df.loc[58961]

monoisotopic_mz                                                       891.086
charge                                                                      2
intensity                                                             1006.72
intensity_full_rt_extent                                                  832
scan_apex                                                              354.42
scan_curve_fit                                                           True
scan_lower                                                             334.42
scan_upper                                                             374.42
rt_apex                                                                  1873
rt_curve_fit                                                            False
rt_lower                                                                 1863
rt_upper                                                                 1883
precursor_id                                                    

In [11]:
# Show the 'envelope' entry of the same example feature.
# The displayed value is a list of 2-tuples - presumably (m/z, intensity) per isotope peak; TODO confirm.
experiment_features_df.loc[58961].envelope

[(891.0968434946863, 115.15937940761637),
 (891.5809, 108.00),
 (892.0974, 356.00),
 (892.5981, 1.00),
 (893.0855, 426.56)]

In [12]:
# Spot-check the model: predicted H(1)/H(0) for a molecule with no sulphur atoms.
# NOTE(review): 1780.16 presumably is the monoisotopic mass of example feature 58961
# (m/z 891.086, charge 2) - confirm, and consider reading it from the frame instead.
peak_ratio(monoisotopic_mass=1780.16, peak_number=1, number_of_sulphur=0)

0.9599606045173357