In [3]:
import glob
import os
import pickle
import sqlite3
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [85]:
# Index of the run (file) to analyse. It selects the per-file trained model pickles and
# the matching row of the file-idx-to-run-name mapping table used further down.
FILE_IDX_FOR_ANALYSIS = 0

In [4]:
# Path of a pickled DataFrame; the .head() output in this notebook shows its columns
# as sequence, charge, number_of_files and file_idxs.
# NOTE(review): absolute, user-specific path - adjust when running on another machine.
SEQUENCE_FILE_COUNTS_FILE_NAME = '/Users/darylwilding-mcbride/Downloads/sequence-charge-file-counts-df.pkl'

In [5]:
# Load the sequence/charge file-count table from its pickle file.
sequence_file_counts_df = pd.read_pickle(SEQUENCE_FILE_COUNTS_FILE_NAME)

In [6]:
sequence_file_counts_df.head()

Unnamed: 0,sequence,charge,number_of_files,file_idxs
0,AAAAAAAAAPAAAATAPTTAATTAATAAQ,2,7,"[0, 2, 5, 7, 11, 15, 16]"
1,AAAAAAAAAPAAAATAPTTAATTAATAAQ,3,3,"[10, 12, 17]"
2,AAAAAAAAVPSAGPAGPAPTSAAGR,2,20,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
3,AAAAALSQQQSLQER,2,3,"[12, 17, 18]"
4,AAAAATVVPPMVGGPPFVGPVGFGPGDR,3,13,"[1, 3, 5, 6, 7, 9, 10, 12, 14, 16, 17, 18, 19]"


In [7]:
# Path of a pickled DataFrame of per-sequence/charge attributes; the .head() output in this
# notebook shows its columns as sequence, charge, theoretical_mz, experiment_scan,
# experiment_rt and experiment_intensity.
# NOTE(review): absolute, user-specific path - adjust when running on another machine.
EXPERIMENT_SEQUENCE_ATTRIBS_FILE_NAME = '/Users/darylwilding-mcbride/Downloads/experiments/190719_Hela_Ecoli/training-sets/experiment-sequence-charge-attribs-df.pkl'

In [8]:
# Load the sequence/charge attributes table; it supplies the model input features below.
experiment_sequence_attribs_df = pd.read_pickle(EXPERIMENT_SEQUENCE_ATTRIBS_FILE_NAME)

In [9]:
experiment_sequence_attribs_df.head()

Unnamed: 0,sequence,charge,theoretical_mz,experiment_scan,experiment_rt,experiment_intensity
0,AAAAAAAAAPAAAATAPTTAATTAATAAQ,2,1184.1049,73.911301,602.012681,108754.444444
1,AAAAAAAAAPAAAATAPTTAATTAATAAQ,3,789.7394,335.868469,609.86414,29377.333333
2,AAAAAAAAVPSAGPAGPAPTSAAGR,2,1016.0286,160.291329,489.973155,58456.622222
3,AAAAALSQQQSLQER,2,785.9075,496.361511,375.039574,57092.25
4,AAAAATVVPPMVGGPPFVGPVGFGPGDR,3,864.1171,707.53758,1039.861047,50988.518519


In [86]:
# Paths of the m/z, scan and RT models trained for the run selected by FILE_IDX_FOR_ANALYSIS.
# The models are used below to estimate where AAAAATVVPPMVGGPPFVGPVGFGPGDR, charge 3,
# should appear in the raw data of that run.
RUN_MZ_MODEL_FILE_NAME = '/Users/darylwilding-mcbride/Downloads/experiments/190719_Hela_Ecoli/trained-models/file-{}-mz-model.pkl'.format(FILE_IDX_FOR_ANALYSIS)
RUN_SCAN_MODEL_FILE_NAME = '/Users/darylwilding-mcbride/Downloads/experiments/190719_Hela_Ecoli/trained-models/file-{}-scan-model.pkl'.format(FILE_IDX_FOR_ANALYSIS)
RUN_RT_MODEL_FILE_NAME = '/Users/darylwilding-mcbride/Downloads/experiments/190719_Hela_Ecoli/trained-models/file-{}-rt-model.pkl'.format(FILE_IDX_FOR_ANALYSIS)

In [87]:
# Load the three per-run estimators (m/z, scan, RT) from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load model
# files that come from a trusted source.
with open(RUN_MZ_MODEL_FILE_NAME, 'rb') as file:
    mz_estimator = pickle.load(file)
with open(RUN_SCAN_MODEL_FILE_NAME, 'rb') as file:
    scan_estimator = pickle.load(file)
with open(RUN_RT_MODEL_FILE_NAME, 'rb') as file:
    rt_estimator = pickle.load(file)

In [70]:
# gather the sequence attributes to plug into each model
# Select the row(s) of the attributes table for the target sequence and charge state.
sequence_df = experiment_sequence_attribs_df[(experiment_sequence_attribs_df.sequence == 'AAAAATVVPPMVGGPPFVGPVGFGPGDR') & (experiment_sequence_attribs_df.charge == 3)]
# Feature columns handed to every estimator, in this order.
# NOTE(review): presumably the same column order the models were trained with - confirm.
sequence_estimation_attribs_df = sequence_df[['theoretical_mz','experiment_rt','experiment_scan','experiment_intensity']]
# 2-D array (one row per selected sequence/charge), the form passed to predict() below.
sequence_estimation_attribs = sequence_estimation_attribs_df.values

In [71]:
sequence_estimation_attribs

array([[  864.1171    ,  1039.86104703,   707.53758032, 50988.51851852]])

In [95]:
# estimate the raw monoisotopic m/z
# The m/z model's prediction is treated as a delta that is subtracted from the
# theoretical m/z of the sequence.
mz_delta_estimated = mz_estimator.predict(sequence_estimation_attribs)
# The theoretical m/z is read from sequence_df, the frame selected for the target
# sequence/charge; no frame called sequence_attribs_df is defined in this notebook, so
# referring to it only worked with leftover kernel state.
estimated_monoisotopic_mz = sequence_df.iloc[0].theoretical_mz - mz_delta_estimated[0]
estimated_monoisotopic_mz

864.4431811441947

In [96]:
# estimate the raw monoisotopic scan
# The scan model's prediction is used directly as the scan apex (unlike the m/z model,
# whose prediction is treated as a delta from the theoretical m/z).
scan_estimated = scan_estimator.predict(sequence_estimation_attribs)
# take the prediction for the first row of the attribute array
estimated_scan_apex = scan_estimated[0]
estimated_scan_apex

707.0833914753989

In [97]:
# estimate the raw monoisotopic RT
# The RT model's prediction is used directly as the retention-time apex.
rt_estimated = rt_estimator.predict(sequence_estimation_attribs)
# take the prediction for the first row of the attribute array
estimated_rt_apex = rt_estimated[0]
estimated_rt_apex

1038.4501532750114

Look in the raw data for a feature at these coordinates

In [81]:
# Half-width of the m/z window used for intensity descent: raw points within
# +/- this value of the most intense remaining point are merged into one peak.
MS1_PEAK_DELTA = 0.1

# Mass difference between Carbon-12 and Carbon-13 isotopes, in Da. For calculating the spacing between isotopic peaks.
CARBON_MASS_DIFFERENCE = 1.003355

# The collision energy that identifies ms1 frames; the frame_properties query below
# keeps only frames whose collision_energy equals this value.
MS1_COLLISION_ENERGY = 10

In [62]:
def ms1_intensity_descent(ms1_peaks_a, peak_delta=None):
    """Collapse raw ms1 points into peaks by intensity descent.

    Repeatedly takes the most intense remaining point, gathers every remaining point
    within +/- peak_delta m/z of it, records the intensity-weighted m/z centroid and
    the summed intensity of that group, and removes the group before continuing.

    Parameters
    ----------
    ms1_peaks_a : numpy array of [mz, intensity] rows
    peak_delta : float, optional
        Half-width of the m/z window around each apex. When omitted, the
        notebook-level MS1_PEAK_DELTA is used.

    Returns
    -------
    numpy array of [mz_centroid, summed_intensity] rows, ordered by non-increasing
    apex intensity. An empty array is returned when there are no input points.
    """
    if peak_delta is None:
        peak_delta = MS1_PEAK_DELTA
    ms1_peaks_a = np.asarray(ms1_peaks_a)

    ms1_peaks_l = []
    while len(ms1_peaks_a) > 0:
        # find the most intense point
        max_intensity_index = np.argmax(ms1_peaks_a[:,1])
        peak_mz = ms1_peaks_a[max_intensity_index,0]

        # get all the raw points within this m/z region
        in_window = (ms1_peaks_a[:,0] >= peak_mz - peak_delta) & (ms1_peaks_a[:,0] <= peak_mz + peak_delta)
        # Always consume the apex point itself. If its m/z is NaN the comparisons above
        # select nothing, and without this the loop would never shrink the array.
        in_window[max_intensity_index] = True

        mzs = ms1_peaks_a[in_window,0]
        intensities = ms1_peaks_a[in_window,1]
        summed_intensity = intensities.sum()
        # Intensity-weighted centroid, computed inline because no mz_centroid helper is
        # defined in this notebook.
        # NOTE(review): confirm this matches the project's mz_centroid definition.
        if summed_intensity > 0:
            mz_cent = (mzs * intensities).sum() / summed_intensity
        else:
            # no usable weights; fall back to the plain mean of the m/z values
            mz_cent = mzs.mean()
        ms1_peaks_l.append((mz_cent, summed_intensity))

        # remove the raw points assigned to this peak
        ms1_peaks_a = ms1_peaks_a[~in_window]
    return np.array(ms1_peaks_l)


In [78]:
# Number of isotope spacings the m/z window extends on each side of the estimated
# monoisotopic m/z.
NUMBER_OF_ISOTOPES_RIGHT = 8     # the number of isotopes to look for in the m/z dimension
NUMBER_OF_ISOTOPES_LEFT = 1      # spacings to include below the monoisotopic m/z

# Spacing between adjacent isotope peaks in m/z: the C12/C13 mass difference divided by
# the charge of the selected sequence.
expected_spacing_mz = CARBON_MASS_DIFFERENCE / sequence_df.iloc[0].charge

# m/z extents of the window above and below the monoisotopic m/z
MZ_WIDTH_RIGHT_FROM_MONOISOTOPIC = NUMBER_OF_ISOTOPES_RIGHT * expected_spacing_mz
MZ_WIDTH_LEFT_FROM_MONOISOTOPIC = NUMBER_OF_ISOTOPES_LEFT * expected_spacing_mz

# tolerances for looking either side of the scan and RT apex
RT_WIDTH = 15.0       # from the standard deviation of 13 from the model against the test data
SCAN_WIDTH = 10.0     # from the standard deviation of 8 from the model against the test data

In [98]:
# Bounds of the region of interest around the estimated feature apex.
# The m/z window extends MZ_WIDTH_LEFT_FROM_MONOISOTOPIC below and
# MZ_WIDTH_RIGHT_FROM_MONOISOTOPIC above the estimated monoisotopic m/z; the scan and
# RT windows are symmetric around their estimated apexes.
mz_lower, mz_upper = (
    estimated_monoisotopic_mz - MZ_WIDTH_LEFT_FROM_MONOISOTOPIC,
    estimated_monoisotopic_mz + MZ_WIDTH_RIGHT_FROM_MONOISOTOPIC,
)
scan_lower, scan_upper = (
    estimated_scan_apex - SCAN_WIDTH,
    estimated_scan_apex + SCAN_WIDTH,
)
rt_lower, rt_upper = (
    estimated_rt_apex - RT_WIDTH,
    estimated_rt_apex + RT_WIDTH,
)

In [83]:
# Load the table that maps each file idx to its run name; it is used below to find the
# converted database for the run selected by FILE_IDX_FOR_ANALYSIS.
MAPPING_FILE_NAME = '/Users/darylwilding-mcbride/Downloads/percolator_mapping_df.pkl'
mapping_df = pd.read_pickle(MAPPING_FILE_NAME)

In [90]:
# Run name of the first mapping row whose file_idx equals FILE_IDX_FOR_ANALYSIS.
# NOTE(review): .iloc[0] raises IndexError if no row matches the file idx.
run_name = mapping_df[mapping_df.file_idx == FILE_IDX_FOR_ANALYSIS].iloc[0].run_name
run_name

'190719_Hela_Ecoli_1to3_06-recalibrated'

In [91]:
# Derive the converted database file name by replacing '-recalibrated' in the run name
# with '-converted.sqlite'.
db_name = run_name.replace('-recalibrated', '-converted.sqlite')

In [92]:
# Full path of the converted SQLite database for the selected run.
CONVERTED_DATABASE_NAME = '/Users/darylwilding-mcbride/Downloads/experiments/190719_Hela_Ecoli/converted-databases/{}'.format(db_name)

In [93]:
CONVERTED_DATABASE_NAME

'/Users/darylwilding-mcbride/Downloads/experiments/190719_Hela_Ecoli/converted-databases/190719_Hela_Ecoli_1to3_06-converted.sqlite'

In [None]:
# Find the ms1 frames (collision_energy equal to MS1_COLLISION_ENERGY) whose retention
# time falls inside the RT window of interest, ordered by retention time.
db_conn = sqlite3.connect(CONVERTED_DATABASE_NAME)
try:
    # Bind the values as query parameters rather than formatting them into the SQL text.
    # float() ensures sqlite3 receives plain Python floats whatever numeric type the
    # estimators returned.
    ms1_frame_properties_df = pd.read_sql_query(
        "select frame_id,retention_time_secs from frame_properties where retention_time_secs >= ? and retention_time_secs <= ? and collision_energy == ? order by retention_time_secs",
        db_conn,
        params=(float(rt_lower), float(rt_upper), MS1_COLLISION_ENERGY),
    )
finally:
    # release the connection even if the query fails
    db_conn.close()
ms1_frame_ids = tuple(ms1_frame_properties_df.frame_id)

In [None]:
# extract the raw data within this area of interest
# The frame ids are rendered explicitly as a comma-separated list of integers.
# Formatting the Python tuple directly produces "(123,)" when there is exactly one
# frame, which is not valid SQL. An empty list gives "in ()", which matches no rows.
frame_id_list = ','.join(str(int(frame_id)) for frame_id in ms1_frame_ids)
db_conn = sqlite3.connect(CONVERTED_DATABASE_NAME)
try:
    # The window bounds are bound as query parameters; float() ensures sqlite3 receives
    # plain Python floats whatever numeric type the estimators returned.
    ms1_raw_points_df = pd.read_sql_query(
        "select frame_id,mz,scan,intensity,retention_time_secs from frames where mz >= ? and mz <= ? and scan >= ? and scan <= ? and retention_time_secs >= ? and retention_time_secs <= ? and frame_id in ({})".format(frame_id_list),
        db_conn,
        params=(float(mz_lower), float(mz_upper), float(scan_lower), float(scan_upper), float(rt_lower), float(rt_upper)),
    )
finally:
    # release the connection even if the query fails
    db_conn.close()

In [None]:
# Collapse the raw points of the region of interest into peaks by intensity descent.
# reshape(-1, 2) keeps the two-column layout even when no peaks were found.
ms1_peaks_a = ms1_intensity_descent(ms1_raw_points_df[['mz','intensity']].values)
ms1_peaks_df = pd.DataFrame(ms1_peaks_a.reshape(-1, 2), columns=['mz_centroid','summed_intensity'])

# Stem plot of the summed intensity of each peak against its m/z centroid.
f, ax = plt.subplots()
colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']
markerline, stemlines, baseline = ax.stem(ms1_peaks_df.mz_centroid, ms1_peaks_df.summed_intensity, 'g')
plt.setp(markerline, 'color', colors[2])
plt.setp(stemlines, 'color', colors[2])
plt.setp(baseline, 'color', colors[7])
plt.xlabel('binned m/z')
plt.ylabel('intensity')
f.set_figheight(5)
f.set_figwidth(15)
plt.margins(0.06)
# The titles describe the sequence, run and window widths used in this notebook.
plt.suptitle('Raw points centroided and summed by intensity descent for {}, charge {} (file idx {})'.format(sequence_df.iloc[0].sequence, sequence_df.iloc[0].charge, FILE_IDX_FOR_ANALYSIS))
plt.title('intensity descent width +/- {} Th, RT +/- {} secs, scan +/- {}'.format(MS1_PEAK_DELTA, RT_WIDTH, SCAN_WIDTH))
plt.show()