In [27]:
import sqlite3
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import peakutils

In [28]:
# Retention-time (RT) range, in seconds, held in the converted database
RT_LIMIT_LOWER, RT_LIMIT_UPPER = 4340, 4580
# Assumption about the base peak width along the RT axis
RT_BASE_PEAK_WIDTH_SECS = 30.0
# Window used for constraining RT to focus on the fragmentation event
RT_FRAGMENT_EVENT_DELTA_SECS = 3.5
# Collision energy that identifies ms1 frames; frames with any other value are treated as ms2
MS1_CE = 10

In [29]:
# Directory for this RT window; its name embeds the lower and upper RT limits
BASE_NAME = "/Users/darylwilding-mcbride/Downloads/HeLa_20KInt-rt-{}-{}-denoised".format(RT_LIMIT_LOWER,RT_LIMIT_UPPER)
# Directory holding the MaxQuant txt results
# NOTE(review): these are absolute, machine-specific paths - consider a configurable data directory
BASE_MAXQUANT_TXT_DIR = '/Users/darylwilding-mcbride/Downloads/maxquant_results/txt'
# Files inside the MaxQuant txt directory
ALLPEPTIDES_FILENAME = '{}/allPeptides.txt'.format(BASE_MAXQUANT_TXT_DIR)
PASEF_MSMS_SCANS_FILENAME = '{}/pasefMsmsScans.txt'.format(BASE_MAXQUANT_TXT_DIR)
# SQLite database that the queries in this notebook read from
CONVERTED_DATABASE_NAME = '{}/HeLa_20KInt.sqlite'.format(BASE_NAME)

In [30]:
# Load frame ids and retention times for the frames inside the RT window, split by
# collision energy: frames at MS1_CE are ms1, frames at any other value are ms2.
# The values are bound as query parameters rather than formatted into the SQL text,
# and the connection is closed even if one of the queries raises.
db_conn = sqlite3.connect(CONVERTED_DATABASE_NAME)
try:
    ms1_frame_properties_df = pd.read_sql_query(
        "select frame_id,retention_time_secs from frame_properties where retention_time_secs >= ? and retention_time_secs <= ? and collision_energy == ?",
        db_conn,
        params=(RT_LIMIT_LOWER, RT_LIMIT_UPPER, MS1_CE),
    )
    ms2_frame_properties_df = pd.read_sql_query(
        "select frame_id,retention_time_secs from frame_properties where retention_time_secs >= ? and retention_time_secs <= ? and collision_energy <> ?",
        db_conn,
        params=(RT_LIMIT_LOWER, RT_LIMIT_UPPER, MS1_CE),
    )
finally:
    db_conn.close()

In [31]:
# Number of ms2 frames found inside the RT window
ms2_frame_properties_df.shape[0]

2032

In [32]:
# Peek at the ids of the first five ms2 frames
ms2_frame_properties_df['frame_id'].head(5).tolist()

[40408, 40409, 40410, 40412, 40413]

In [33]:
# m/z limits applied to the ms2 raw points (also the extent of the m/z bins)
MS2_MZ_MIN, MS2_MZ_MAX = 90, 1750

In [34]:
# Ids of every ms2 frame in the RT window, as a tuple
ms2_frame_ids = tuple(ms2_frame_properties_df['frame_id'])

create the bins from the range of m/z

In [139]:
# Width of each m/z bin
MS2_MZ_BIN_WIDTH = 1e-3

# Bin edges starting at MS2_MZ_MIN. np.arange excludes its stop value, so the stop is
# pushed out by one bin width to accommodate the maximum m/z value.
ms2_bins = np.arange(MS2_MZ_MIN, MS2_MZ_MAX + MS2_MZ_BIN_WIDTH, MS2_MZ_BIN_WIDTH)

# Number of entries in ms2_bins
MS2_MZ_BIN_COUNT = ms2_bins.size
MS2_MZ_BIN_COUNT

1660001

In [184]:
# Get the raw points (intensity > 0, m/z within the ms2 limits) for all the ms2 frames.
# The IN list is rendered explicitly: formatting a Python tuple into the SQL text breaks
# for a single frame id, because "(id,)" is not valid SQL. The ids are cast to int and
# inlined rather than bound, since SQLite caps the number of bound parameters per statement.
ms2_frame_id_list = ",".join(str(int(frame_id)) for frame_id in ms2_frame_ids)
db_conn = sqlite3.connect(CONVERTED_DATABASE_NAME)
try:
    ms2_raw_points_df = pd.read_sql_query(
        "select frame_id,mz,scan,intensity from frames where frame_id in ({}) and mz >= ? and mz <= ? and intensity > 0".format(ms2_frame_id_list),
        db_conn,
        params=(MS2_MZ_MIN, MS2_MZ_MAX),
    )
finally:
    # release the connection even if the query fails
    db_conn.close()


In [185]:
# Tag every raw point with the index np.digitize gives its m/z against the bin edges
ms2_raw_points_df['bin_idx'] = np.digitize(ms2_raw_points_df['mz'], bins=ms2_bins).astype(int)

In [186]:
# Preview the first five binned raw points
ms2_raw_points_df.head(n=5)

Unnamed: 0,frame_id,mz,scan,intensity,bin_idx
0,40408,401.182366,112,24,311183
1,40408,404.143928,112,58,314144
2,40408,773.38251,112,22,683383
3,40408,854.732578,112,61,764733
4,40408,855.368861,112,63,765369


In [187]:
# Persist the binned ms2 raw points so they can be reloaded later
ms2_raw_points_df.to_pickle(path='/Users/darylwilding-mcbride/Downloads/binned_ms2_df.pkl')

for a selection of ms2 frames, combine the raw points and centroid them

In [188]:
# Use the first five ms2 frames as the selection to combine
ms2_frame_ids_to_combine = ms2_frame_properties_df['frame_id'].head(5).tolist()
ms2_frame_ids_to_combine

[40408, 40409, 40410, 40412, 40413]

In [189]:
# Keep only the raw points that belong to the selected frames
ms2_frames_to_combine_df = ms2_raw_points_df.loc[ms2_raw_points_df['frame_id'].isin(ms2_frame_ids_to_combine)]

In [190]:
def calc_centroid(bin_df):
    """Summarise the raw points that share one m/z bin.

    Args:
        bin_df: DataFrame of raw points for a single bin; must provide the
            columns bin_idx, mz and intensity.

    Returns:
        A pandas Series indexed by bin_idx, mz_centroid, summed_intensity and
        point_count, where bin_idx is taken from the first row, mz_centroid is
        the result of peakutils.centroid on the m/z values and intensities,
        summed_intensity is the total intensity and point_count is the number
        of rows in bin_df.
    """
    summary_fields = ['bin_idx', 'mz_centroid', 'summed_intensity', 'point_count']
    summary_values = [
        bin_df.iloc[0].bin_idx,  # every row of the group carries the same bin index
        peakutils.centroid(bin_df.mz, bin_df.intensity),
        bin_df.intensity.sum(),
        len(bin_df),
    ]
    return pd.Series(summary_values, index=summary_fields)

In [146]:
# Collapse each m/z bin to a single summary row, then cast the integer-valued columns to int
combined_ms2_df = (
    ms2_frames_to_combine_df
    .groupby(['bin_idx'], as_index=False)
    .apply(calc_centroid)
    .astype({'summed_intensity': int, 'bin_idx': int, 'point_count': int})
)

In [147]:
# Show the bins that collected more than 10 raw points
combined_ms2_df.loc[combined_ms2_df['point_count'] > 10]

Unnamed: 0,bin_idx,mz_centroid,summed_intensity,point_count
1030,241122,331.1218,1209,12
1031,241125,331.124669,1352,14
2384,350215,440.214516,611,11
3260,401228,491.227304,849,14
5389,507333,597.332233,570,11
8067,638362,728.361125,456,11
10887,776533,866.532859,636,12
11657,822466,912.465115,676,13
12526,876537,966.536887,514,11
12645,887973,977.972874,1486,11


In [192]:
# Inspect the raw points that landed in one particular bin, ordered by frame
ms2_frames_to_combine_df.loc[ms2_frames_to_combine_df['bin_idx'] == 964515].sort_values(by='frame_id')

Unnamed: 0,frame_id,mz,scan,intensity,bin_idx
406,40408,1054.514483,264,52,964515
445,40408,1054.514483,267,88,964515
466,40408,1054.514483,268,45,964515
480,40408,1054.514483,269,226,964515
522,40408,1054.514483,272,50,964515
553,40408,1054.514483,274,31,964515
586,40408,1054.514483,276,36,964515
678,40408,1054.514483,284,148,964515
6797,40409,1054.514483,276,142,964515
6771,40409,1054.514483,274,126,964515


For small bin widths, the m/z values within a bin are identical, so there is no point in centroiding them. However, we want to be able to vary the bin width, so we keep the approach general.

In [181]:
# Look up the bin index that a single m/z value maps to
np.digitize(x=1054.5201, bins=ms2_bins).astype(int)

array(964521)