In [1]:
import pandas as pd
import sqlite3
import numpy as np

In [2]:
# define a straight line to exclude the charge-1 cloud
def scan_coords_for_single_charge_region(mz_lower, mz_upper):
    """Evaluate the charge-1 exclusion line at both ends of an m/z window.

    The line is scan = 1252 - 1.2 * m/z; each value is truncated to an integer
    and clamped so it never drops below scan 0.

    Args:
        mz_lower: lower m/z bound of the window.
        mz_upper: upper m/z bound of the window.

    Returns:
        dict with keys 'scan_for_mz_lower' and 'scan_for_mz_upper', holding the
        line's scan value at the respective m/z bound.
    """
    def _scan_on_line(mz):
        # int() truncates toward zero; negative results are clamped to 0
        scan = int(-1 * ((1.2 * mz) - 1252))
        return scan if scan > 0 else 0

    return {
        'scan_for_mz_lower': _scan_on_line(mz_lower),
        'scan_for_mz_upper': _scan_on_line(mz_upper),
    }

In [3]:
# which experiment and run to analyse
experiment_name = 'P3856'
run_name = 'P3856_YHE211_1_Slot1-1_1_5104'

# base directory of the experiment, and the converted database for this run inside it
EXPERIMENT_DIR = f'/media/big-ssd/experiments/{experiment_name}'
CONVERTED_DATABASE_NAME = f"{EXPERIMENT_DIR}/converted-databases/exp-{experiment_name}-run-{run_name}-converted.sqlite"

In [4]:
# m/z padding added above the segment's upper bound when loading and binning
SEGMENT_EXTENSION = 2.0
# frame_type value used to select MS1 frames
FRAME_TYPE_MS1 = 0

# retention-time window (compared against retention_time_secs)
rt_lower, rt_upper = 1650, 2200
# m/z window of the segment
segment_mz_lower, segment_mz_upper = 1020, 1040

# lowest scan to load: the charge-1 exclusion line evaluated at the segment's upper m/z
scan_limit = scan_coords_for_single_charge_region(
    mz_lower=segment_mz_lower,
    mz_upper=segment_mz_upper,
)['scan_for_mz_upper']

In [5]:
# voxel edge length along each dimension (used as the bin width for that dimension)
VOXEL_SIZE_RT = 5      # retention time, in the units of retention_time_secs
VOXEL_SIZE_SCAN = 10   # scan
VOXEL_SIZE_MZ = 0.1    # m/z

In [6]:
# load the raw points for this m/z segment
# - the filter values are bound as query parameters instead of being formatted into the SQL text
# - the connection is closed in a finally block so it is released even if the query fails
# NOTE(review): the uint16 dtypes cap frame_id, scan and intensity at 65535 - confirm the data never exceeds that
db_conn = sqlite3.connect(CONVERTED_DATABASE_NAME)
try:
    raw_df = pd.read_sql_query(
        "select frame_id,mz,scan,intensity,retention_time_secs "
        "from frames "
        "where frame_type == ? "
        "and retention_time_secs >= ? and retention_time_secs <= ? "
        "and scan >= ? "
        "and mz >= ? and mz <= ?",
        db_conn,
        params=(
            FRAME_TYPE_MS1,
            rt_lower,
            rt_upper,
            scan_limit,
            segment_mz_lower,
            segment_mz_upper + SEGMENT_EXTENSION,  # upper m/z bound includes the segment extension
        ),
        dtype={'frame_id':np.uint16,'mz':np.float32,'scan':np.uint16,'intensity':np.uint16,'retention_time_secs':np.float32},
    )
finally:
    db_conn.close()

In [7]:
# report the column dtypes and memory footprint of the loaded points
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127942 entries, 0 to 10127941
Data columns (total 5 columns):
 #   Column               Dtype  
---  ------               -----  
 0   frame_id             uint16 
 1   mz                   float32
 2   scan                 uint16 
 3   intensity            uint16 
 4   retention_time_secs  float32
dtypes: float32(2), uint16(3)
memory usage: 135.2 MB


In [8]:
# give every raw point a unique identifier: its position in a fresh 0..n-1 index
raw_df.index = pd.RangeIndex(len(raw_df))  # just in case the index is not already the default one
raw_df['point_id'] = raw_df.index

In [9]:
# define left-closed voxel bins along each dimension; each upper end is padded by one
# voxel width beyond the largest value to be binned
rt_bins = pd.interval_range(
    start=raw_df['retention_time_secs'].min(),
    end=raw_df['retention_time_secs'].max() + VOXEL_SIZE_RT,
    freq=VOXEL_SIZE_RT,
    closed='left',
)
scan_bins = pd.interval_range(
    start=raw_df['scan'].min(),
    end=raw_df['scan'].max() + VOXEL_SIZE_SCAN,
    freq=VOXEL_SIZE_SCAN,
    closed='left',
)
# the m/z bins span the configured segment (plus its extension) rather than the observed range
mz_bins = pd.interval_range(
    start=segment_mz_lower,
    end=segment_mz_upper + SEGMENT_EXTENSION + VOXEL_SIZE_MZ,
    freq=VOXEL_SIZE_MZ,
    closed='left',
)

In [10]:
%%timeit
# assign raw points to their bins
raw_df['rt_bin'] = pd.cut(raw_df.retention_time_secs, bins=rt_bins)
raw_df['scan_bin'] = pd.cut(raw_df.scan, bins=scan_bins)
raw_df['mz_bin'] = pd.cut(raw_df.mz, bins=mz_bins)
raw_df['bin_key'] = list(zip(raw_df.mz_bin, raw_df.scan_bin, raw_df.rt_bin))

50.8 s ± 1.01 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# show every row that has a null in any column
# (presumably a sanity check that each point was assigned to a bin in all three dimensions)
raw_df[raw_df.isnull().any(axis=1)]

Unnamed: 0,frame_id,mz,scan,intensity,retention_time_secs,point_id,rt_bin,scan_bin,mz_bin,bin_key
