In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib
import os
%matplotlib inline

In [2]:
def stack(batch_path):
    all_samples = []
    num_files = len([f for f in os.listdir(batch_path)if os.path.isfile(os.path.join(batch_path, f))])
    for files in tqdm(range(num_files)):
        sample = os.listdir(batch_path)[files]
        sample_df = pd.read_csv(batch_path + sample, sep="\\t", usecols=['RT (min)', 'Precursor m/z', 'S/N'], engine='python')
        # Prefilter
        #sample_df = sample_df.drop(sample_df[(sample_df['Area'] > 5000) & (sample_df['RT (min)'] < 1.5) & (sample_df['S/N'] > 5)].index)
        all_samples += [sample_df]
    total_samples = pd.concat(all_samples)
    all_samples.clear()
    return total_samples

In [14]:
stack('/mnt/c/Users/Derek/Desktop/mass-suite/example_data/batch/')

100%|██████████| 3/3 [00:00<00:00, 78.10it/s]


Unnamed: 0,RT (min),Precursor m/z,S/N
0,0.578250,179.0358,37.786630
1,0.583783,201.0726,12.048300
2,0.583783,217.0489,13.964120
3,0.589317,142.0350,16.470440
4,0.589317,158.0119,28.776020
...,...,...,...
1178,21.128430,333.9733,52.373000
1179,21.133970,181.9790,25.339680
1180,21.156100,204.9958,15.779640
1181,21.189300,157.9763,21.389860


In [14]:
def realignment(batch_path, batch_name):
    """This function works by using one .txt file as a reference in which
    other files realigned to in terms of precursor and RT. """
    RT_error = 0.10 # units of minutes, can be adjusted
    alignment_df = pd.DataFrame()
    standard_sample = os.listdir(batch_path)[0] # first sample
    # reads .txt file into a dataframe
    standard_df = pd.read_csv(batch_path + standard_sample, sep="\\t", usecols=['RT (min)', 'Precursor m/z'], engine='python')
    #ref_df = ref_df.drop(ref_df[(ref_df['Area'] > 5000) & (ref_df['RT (min)'] < 1.5) & (ref_df['S/N'] > 5)].index)
    alignment_df['RT (min)'] = standard_df.iloc[:,0]
    alignment_df['Precursor m/z'] = standard_df.iloc[:,1]
    alignment_df['Sum RT (min)'] = 0.0
    alignment_df['Sum Precursor m/z'] = 0.0
    mz_error = 0.00002*standard_df.iloc[:,1] # 20 ppm
    col_index = 4
    total_samp = stack(batch_path)
    num_files = len([f for f in os.listdir(batch_path)if os.path.isfile(os.path.join(batch_path, f))]) 
    for files in range(num_files):
        sample = os.listdir(batch_path)[files]
        alignment_df[sample] = 0
    for row in tqdm(range(len(total_samp))):
        if row < range(len(total_samp))[-1]:
            if total_samp.index[row] < total_samp.index[row + 1]:
                overlap = np.where(((alignment_df.iloc[:, 0] - RT_error) <= total_samp.iloc[row, 0]) &\
                                   (total_samp.iloc[row, 0] <= (alignment_df.iloc[:,0] + RT_error))\
                                     & ((alignment_df.iloc[:, 1] - mz_error) <= total_samp.iloc[row, 1])\
                                     & (total_samp.iloc[row, 1] <= (alignment_df.iloc[:, 1] + mz_error)))
                if overlap[0].size:
                    overlap_val = pd.DataFrame(overlap)
                    # sets intensity value into slot where realignment was identified
                    alignment_df.at[overlap_val.iloc[0,0], alignment_df.columns[col_index]] = total_samp.iloc[row, 2]
                    # adds RT time to aligned point
                    alignment_df.at[overlap_val.iloc[0,0], 'Sum RT (min)'] += total_samp.iloc[row,0]
                    # adds m/z value to aligned point
                    alignment_df.at[overlap_val.iloc[0,0], 'Sum Precursor m/z'] += total_samp.iloc[row,1]
                else:
                    # appends as a new reference if nothing is found
                    alignment_df.loc[alignment_df.index.max() + 1] = total_samp.iloc[row, 0:2]
                    alignment_df.at[alignment_df.index[-1], alignment_df.columns[col_index]] = total_samp.iloc[row, 2]
                    alignment_df.at[alignment_df.index[-1], 'Sum RT (min)'] = total_samp.iloc[row,0]
                    alignment_df.at[alignment_df.index[-1], 'Sum Precursor m/z'] = total_samp.iloc[row,1]
            else:
                col_index += 1
                overlap = np.where(((alignment_df.iloc[:, 0] - RT_error) <= total_samp.iloc[row, 0]) &\
                                   (total_samp.iloc[row, 0] <= (alignment_df.iloc[:,0] + RT_error))\
                                     & ((alignment_df.iloc[:, 1] - mz_error) <= total_samp.iloc[row, 1])\
                                     & (total_samp.iloc[row, 1] <= (alignment_df.iloc[:, 1] + mz_error)))
                if overlap[0].size:
                    overlap_val = pd.DataFrame(overlap)
                    # sets intensity value into slot where realignment was identified
                    alignment_df.at[overlap_val.iloc[0,0], alignment_df.columns[col_index]] = total_samp.iloc[row, 2]
                    # adds RT time to aligned point
                    alignment_df.at[overlap_val.iloc[0,0], 'Sum RT (min)'] += total_samp.iloc[row,0]
                    # adds m/z value to aligned point
                    alignment_df.at[overlap_val.iloc[0,0], 'Sum Precursor m/z'] += total_samp.iloc[row,1]
                else:
                    # appends as a new reference if nothing is found
                    alignment_df.loc[alignment_df.index.max() + 1] = total_samp.iloc[row, 0:2]
                    alignment_df.at[alignment_df.index[-1], alignment_df.columns[col_index]] = total_samp.iloc[row, 2]
                    alignment_df.at[alignment_df.index[-1], 'Sum RT (min)'] = total_samp.iloc[row,0]
                    alignment_df.at[alignment_df.index[-1], 'Sum Precursor m/z'] = total_samp.iloc[row,1]
        else:
            overlap = np.where(((alignment_df.iloc[:, 0] - RT_error) <= total_samp.iloc[row, 0]) &\
                                (total_samp.iloc[row, 0] <= (alignment_df.iloc[:,0] + RT_error))\
                                 & ((alignment_df.iloc[:, 1] - mz_error) <= total_samp.iloc[row, 1])\
                                 & (total_samp.iloc[row, 1] <= (alignment_df.iloc[:, 1] + mz_error)))
            if overlap[0].size:
                overlap_val = pd.DataFrame(overlap)
                # sets intensity value into slot where realignment was identified
                alignment_df.at[overlap_val.iloc[0,0], alignment_df.columns[col_index]] = total_samp.iloc[row, 2]
                # adds RT time to aligned point
                alignment_df.at[overlap_val.iloc[0,0], 'Sum RT (min)'] += total_samp.iloc[row,0]
                # adds m/z value to aligned point
                alignment_df.at[overlap_val.iloc[0,0], 'Sum Precursor m/z'] += total_samp.iloc[row,1]
            else:
                # appends as a new reference if nothing is found
                alignment_df.loc[alignment_df.index.max() + 1] = total_samp.iloc[row, 0:2]
                alignment_df.at[alignment_df.index[-1], alignment_df.columns[col_index]] = total_samp.iloc[row, 2]
                alignment_df.at[alignment_df.index[-1], 'Sum RT (min)'] = total_samp.iloc[row,0]
                alignment_df.at[alignment_df.index[-1], 'Sum Precursor m/z'] = total_samp.iloc[row,1]
    alignment_df.rename(columns = {'RT (min)':'Average RT (min)'}, inplace = True)
    alignment_df.rename(columns = {'Precursor m/z':'Average m/z'}, inplace = True)
    # Replace all NaN elements with 0
    alignment_df = alignment_df.fillna(0)
    for rows in range(len(alignment_df)):
        # Calculating the averages after the iterations
        # requires count of nonzero count to calculate the mean properly
        count = np.count_nonzero(alignment_df.iloc[rows, 4:])
        if count > 0:
            alignment_df.at[rows, 'Average RT (min)'] = alignment_df.loc[rows, 'Sum RT (min)']/count
            alignment_df.at[rows, 'Average m/z'] = alignment_df.loc[rows, 'Sum Precursor m/z']/count
        else:
            pass
    # Drop columns to collect sums for averaging",
    alignment_df = alignment_df.drop(columns=['Sum RT (min)', 'Sum Precursor m/z'])
    # Final sort by m/z,
    #alignment_df = alignment_df.sort_values(by='Average m/z', ignore_index=True)
    # converts file back to .txt
    alignment_df.to_csv(batch_name + '.txt', header=True, index=False)
    return alignment_df

In [15]:
%time realignment('/mnt/c/Users/Derek/Desktop/mass-suite/example_data/batch/', 'test_batch')

100%|██████████| 3/3 [00:00<00:00, 42.50it/s]
100%|██████████| 3541/3541 [00:14<00:00, 249.67it/s]


CPU times: user 14.2 s, sys: 172 ms, total: 14.3 s
Wall time: 14.7 s


Unnamed: 0,Average RT (min),Average m/z,20180815_CEC_CAL-8-no4_MSpos_1.txt,20180815_CEC_CAL-8-no4_MSpos_2.txt,20180815_CEC_CAL-8-no4_MSpos_3.txt
0,0.578250,179.035800,37.0,0.0,0.000000
1,0.582611,201.072600,12.0,13.0,11.585760
2,0.582611,217.049333,13.0,16.0,16.083630
3,0.588144,142.034533,16.0,19.0,23.501240
4,0.588144,158.012300,28.0,27.0,32.548000
...,...,...,...,...,...
1596,21.111830,390.913600,0.0,0.0,33.860340
1597,21.117370,201.006900,0.0,0.0,29.371630
1598,21.133970,181.979000,0.0,0.0,25.339680
1599,21.189300,157.976300,0.0,0.0,21.389860


In [17]:
%time realignment('/mnt/c/Users/Derek/Desktop/0815/0815/', 'test_batch2')

100%|██████████| 124/124 [00:03<00:00, 40.20it/s]
100%|██████████| 244935/244935 [9:03:49<00:00,  7.51it/s]       


CPU times: user 1h 59min 27s, sys: 5h 25min 21s, total: 7h 24min 49s
Wall time: 9h 4min 46s


Unnamed: 0,Average RT (min),Average m/z,20180815_CEC_CAL-8-no4_MSpos_1.txt,20180815_CEC_CAL-8-no4_MSpos_2.txt,20180815_CEC_CAL-8-no4_MSpos_3.txt,20180815_CEC_CAL-8-no4_MSpos_4.txt,20180815_CEC_CAL-8-no4_MSpos_5.txt,20180815_CEC_CAL-8-no4_MSpos_6.txt,20180815_CEC_CAL-8-no4_MSpos_7.txt,20180815_CEC_CAL-8-no4_MSpos_8.txt,...,20180815_SR520-Creek_Mix6A_3.txt,20180815_SR520-Creek_Mix6B_1.txt,20180815_SR520-Creek_Mix6B_2.txt,20180815_SR520-Creek_Mix6B_3.txt,20180815_Swan-Creek-Dec_1.txt,20180815_Swan-Creek-Dec_2.txt,20180815_Swan-Creek-Dec_3.txt,20180815_Swan-Creek-May_1.txt,20180815_Swan-Creek-May_2.txt,20180815_Swan-Creek-May_3.txt
0,0.648777,179.034960,37.0,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000
1,0.596311,201.071778,12.0,13.0,11.58576,11.49537,11.68135,12.12624,11.71651,13.08869,...,82.01872,61.19651,56.43658,60.34138,58.99921,68.57910,61.25859,47.73723,47.61060,51.172450
2,0.604112,217.049855,13.0,16.0,16.08363,17.77179,16.21303,20.89342,17.78909,19.31178,...,0.00000,18.17052,18.55908,18.17823,0.00000,44.61441,38.35946,0.00000,0.00000,0.000000
3,0.600863,142.034773,16.0,19.0,23.50124,23.26580,22.86015,20.46871,19.71240,17.78686,...,160.14880,121.50730,118.69080,139.11850,118.11610,125.56250,110.97360,84.20597,91.81557,84.451320
4,0.602397,158.012529,28.0,27.0,32.54800,35.09987,32.81023,33.34668,33.66407,34.24351,...,0.00000,89.81661,94.32565,103.93650,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174286,21.118580,159.971500,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,26.607880
174287,21.124120,331.974200,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,79.357480
174288,21.135180,181.980500,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,24.837140
174289,21.162850,165.966600,0.0,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,19.675960
