In [1]:
import pandas as pd
import numpy as np
import sklearn
import glob
from pathlib import Path 
import sys
sys.path.append('../mss')
import mssmain as mss
import peakutils
from scipy.integrate import simps
from ast import literal_eval
import scipy
from tqdm import tqdm

In [2]:
#Read the dataset
path = '../example_data/peakdata/labelled_output/'
all_files = glob.glob(path + "/*.csv")

In [3]:
for i in range(len(all_files)):
    if i == 0:
        df = pd.read_csv(all_files[i])
        df['source'] = all_files[i]
    else:
        df_else = pd.read_csv(all_files[i])
        df_else['source'] = all_files[i]
        df = df.append(df_else, ignore_index = True)

In [4]:
#reshape data
df.columns = ['index', 'mz', 'i array', 'label', 'source']
df = pd.DataFrame(df, columns = ['mz', 'i array', 'label', 'source', 'index'])
df.head()

Unnamed: 0,mz,i array,label,source,index
0,593.411581,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,../example_data/peakdata/labelled_output\100ba...,680
1,196.99735,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,../example_data/peakdata/labelled_output\100ba...,60
2,327.00653,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,../example_data/peakdata/labelled_output\100ba...,280
3,420.97569,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,../example_data/peakdata/labelled_output\100ba...,488
4,483.345794,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,../example_data/peakdata/labelled_output\100ba...,568


# RT conversion rate should be incorporated if any width parameters included

In [5]:
df_relabel = df[(df['label'] != 3) & (df['label'] != 2) & (df['label'] != 1)] #df for mislabelled peaks

In [6]:
df_model = df.drop(df_relabel.index) #data for modeling, now have ~3500 rows of data

In [7]:
rt_conversion_rate = 0.005533333

In [87]:
def peak_para(intensity, rt_conversion_rate, peak_thres = 0.01, thr = 0.02, min_d = 1, rt_window = 1.5, peak_area_thres = 1e5, min_scan = 15, max_scan = 200, max_peak = 5, min_scan_window = 20, sn_range = 7):
    '''
    firstly get rt, intensity from given mz and error out of the mzml file
    Then find peak on the intensity array, represent as index --> index
    Find peak range by looping from peak index forward/backward until hit the peak_base --> l_range,h_range. peakspan = h_range - l_range
    Trim/correct peak range is too small or too large, using min_scan/max_scan,min_scan_window --> trimed l/h_range
    Integration of peak based on the given range using simp function --> peakarea
    '''
    
    #Get rt_window corresponded scan number -- needs update later
    
    #Get peak index
    indexes = peakutils.indexes(intensity, thres=thr, min_dist = min_d)
    
    result_dict = {}
    
    
    #dev note: boundary detection refinement
    for index in indexes:
        h_range = index
        l_range = index
        base_intensity = peak_thres * intensity[index] # use relative thres, also considering S/N, 1/2 rt point?
        half_intensity = 0.5 * intensity[index]

        #Get the higher and lower boundary
        while intensity[h_range] >= base_intensity:
            h_range += 1
            if intensity[h_range-1] < half_intensity: #potentially record this
                if h_range - index > 4: #fit r2 score, keep record https://stackoverflow.com/questions/55649356/how-can-i-detect-if-trend-is-increasing-or-decreasing-in-time-series as alternative
                    x = np.linspace(h_range - 2, h_range, 3)
                    y = intensity[h_range - 2 : h_range + 1]
                    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x, y)
#                     print(rt[h_range],r_value)
                    if abs(r_value) < 0.6:
                        break
                    elif h_range > len(intensity)-2: 
                        break
        while intensity[l_range] >= base_intensity: #Dev part 2, low priority since general peak shapes
            l_range -= 1
            if intensity[l_range] < half_intensity:
                pass #backdoor for recording 1/2 rt point
        #Output a range from the peak list
    
        peak_range = intensity[l_range:h_range]#no filter so ignored for tailing effects
        #print(index + scan_window)
                
        #Calculate for S/N
        signal = intensity[index]
        neighbour_blank = intensity[l_range - sn_range : l_range] + intensity[h_range + 1 : h_range + sn_range + 1]
        noise = max(neighbour_blank)
        if noise != 0:
            sn = round(signal/noise, 3)
        else:
            sn = 0
        
        #Calculate height/width, consider log10 transform
        height = signal
        width = (h_range - l_range) * rt_conversion_rate
        #Add rt conversion factor here to convert width in scan into rt
        hw_ratio = round(height/width,0)
        
        #------------------------------------------------new-------------------------------------------
        #Additional global parameters
        #1/2 peak range
        h_loc = index
        l_loc = index
        while intensity[h_loc] > half_intensity:
            h_loc += 1
        while intensity[l_loc] > half_intensity:
            l_loc -= 1
        #calculate for slope -- interpolation included-- pay attention!
        h_half = h_loc + (half_intensity - intensity[h_loc]) / (intensity[h_loc - 1] - intensity[h_loc])
        l_half = l_loc + (half_intensity - intensity[l_loc]) / (intensity[l_loc + 1] - intensity[l_loc])
        mb = (height - half_intensity) / ((h_half - index) * rt_conversion_rate) #when transfer back use rt[index] instead
        ma = (height - half_intensity) / ((index - l_half) * rt_conversion_rate)
        #------------------------------------------------new-------------------------------------------


        #Intergration based on the simps function
        if len(peak_range) >= min_scan:
            integration_result = simps(peak_range)
            if integration_result >= peak_area_thres:
                #Calculate Area/background ratio, i.e, peak area vs rectangular area as whole(if =1 then peak is a pleateu)
                
                background_area = (h_range - l_range) * height
                ab_ratio = round(integration_result/background_area, 3)
                
                #appending to result
                if len(result_dict) == 0:
                    result_dict.update({index : [l_range, h_range, integration_result, sn, hw_ratio, ab_ratio, h_half, l_half, height, ma, mb, ma+mb, mb/ma]})
                elif integration_result != list(result_dict.values())[-1][2]: #Compare with previous item
                    s_window = abs(index - list(result_dict.keys())[-1])
                    if s_window > min_scan_window:
                        result_dict.update({index : [l_range, h_range, integration_result, sn, hw_ratio, ab_ratio, h_half, l_half, height, ma, mb, ma+mb, mb/ma]})

                    
        #Filtering:
        #1. delete results that l_range/h_range within 5 scans
        #3. If still >5 then select top 5 results
        #list(result_dict.values())[-1]
    
    #Noise filter
    if len(result_dict) > max_peak:
        result_dict = {}
        


    return result_dict

In [84]:
test_array = literal_eval(df_model.iloc[1200]['i array'])

In [85]:
para = peak_para(test_array, rt_conversion_rate)

In [86]:
para

{2911: [2903,
  2934,
  235822.75048828122,
  0,
  63463.0,
  0.699,
  2929.1921090546475,
  2904.0186584388252,
  10886.1015625,
  140901.8491314241,
  54072.011795481514,
  194973.86092690562,
  0.3837565804054612]}

In [88]:
df_para = pd.DataFrame(columns = ['mz',
                        'i array',
                        'peak width in min',
                        'half intensity width in min',
                        'left width' ,
                        'right width',
                        'assymetric factor',
                       'integration',
                       'sn',
                       'hw',
                       'ab',
                        'peak height',
                        'ma',
                        'mb',
                        'broad rate',
                        'skewness',
                        'variance',
                       'label'])

In [89]:
for i, row in tqdm(df_model.iterrows()):
    try:
        i_array = literal_eval(row['i array'])
        para = peak_para(i_array, rt_conversion_rate)

        for i in para.items():
            index = i[0]
            l_range = i[1][0]
            h_range = i[1][1]
            integration = i[1][2]
            sn = i[1][3]
            hw = i[1][4]
            ab = i[1][5]
            h_half = i[1][6]
            l_half = i[1][7]
            height = i[1][8]
            ma = i[1][9]
            mb = i[1][10]
            broad_rate = i[1][11]
            skewness = i[1][12]
            
            w = (h_range - l_range) * rt_conversion_rate
            l_width = (index - l_range) * rt_conversion_rate
            r_width = (h_range - index) * rt_conversion_rate
            t_r = (h_half - l_half) * rt_conversion_rate

            paradict = {'mz' : row['mz'],
                        'i array' : row['i array'],
                        'peak width in min' : w,
                        'half intensity width in min' : t_r,
                        'left width' : l_width,
                        'right width' : r_width,
                        'assymetric factor' : ((h_range - index) * rt_conversion_rate) / ((index - l_range) * rt_conversion_rate),
                       'integration' : integration,
                       'sn' : sn,
                       'hw' : hw,
                       'ab' : ab,
                        'peak height' : height,
                        'ma' : ma,
                        'mb' : mb,
                        'broad rate' : broad_rate,
                        'skewness' : skewness,
                        'variance' : w ** 2 / (1.764 * ((r_width / l_width) ** 2) - 11.15 * (r_width / l_width) + 28),
                       'label': row['label']}
            df_para = df_para.append(paradict, ignore_index = True)
    except:
        continue

3555it [03:07, 18.93it/s]


In [91]:
df_para.head()

Unnamed: 0,mz,i array,peak width in min,half intensity width in min,left width,right width,assymetric factor,integration,sn,hw,ab,peak height,ma,mb,broad rate,skewness,variance,label
0,196.99735,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.204733,0.20457,0.005533,0.1992,36.0,296175.783203,0.892,36466.0,1.072,7465.887207,1349257.0,18497.942028,1367755.0,0.01371,2.2e-05,3
1,196.99735,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.1162,0.116738,0.011067,0.105133,9.5,140804.324056,0.915,67609.0,0.853,7856.220703,476408.6,36206.063658,512614.6,0.075998,0.000166,3
2,420.97569,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.188133,0.129128,0.0498,0.138333,2.777778,697342.793864,0.0,195821.0,0.557,36840.394531,495241.0,200363.699725,695604.7,0.404578,0.003327,1
3,483.345794,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.143867,0.144161,0.022133,0.121733,5.5,179794.688558,0.0,59442.0,0.809,8551.689453,236320.8,33917.169198,270238.0,0.143522,0.001033,1
4,701.404588,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.1826,0.182849,0.011067,0.171533,15.5,201843.804688,1.212,39443.0,0.849,7202.227539,486407.0,20525.565199,506932.6,0.042198,0.00012,1


In [92]:
df_para.to_csv('../example_data/peakdata/labelled_output/summary-3rdedit.csv')