In [1]:
import pandas as pd
import numpy as np
from itertools import groupby
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import cluster,mixture
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import scipy
from pandas.core.common import flatten

  import pandas.util.testing as tm


Idea:
blank removal --> noise removal --> find unique/shared clusters --> use information for id --> use information for tracking
```
sudo code:
picking up sources
output labeled table
use the information for the source id

two way:
1. venn diagram --> source id
2. more data: single source approach to identify clusters for different sources and use the modeling approach for source tracking
```

important : tweak the parameters during venn diagram approach

Two ways: use source data + venn diagram, give unique cluster higher score and shared cluster lower score, given a new sample, can predict the source id, or assign with possibility scores, for instance >70% of cluster features present then the shource exists. Hard to do source tracking since matrix effect and dilution effect is not considered

better way with more data: every source with dilution series, and use the single source approach to find clusters, and modeling for the source approportioning



In [2]:
d_ms = pd.read_csv('../example_data/clustering/sample1114.csv')
d_ms = d_ms.rename(columns={'Average Rt(min)': 'Average RT (min)', 'Average Mz': 'Average m/z', 'S/N average': 'Average sn'})
d_ms.insert(3, "Average score", 1)

In [32]:
def data_prep(d_input, blank_keyword, simp_summary = False,svb_thres=10, empty_thres=0,rt_range=[0, 30], mz_range=[0, 1200], sn_thres=3, score_thres=0, area_thres=5000):
    '''
    The function is used to clean the dataframe according to user setting
    blank_keyword: part of string from column that indicates the column is a blank sample
    svb_thres: sample vs blank thres
    empty_thres: empty cell thres in a row
    cv_thres: as all sample is in triplicate, calculate the CV for every triplicate sample set #Needs to be updated in case there is no triplicate samples
    rt_range: rt filter
    mz_range: mz filter
    sn_thres: signal/noise column thres
    score_thres: score column thres
    area_thres: count for max peak area from each row
    '''
    d_thres = d_input[d_input[d_input.columns[4:]].max(1) >= area_thres]
    
    d_thres = d_thres[(d_thres['Average RT (min)'] > rt_range[0]) & (d_thres['Average RT (min)'] < rt_range[1])]
    d_thres = d_thres[(d_thres['Average m/z'] > mz_range[0]) & (d_thres['Average m/z'] < mz_range[1])]
    d_thres = d_thres[d_thres['Average sn'] >= sn_thres]
    d_thres = d_thres[d_thres['Average score'] >= score_thres]
    d_thres.reset_index(inplace=True, drop=True)
    
    col_blank = []
    for key in blank_keyword:
        # Get column name if it contains blank indicating strings
        col_blank.extend([col for col in d_thres.columns if key in col])
        
    col_sample = [col for col in d_thres.columns if col not in col_blank]
    # Sample maximum area vs Blank average area to count for svb
    d_sample = d_thres[d_thres[col_sample[4:]].max(axis=1) / d_thres[col_blank].mean(axis=1) > svb_thres][col_sample] 
    d_sample.reset_index(inplace=True)
    d_sample.drop(columns=['index'],inplace=True)
    
    # Get a list of triplicate, every triplicate is in a sublist
    #Sample: [[a1,a2,a3],[b1,b2,b3]]
    #Note: the triplicate parsing is now only used '_' which needs update in the future
    #d_transpose['dilu_vol'] = d_transpose['dilu_vol'].apply(lambda x : x.replace('-','_')) in case people use '-' for parsing
    trip_list = [list(i) for j, i in groupby(d_sample.columns[4:], lambda a: a.split('_')[:-1])] 
    trip_list = [i for i in trip_list if len(i)>=2] #filter out columns that is not in triplicate -- sample naming issue

    for triplicate in tqdm(trip_list):
        for row in d_sample[triplicate].itertuples(): # Loop for every sets of triplicates
            if row[1:].count(0) > empty_thres:
                d_sample.loc[row.Index, triplicate] = 0 # if more than thres, then set all three values to 0
#             elif np.mean(row[1:]) != 0:
#                 if np.std(row[1:]) / np.mean(row[1:]) > cv_thres:
#                     d_sample.loc[row.Index, triplicate] = 0 #need verify, not work for now
            else:
                pass
            
    d_sample = d_sample[~(d_sample[d_sample.columns[4:]]==0).all(1)] #clean rows with all 0
    if simp_summary == True:
        simp_dict={}
        for i, column in enumerate(trip_list):
            avg = d_sample[column].mean(1)
            cv = d_sample[column].std(1) / d_sample[column].mean(1) #optional display CV
            simp_dict.update({column[0][:-2]:avg, ' CV #' + str(i):cv})
        d_result = pd.DataFrame(simp_dict)
        d_result = pd.concat([d_sample[d_sample.columns[:4]], d_result], axis=1)
    elif simp_summary == False:
        d_result = d_sample.copy()
    
    return d_result

In [178]:
d_sample

Unnamed: 0,Average RT (min),Average m/z,Average sn,Average score,20181114_CoulterCreek_1,20181114_CoulterCreek_2,20181114_CoulterCreek_3,20181114_Crescent-Creek-Jan_1,20181114_Crescent-Creek-Jan_2,20181114_Crescent-Creek-Jan_3,...,20181114_SR520-Creek_Mix6A_3,20181114_SR520-Creek_Mix6B_1,20181114_SR520-Creek_Mix6B_2,20181114_SR520-Creek_Mix6B_3,20181114_SwanCreek-Dec_1,20181114_SwanCreek-Dec_2,20181114_SwanCreek-Dec_3,20181114_SwanCreek-May_1,20181114_SwanCreek-May_2,20181114_SwanCreek-May_3
0,5.696,200.07405,35.03,1,3339,2477,3796,4698,3918,2252,...,3769,5471,5255,5571,4397,5408,5311,3060,2645,2647
1,4.326,200.12823,16.90,1,1791,2769,2955,2544,1265,2225,...,2000,2429,1507,1572,539,2316,2175,3045,1496,802
2,4.220,200.12869,29.97,1,2586,1877,2674,4057,3302,4223,...,2389,2229,2068,4053,1055,2186,1263,1387,2033,2486
3,5.920,200.16484,192.03,1,1403,166,137,679,388,677,...,6572,7230,10364,8871,772,1658,1421,660,474,368
4,5.295,200.20181,23.37,1,0,0,0,0,0,0,...,188,571,232,868,211,141,193,372,96,333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5295,13.722,796.54181,72.80,1,3287,1666,1874,522,877,1666,...,2243,3766,4044,2141,1317,1430,1557,707,1621,770
5296,19.664,796.68024,64.91,1,6486,11286,11809,1949,698,1904,...,2254,3491,1027,1102,1127,527,562,678,1562,1144
5297,18.050,797.51886,17.14,1,2482,1489,756,498,678,466,...,780,278,206,787,832,650,508,0,0,0
5298,5.058,798.47003,24.24,1,457,673,401,245,169,163,...,809,560,859,1091,438,265,89,194,357,178


In [34]:
keys=['CEC','Blank','ISTD','Wash','Shutdown']
d_sample = data_prep(d_ms,keys,rt_range = [1,30], mz_range = [200,800], area_thres=500, simp_summary = False) # The function now only deal with triplicate samples
#Needs to refine towards case that don't have 3 samples

100%|██████████| 35/35 [01:40<00:00,  2.87s/it]


1. grouping
2. noise_rm & filter
3. source ID using avg PAs -- consider the score or other labels in the source label?
4. calc dilution as below -- score?

In [130]:
def source_label(d_input, sourcelist,area_thres=5000, concat = True): #noise removal only based on sourcelist cols
    np.seterr(divide='ignore', invalid='ignore')
    #source labeling
    d_result = d_input.copy()
    source_col=[]
    for s in sourcelist:
        source = [col for col in d_input.columns if s in col]
        source_col.append(source)
    simp_dict={}
    for i, column in enumerate(source_col):
        avg = d_result[column].mean(1)
        cv = d_result[column].std(1) / d_result[column].mean(1) #optional display CV
        cv_nan=np.isnan(cv)
        cv[cv_nan]=0.0 #replace nan with 0
        simp_dict.update({sourcelist[i]:avg, str(sourcelist[i])+' Cv':cv})
    d_summary = pd.DataFrame(simp_dict)
    d_summary['source']="NA"
    for row in d_summary.itertuples():
        sourcelabel = list(d_summary.columns[[col_index for col_index, peak_avg in enumerate(row[1:-1]) if peak_avg >= area_thres]])
        if len(sourcelabel) != 0:
            labelstr = ','.join(sourcelabel)
            d_summary.at[row.Index,'source'] = labelstr
    if concat == True:
        d_concat = pd.concat([d_result, d_summary], axis=1)
    elif concat == False:
        d_concat=d_result.copy()
        d_concat['source'] = d_summary['source']
    
    return d_concat

In [185]:
d_label['source'].dtype == object

True

In [133]:
sourcelist=['Coulter','Crescent','Miller','Swan','SR520-Cal-in-DI_1000mL'] #Needs adjustment
d_label = source_label(d_sample,sourcelist,area_thres= 50000,concat=True)

In [7]:
#User case for coverage score

# algorithm flow:

1. simplify the chart to mean values, exclude CV variant ones
2. using venn diagram idea, label the source according to mean peak area threshold
3. get coverage score using the label information and get the intensity score similarly
```
but consider the matrix effect and unknowns to the sample, validation is required and more dedication & combination is needed
```


In [171]:
def source_report(d_input, source_key, mix_key, method='multiple', pa_thres=10000, CV_thres=2): #source key needs to be the same as source_label above
    #**only take the concat dataframe from labeling function
    #prefilter & dataframe arrangement
    d_mix = d_input[(d_input[[col for col in d_input.columns if 'Cv' in col]] <= CV_thres).all(1)]# all cv should below thres in order to be checked
    d_simp = d_mix[source_key]
    print('Threshold set to', thres)
    mix_col = []
    for key in mix_key:
        mix_col.extend([col for col in d_mix.columns if 'Mix' in col])
    if len(mix_col) == 0:
        print("didn't find mixture by keyword!")
    source_col = []

    d_st = pd.DataFrame(mix_col)

    c_name = ['sample']
    for source in source_key:
        result = []
        for col in mix_col:
            n_feature = sum(d_mix[d_mix[col] >= thres]['source'].str.contains(source))
            cov_score = n_feature / sum(d_mix['source'].str.contains(source))
            if method == 'single':
                mix = d_mix[d_mix['source'] == source][col]
                s_simp = d_simp[d_simp['source'] == source][source]
            elif method == 'multiple':
                mix = d_mix[d_mix['source'].str.contains(source)][col]
                s_simp = d_simp.loc[d_mix[d_mix['source'].str.contains(source)].index][source]
            match_index = [i for i, j in enumerate(mix) if j >= thres]
            dilu = mix.iloc[match_index] / s_simp.iloc[match_index]
            ratio_score = np.average(dilu[dilu<1])
            result.append([n_feature, cov_score, ratio_score])
        d_st = pd.concat([d_st, pd.DataFrame(result)], axis = 1)
        c_name.extend(['n_'+str(source), 'cover_s', 'ratio_s'])
    d_st.columns = c_name
    
    return d_st

In [172]:
d_t = source_report(d_label, ['Coulter','Crescent','Miller','Swan','SR520-Cal-in-DI_1000mL'], ['Mix'], method='multiple', pa_thres=10000, CV_thres=2)

Threshold set to 5000


Further comparison:

ref Kathy's paper, compare the d_st with the cluster result/modeling report, further sort our features that is overlapping or meeting the criteria (table1 from the paper) to generate a detailed source tracking report use both source information and dilution information

visualization based on single sample

chart based on all 

|sample name|feature from source1|coverage score source1|etc2|etc2|
|---|---|---|---|---|
|sample1|1800|0.3|2000|0.5|
|sample2|a|b|c|d|

first --> assign label to different features

second --> ID using the features

third --> assessment, for instance, final ID confidence = 50% feature quantity score + 50% feature intensity score

sample A have 50% of source A feature, avg intensity ratio(5~95%) is 90%, then score = $0.5*0.5(feature #)+0.5*0.9$ (major feature intensity)

for approportioning calculation --> matrix effect needs to be overcome --> more samples and data needed and will be a long term dev & validation process