In [31]:
import pandas as pd
import numpy as np
from itertools import groupby
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import cluster,mixture
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import scipy
from pandas.core.common import flatten

Idea:
blank removal --> noise removal --> find unique/shared clusters --> use information for id --> use information for tracking
```
sudo code:
picking up sources
output labeled table
use the information for the source id

two way:
1. venn diagram --> source id
2. more data: single source approach to identify clusters for different sources and use the modeling approach for source tracking
```

important : tweak the parameters during venn diagram approach

Two ways: use source data + venn diagram, give unique cluster higher score and shared cluster lower score, given a new sample, can predict the source id, or assign with possibility scores, for instance >70% of cluster features present then the shource exists. Hard to do source tracking since matrix effect and dilution effect is not considered

better way with more data: every source with dilution series, and use the single source approach to find clusters, and modeling for the source approportioning



In [2]:
d_ms = pd.read_csv('../example_data/clustering/sample1114.csv')
d_ms = d_ms.rename(columns={'Average Rt(min)': 'Average RT (min)', 'Average Mz': 'Average m/z', 'S/N average': 'Average sn'})
d_ms.insert(3, "Average score", 1)

In [81]:
def data_prep(d_input, blank_keyword, svb_thres=10, empty_thres=0, cv_thres=3,rt_range=[0, 30], mz_range=[0, 1200], sn_thres=3, score_thres=0, area_thres=5000):
    '''
    The function is used to clean the dataframe according to user setting
    blank_keyword: part of string from column that indicates the column is a blank sample
    svb_thres: sample vs blank thres
    empty_thres: empty cell thres in a row
    cv_thres: as all sample is in triplicate, calculate the CV for every triplicate sample set #Needs to be updated in case there is no triplicate samples
    rt_range: rt filter
    mz_range: mz filter
    sn_thres: signal/noise column thres
    score_thres: score column thres
    area_thres: count for max peak area from each row
    '''
    d_thres = d_input[d_input[d_input.columns[4:]].max(1) >= area_thres]
    
    d_thres = d_thres[(d_thres['Average RT (min)'] > rt_range[0]) & (d_thres['Average RT (min)'] < rt_range[1])]
    d_thres = d_thres[(d_thres['Average m/z'] > mz_range[0]) & (d_thres['Average m/z'] < mz_range[1])]
    d_thres = d_thres[d_thres['Average sn'] >= sn_thres]
    d_thres = d_thres[d_thres['Average score'] >= score_thres]
    d_thres.reset_index(inplace=True, drop=True)
    
    col_blank = []
    for key in blank_keyword:
        # Get column name if it contains blank indicating strings
        col_blank.extend([col for col in d_thres.columns if key in col])
        
    col_sample = [col for col in d_thres.columns if col not in col_blank]
    # Sample maximum area vs Blank average area to count for svb
    d_sample = d_thres[d_thres[col_sample[4:]].max(axis=1) / d_thres[col_blank].mean(axis=1) > svb_thres][col_sample] 
    d_sample.reset_index(inplace=True)
    d_sample.drop(columns=['index'],inplace=True)
    
    # Get a list of triplicate, every triplicate is in a sublist
    #Sample: [[a1,a2,a3],[b1,b2,b3]]
    #Note: the triplicate parsing is now only used '_' which needs update in the future
    trip_list = [list(i) for j, i in groupby(d_sample.columns[4:], lambda a: a.split('_')[:-1])] 
    trip_list = [i for i in trip_list if len(i)>=2] #filter out columns that is not in triplicate -- sample naming issue

    for triplicate in tqdm(trip_list):
        # DM: maybe use iterrtuples? iterrows has low efficiency and is not reccomended 
        for row in d_sample[triplicate].itertuples(): # Loop for every sets of triplicates
            if row[1:].count(0) > empty_thres:
                d_sample.loc[row.Index, triplicate] = 0 # if more than thres, then set all three values to 0
            elif np.mean(row[1:]) != 0:
                if np.std(row[1:]) / np.mean(row[1:]) > cv_thres:
                    d_sample.loc[row.Index, triplicate] = 0 #need verify, not work for now
            else:
                pass
            
    d_sample = d_sample[~(d_sample[d_sample.columns[4:]]==0).all(1)] #clean rows with all 0
    
    return d_sample

In [62]:
keys=['CEC','Blank','ISTD','Wash','Shutdown']
d_sample = data_prep(d_ms,keys,rt_range = [1,30], mz_range = [200,800], area_thres=500)

100%|██████████| 35/35 [02:04<00:00,  3.57s/it]


1. grouping
2. noise_rm & filter
3. source ID using avg PAs -- consider the score or other labels in the source label?
4. calc dilution as below -- score?

In [133]:
def source_label(d_input, sourcelist,area_thres=5000): #noise removal only based on sourcelist cols
    np.seterr(divide='ignore', invalid='ignore')
    #source labeling
    d_result = d_input.copy()
    source_col=[]
    for s in sourcelist:
        source = [col for col in d_input.columns if s in col]
        source_col.append(source)
    simp_dict={}
    for i, column in enumerate(source_col):
        avg = d_result[column].mean(1)
        cv = d_result[column].std(1) / d_result[column].mean(1) #optional display CV
        simp_dict.update({sourcelist[i]:avg})
    d_summary = pd.DataFrame(simp_dict)
    d_summary['source']="NA"
    for row in d_summary.itertuples():
        sourcelabel = list(d_summary.columns[[col_index for col_index, peak_avg in enumerate(row[1:-1]) if peak_avg >= area_thres]])
        if len(sourcelabel) != 0:
            labelstr = ','.join(sourcelabel)
            d_summary.at[row.Index,'source'] = labelstr
    d_result['source'] = d_summary['source']
    
    return d_result, d_summary

In [134]:
sourcelist=['Coulter','Crescent','Miller','Swan','SR520-Cal-in-DI_1000mL']
d_label, d_simp = source_label(d_sample,sourcelist,area_thres=5000)

In [None]:
#User case for coverage score

In [147]:
sum(d_label[d_label['20181114_SR520-Creek_Mix6B_1'] >= 1000]['source'].str.contains('SR520')) / sum(d_label['source'].str.contains('SR520'))

0.5393298059964726

In [218]:
#for 2. complicated: consider other source impact
# s_simp = d_simp.loc[d_label[d_label['source'].str.contains('SR520')].index]['SR520-Cal-in-DI_1000mL']
# mix = d_label[d_label['source'].str.contains('SR520')]['20181114_SR520-Creek_Mix1_1']
#1. simple way: only use 520 specific source
mix = d_label[d_label['source'] == 'SR520-Cal-in-DI_1000mL']['20181114_SR520-Creek_Mix6A_1']
s_simp = d_simp[d_simp['source'] == 'SR520-Cal-in-DI_1000mL']['SR520-Cal-in-DI_1000mL']
match_index = [i for i, j in enumerate(mix) if j >= 1000]
dilu = mix.iloc[match_index] / s_simp.iloc[match_index]
np.average(dilu[dilu<1]) #Calculate the dilution rate prediction

0.10336347135237217

In [247]:
mix_key = 'Mix'
thres = 1000
source_key = 'SR520-Cal-in-DI_1000mL'
method='multiple'
result = []
mix_col = [col for col in d_label.columns if mix_key in col]
for col in mix_col:
    cov_score = sum(d_label[d_label[col] >= thres]['source'].str.contains(source_key)) / sum(d_label['source'].str.contains(source_key))
    if method == 'single':
        mix = d_label[d_label['source'] == source_key][col]
        s_simp = d_simp[d_simp['source'] == source_key][source_key]
    elif method == 'multiple':
        mix = d_label[d_label['source'].str.contains(source_key)][col]
        s_simp = d_simp.loc[d_label[d_label['source'].str.contains(source_key)].index][source_key]
    match_index = [i for i, j in enumerate(mix) if j >= thres]
    dilu = mix.iloc[match_index] / s_simp.iloc[match_index]
    ratio_score = np.average(dilu[dilu<1])
    result.append([col, cov_score, ratio_score])
    d_st = pd.DataFrame(result, columns = ['sample', 'cov_score', 'ratio_score'])

In [246]:
d_st #wrap it into function

Unnamed: 0,sample,cov_score,ratio_score
0,20181114_SR520-Creek_Mix1_1,0.941799,0.696022
1,20181114_SR520-Creek_Mix1_2,0.941799,0.707796
2,20181114_SR520-Creek_Mix1_3,0.941446,0.715812
3,20181114_SR520-Creek_Mix2_1,0.928042,0.717283
4,20181114_SR520-Creek_Mix2_2,0.928042,0.704341
5,20181114_SR520-Creek_Mix2_3,0.929101,0.709678
6,20181114_SR520-Creek_Mix3_1,0.868783,0.521747
7,20181114_SR520-Creek_Mix3_2,0.870194,0.517127
8,20181114_SR520-Creek_Mix3_3,0.867019,0.520016
9,20181114_SR520-Creek_Mix4A_1,0.798589,0.369325


first --> assign label to different features

second --> ID using the features

third --> assessment, for instance, final ID confidence = 50% feature quantity score + 50% feature intensity score

sample A have 50% of source A feature, avg intensity ratio(5~95%) is 90%, then score = $0.5*0.5(feature #)+0.5*0.9$ (major feature intensity)

for approportioning calculation --> matrix effect needs to be overcome --> more samples and data needed and will be a long term dev & validation process

sample output for score assignment using label information:

|sample name|coverage score| intensity score | final score|
|---|---|---|---|
|sample1|0.5|0.3|0.4|
|sample2|a|b|c|

|sample name|coverage score1(5-25%)|coverage score2(25-50%)|coverage score3(50-75%)|
|---|---|---|---|
|sample1|0.5|0.3|0.4|
|sample2|a|b|c|