In [1]:
import pandas as pd
import numpy as np
from itertools import groupby
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import cluster,mixture
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import scipy

  import pandas.util.testing as tm


Idea:
blank removal --> noise removal --> find unique/shared clusters --> use information for id --> use information for tracking
```
sudo code:
picking up sources
output labeled table
use the information for the source id

two way:
1. venn diagram --> source id
2. more data: single source approach to identify clusters for different sources and use the modeling approach for source tracking
```

important : tweak the parameters during venn diagram approach

Two ways: use source data + venn diagram, give unique cluster higher score and shared cluster lower score, given a new sample, can predict the source id, or assign with possibility scores, for instance >70% of cluster features present then the shource exists. Hard to do source tracking since matrix effect and dilution effect is not considered

better way with more data: every source with dilution series, and use the single source approach to find clusters, and modeling for the source approportioning



In [2]:
d_ms = pd.read_csv('../example_data/clustering/sample1114.csv')
d_ms = d_ms.rename(columns={'Average Rt(min)': 'Average RT (min)', 'Average Mz': 'Average m/z', 'S/N average': 'Average sn'})
d_ms.insert(3, "Average score", 1)

In [3]:
def data_prep(d_input, blank_keyword, svb_thres=10, empty_thres=0, cv_thres=5,rt_range=[0, 30], mz_range=[0, 1200], sn_thres=3, score_thres=0, area_thres=5000):
    '''
    The function is used to clean the dataframe according to user setting
    blank_keyword: part of string from column that indicates the column is a blank sample
    svb_thres: sample vs blank thres
    empty_thres: empty cell thres in a row
    cv_thres: as all sample is in triplicate, calculate the CV for every triplicate sample set #Needs to be updated in case there is no triplicate samples
    rt_range: rt filter
    mz_range: mz filter
    sn_thres: signal/noise column thres
    score_thres: score column thres
    area_thres: count for max peak area from each row
    '''
    d_thres = d_input[d_input[d_input.columns[4:]].max(1) >= area_thres]
    
    d_thres = d_thres[(d_thres['Average RT (min)'] > rt_range[0]) & (d_thres['Average RT (min)'] < rt_range[1])]
    d_thres = d_thres[(d_thres['Average m/z'] > mz_range[0]) & (d_thres['Average m/z'] < mz_range[1])]
    d_thres = d_thres[d_thres['Average sn'] >= sn_thres]
    d_thres = d_thres[d_thres['Average score'] >= score_thres]
    d_thres.reset_index(inplace=True, drop=True)
    
    col_blank = []
    for key in blank_keyword:
        # Get column name if it contains blank indicating strings
        col_blank.extend([col for col in d_thres.columns if key in col])
        
    col_sample = [col for col in d_thres.columns if col not in col_blank]
    # Sample maximum area vs Blank average area to count for svb
    d_sample = d_thres[d_thres[col_sample[4:]].max(axis=1) / d_thres[col_blank].mean(axis=1) > svb_thres][col_sample] 
    d_sample.reset_index(inplace=True)
    d_sample.drop(columns=['index'],inplace=True)
    
    # Get a list of triplicate, every triplicate is in a sublist
    #Sample: [[a1,a2,a3],[b1,b2,b3]]
    #Note: the triplicate parsing is now only used '_' which needs update in the future
    trip_list = [list(i) for j, i in groupby(d_sample.columns[4:], lambda a: a.split('_')[:-1])] 
    trip_list = [i for i in trip_list if len(i)>=2] #filter out columns that is not in triplicate -- sample naming issue

    for triplicate in tqdm(trip_list):
        # DM: maybe use iterrtuples? iterrows has low efficiency and is not reccomended 
        for row in d_sample[triplicate].itertuples(): # Loop for every sets of triplicates
            if row[1:].count(0) > empty_thres:
                d_sample.loc[row.Index, triplicate] = 0 # if more than thres, then set all three values to 0
            elif np.mean(row[1:]) != 0:
                if np.std(row[1:]) / np.mean(row[1:]) > cv_thres:
                    d_sample.loc[row.Index, triplicate] = 0 #If delete or reduce all number to avg?
            else:
                pass
            
    d_sample = d_sample[~(d_sample[d_sample.columns[4:]]==0).all(1)] #clean rows with all 0
    
    return d_sample

In [4]:
keys=['CEC','Blank','ISTD','Wash','Shutdown']
d_sample = data_prep(d_ms,keys,rt_range = [1,30], mz_range = [200,800], area_thres=500)

100%|██████████| 10/10 [02:25<00:00, 14.57s/it]


In [13]:
def noise_rm(d_input, normalization='linear',eps=0.8,min_samples=10):
    
    c_data = d_input.iloc[:,4:].values
    c_norm = []
    #Performs normalization
    np.seterr(divide='ignore', invalid='ignore')
    for row in c_data:
        if normalization == 'linear':
            c_norm.append(row/max(row))
        elif normalization == 'zscore':
            c_norm.append((row-np.mean(row))/np.std(row))
        elif normalization == 'log':
            row[row==0]=1
            c_norm.append(np.log10(row)/np.log10(max(row)))
    #Clean up dataframe
    c_norm = np.asarray(c_norm)
    d_norm = pd.DataFrame(c_norm)
    d_norm['index']=d_sample.index
    d_norm.set_index('index',inplace=True)
    d_norm.dropna(how='all',inplace=True)

    X = d_norm.copy()
    dbscan = cluster.DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    labels = dbscan.labels_
    samp_index = np.argwhere(labels!=-1).reshape(1,-1)[0]
    d_samp = d_input.iloc[samp_index]
    
    return d_samp

In [14]:
d_f = noise_rm(d_sample)

In [21]:
d_f

Unnamed: 0,Average RT (min),Average m/z,Average sn,Average score,20181114_CoulterCreek_1,20181114_CoulterCreek_2,20181114_CoulterCreek_3,20181114_Crescent-Creek-Jan_1,20181114_Crescent-Creek-Jan_2,20181114_Crescent-Creek-Jan_3,...,20181114_SR520-Creek_Mix6B_1,20181114_SR520-Creek_Mix6B_2,20181114_SR520-Creek_Mix6B_3,20181114_SwanCreek-Dec_1,20181114_SwanCreek-Dec_2,20181114_SwanCreek-Dec_3,20181114_SwanCreek-May_1,20181114_SwanCreek-May_2,20181114_SwanCreek-May_3,source
0,5.696,200.07405,35.03,1,3339,2477,3796,4698,3918,2252,...,5471,5255,5571,4397,5408,5311,3060,2645,2647,"Miller,Swan,SR520-Cal"
1,4.326,200.12823,16.90,1,1791,2769,2955,2544,1265,2225,...,2429,1507,1572,539,2316,2175,3045,1496,802,"Crescent,Miller,SR520-Cal"
2,4.220,200.12869,29.97,1,2586,1877,2674,4057,3302,4223,...,2229,2068,4053,1055,2186,1263,1387,2033,2486,"Miller,SR520-Cal"
3,5.920,200.16484,192.03,1,1403,166,137,679,388,677,...,7230,10364,8871,772,1658,1421,660,474,368,"Miller,SR520-Cal"
4,5.295,200.20181,23.37,1,0,0,0,0,0,0,...,571,232,868,211,141,193,372,96,333,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4862,12.912,702.49805,32.09,1,90,136,197,187,91,97,...,305,165,105,138,378,293,199,361,148,SR520-Cal
4863,13.852,702.49976,17.89,1,103,80,133,0,0,0,...,0,0,0,0,0,0,0,0,0,
4864,13.280,702.50018,29.19,1,133,149,94,74,89,77,...,182,299,130,73,147,96,70,68,71,SR520-Cal
4865,12.302,702.50134,63.83,1,0,0,0,95,143,73,...,336,386,462,0,0,0,0,0,0,SR520-Cal


In [17]:
def source_label(d_input, sourcelist, area_thres=5000, method='any'):
    np.seterr(divide='ignore', invalid='ignore')
    d_result = d_input.copy()
    d_result['source']="NA"
    source_col=[]
    for s in sourcelist:
        source = [col for col in d_input.columns if s in col]
        source_col.append(source)
    for index, row in d_result.iterrows(): #Potentially change to itertuples? 2. think a more precise label strategy
        sourcelabel=[]
        for i, column in enumerate(source_col):
            if method == 'any': #option of all
                if (row[column] > area_thres).any() == True:
                    sourcelabel.append(sourcelist[i])
            elif method == 'max': #option of min
                if row[column].max() > area_thres == True:
                    sourcelabel.append(sourcelist[i])
        if len(sourcelabel) != 0:
            labelstr = ','.join(sourcelabel)
            d_result.at[index,'source'] = labelstr
    
    return d_input

In [18]:
sourcelist=['Coulter','Crescent','Miller','Swan','SR520-Cal']
d_test = source_label(d_f, sourcelist,method='any')

first --> assign label to different features

second --> ID using the features

third --> assessment, for instance, final ID confidence = 50% feature quantity score + 50% feature intensity score

sample A have 50% of source A feature, avg intensity ratio(5~95%) is 90%, then score = $0.5*0.5(feature #)+0.5*0.9$ (major feature intensity)

for approportioning calculation --> matrix effect needs to be overcome --> more samples and data needed and will be a long term dev & validation process

sample output for score assignment using label information:

|sample name|coverage score| intensity score | final score|
|---|---|---|---|
|sample1|0.5|0.3|0.4|
|sample2|a|b|c|

|sample name|coverage score1(5-25%)|coverage score2(25-50%)|coverage score3(50-75%)|
|---|---|---|---|
|sample1|0.5|0.3|0.4|
|sample2|a|b|c|

In [9]:
#Use case
d_520 = d_test[d_test['source']=='SR520-Cal']
len(d_520) #total feature unique to SR520
len(d_520[d_520['20181114_SR520-Creek_Mix6B_3']>0]) #total feature found in the mixture that above zero
len(d_520[d_520['20181114_SR520-Creek_Mix6B_3']>0])/len(d_520) #coverage score
#Come up with idea for dilution rate calculation using single data..
#oneway: calculate every feature ratio, merge them at the end as dilution score

0.4194260485651214

In [10]:
#example
dilu = d_520['20181114_SR520-Creek_Mix6B_3']/d_520['20181114_SR520-Cal-in-DI_1000mL_3']
dilu = dilu[dilu<1]
dilu.mean() #inaccurate

0.0217686937253888