In [1]:
import pandas as pd
import numpy as np
from itertools import groupby
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import cluster,mixture
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import scipy

  import pandas.util.testing as tm


Idea:
blank removal --> noise removal --> find unique/shared clusters --> use information for id --> use information for tracking
```
sudo code:
picking up sources
output labeled table
use the information for the source id

two way:
1. venn diagram --> source id
2. more data: single source approach to identify clusters for different sources and use the modeling approach for source tracking
```

important : tweak the parameters during venn diagram approach

Two ways: use source data + venn diagram, give unique cluster higher score and shared cluster lower score, given a new sample, can predict the source id, or assign with possibility scores, for instance >70% of cluster features present then the shource exists. Hard to do source tracking since matrix effect and dilution effect is not considered

better way with more data: every source with dilution series, and use the single source approach to find clusters, and modeling for the source approportioning



In [2]:
d_ms = pd.read_csv('../example_data/clustering/sample1114.csv')
d_ms = d_ms.rename(columns={'Average Rt(min)': 'Average RT (min)', 'Average Mz': 'Average m/z', 'S/N average': 'Average sn'})
d_ms.insert(3, "Average score", 1)

In [3]:
def data_prep(d_input, blank_keyword, svb_thres=10, empty_thres=0, cv_thres=5,rt_range=[0, 30], mz_range=[0, 1200], sn_thres=3, score_thres=0, area_thres=5000):
    '''
    The function is used to clean the dataframe according to user setting
    blank_keyword: part of string from column that indicates the column is a blank sample
    svb_thres: sample vs blank thres
    empty_thres: empty cell thres in a row
    cv_thres: as all sample is in triplicate, calculate the CV for every triplicate sample set #Needs to be updated in case there is no triplicate samples
    rt_range: rt filter
    mz_range: mz filter
    sn_thres: signal/noise column thres
    score_thres: score column thres
    area_thres: count for max peak area from each row
    '''
    drop_index = np.argwhere(np.asarray(d_input[d_input.columns[4:]].max(axis=1)) < area_thres).reshape(1,-1) #Get the index for area thres filter
    d_thres = d_input.drop(drop_index[0])
    
    d_thres = d_thres[(d_thres['Average RT (min)'] > rt_range[0]) & (d_thres['Average RT (min)'] < rt_range[1])]
    d_thres = d_thres[(d_thres['Average m/z'] > mz_range[0]) & (d_thres['Average m/z'] < mz_range[1])]
    d_thres = d_thres[d_thres['Average sn'] >= sn_thres]
    d_thres = d_thres[d_thres['Average score'] >= score_thres]
    d_thres.reset_index(inplace=True)
    d_thres.drop(columns=['index'],inplace=True)
    
    col_blank = []
    for key in blank_keyword:
        col_app = [col for col in d_thres.columns if key in col] # Get column name if it contains blank indicating strings
        col_blank += col_app
    col_sample = [col for col in d_thres.columns if col not in col_blank]
    
    d_sample = d_thres[d_thres[col_sample[4:]].max(axis=1) / d_thres[col_blank].mean(axis=1) > svb_thres][col_sample] # Sample maximum area vs Blank average area to count for svb
    d_sample.reset_index(inplace=True)
    d_sample.drop(columns=['index'],inplace=True)
    
    # Get a list of triplicate, every triplicate is in a sublist
    #Sample: [[a1,a2,a3],[b1,b2,b3]]
    trip_list = [list(i) for j, i in groupby(d_sample.columns[4:], lambda a: a.split('_')[1])] #Note: the triplicate parsing is now only used '_' which needs update in the future

    for triplicate in tqdm(trip_list):
        for index, row in d_sample[triplicate].iterrows(): # Loop for every sets of triplicates
            if (row == 0).sum() > empty_thres:
                d_sample.loc[index, triplicate] = 0 # if more than thres, then set all three values to 0
            elif row.std() / row.mean() > cv_thres:
                d_sample.loc[index, triplicate] = 0 #If delete or reduce all number to avg?
    #d_sample = d_sample[(d_sample.iloc[:,4:]!=0).sum(1) > 3]
    
    
    return d_sample

In [4]:
keys=['CEC','Blank','ISTD','Wash','Shutdown']
d_sample = data_prep(d_ms,keys,rt_range = [1,30], mz_range = [200,800], area_thres=500)

100%|██████████| 10/10 [02:04<00:00, 12.43s/it]


In [28]:
def noise_detect(d_input, normalization='linear',eps=0.8,min_samples=10):
    
    c_data = d_input.iloc[:,4:].values
    c_norm = []
    #Performs normalization
    for row in c_data:
        if normalization == 'linear':
            c_norm.append(row/max(row))
        elif normalization == 'zscore':
            c_norm.append((row-np.mean(row))/np.std(row))
        elif normalization == 'log':
            row[row==0]=1
            c_norm.append(np.log10(row)/np.log10(max(row)))
    #Clean up dataframe
    c_norm = np.asarray(c_norm)
    d_norm = pd.DataFrame(c_norm)
    d_norm['index']=d_sample.index
    d_norm.set_index('index',inplace=True)
    d_norm.dropna(how='all',inplace=True)

    X = d_norm.copy()
    dbscan = cluster.DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    labels = dbscan.labels_
    samp_index = np.argwhere(labels!=-1).reshape(1,-1)[0]
    d_samp = d_input.iloc[samp_index]
    
    return d_samp

In [30]:
d_f = noise_detect(d_sample)

  


In [67]:
def source_label(d_input, sourcelist, area_thres=5000, method='any'):
    source_col=[]
    for s in sourcelist:
        source = [col for col in d_input.columns if s in col]
        source_col.append(source)
    W
    for index, row in d_input.iterrows():
        sourcelabel=[]
        for i, column in enumerate(source_col):
            if method == 'any': #option of all
                if (row[column] > area_thres).any() == True:
                    sourcelabel.append(sourcelist[i])
            elif method == 'max': #option of min
                if row[column].max() > area_thres == True:
                    sourcelabel.append(sourcelist[i])
        if len(sourcelabel) != 0:
            d_input.at[index,'source'] = sourcelabel
    
    return d_input

In [68]:
sourcelist=['Coulter','Crescent','SR520-Cal']
d_test = source_label(d_f, sourcelist,method='any')