In [1]:
import os
import glob
from tqdm import tqdm
import pandas as pd
import ast

In [4]:
# Make the notebook's `.container` element span the full browser width so wide tables fit.
# `display` and `HTML` are imported from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [5]:
# --- file names -----------------------------------------------------------------
algo_3_output_folder = 'algorithm_output'
retrieved_filename = 'retrieved_result.csv'
extraction_keys_filename = 'environstd_inchi.csv'
# NOTE(review): 'ANSI' looks like the Windows-only codec alias (of 'mbcs'); confirm the
# real code page of the file before running this notebook on another platform.
extraction_keys_file_encoding = 'ANSI'

# --- input folders, resolved against the current working directory ---------------
foldername = 'GCMS_output'  # folder holding the raw `<number>.csv` files read further down
cleaned_folder_name = 'cleaned_data'  # sub-folder of `foldername` holding the `*_cleaned`-style CSVs

read_folder = os.path.join(os.getcwd(), foldername)
cleaned_folder = os.path.join(os.getcwd(), foldername, cleaned_folder_name)  # folder that we will be reading data from

# Table of extraction keys (compound name / InChIKey pairs); preview the first rows.
df = pd.read_csv(
    os.path.join(os.getcwd(), extraction_keys_filename),
    encoding=extraction_keys_file_encoding,
)
df.head()

Unnamed: 0,Name:,InChIKey:
0,"10,11-(4',5'-Phthaloyl-4',5'-diazacyclohexano)...",PJHMULXOUYBGON-UHFFFAOYSA-N
1,"Acetic acid, 1-acetoxy-10a,12a-dimethyl-5-oxo-...",ICFHFOMKTZRYEJ-UHFFFAOYSA-N
2,"(10aR)-6-Oxodecahydro-7,10-methanopyrido[1,2-a...",XGDWTAWUJCJBON-XOJNLIDHSA-N
3,"1,10a-Dihydroxy-4,4,7,11b-tetramethyl-1,2,3,4a...",KVYGWNFQSFDUEL-UHFFFAOYSA-N
4,,


In [6]:
# Name and InChIKey columns of the extraction-key table, with surrounding whitespace stripped.
name_list = df['Name:'].str.strip().array
inchikey_list = df['InChIKey:'].str.strip().array

# Columns selected (in this order) from the raw CSV files before they are processed.
columns_to_be_included = [
    'Name:', 'Formula:', 'MW:', 'NIST#:',
    'InChIKey:', 'MS2:', 'Score:',
    'EstRI',  # original author's note: try to extract the 2 RI values in the text
    'PredRI',
]

In [None]:
# Algorithm part, taken from algorithm 3.
# Clean up the MW values so that they are valid Python datatypes.
# Worked on 23 July 2021.
def clean_MW(df):
    """Convert the 'MW:' column to plain Python numbers, one value per compound.

    For every distinct, non-missing 'InChIKey:' the 'MW:' entry of the first
    matching row is parsed, and the parsed value is written to *all* rows that
    share that key. An entry that evaluates to a sequence (e.g. the string
    "[100.5, 101.5]") contributes its first element; anything else is passed
    through float().

    Rows whose 'InChIKey:' is missing are left untouched (previously they made
    the function fail with IndexError, because a missing key never compares
    equal to itself).

    Parameters:
        df: DataFrame with the columns 'InChIKey:' and 'MW:'. Modified in place.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        ValueError / TypeError: propagated from float() when an entry is neither
        a Python literal nor convertible to a float.
    """

    def _parse_mw(raw):
        # Parse one raw 'MW:' entry into a number.
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, TypeError, SyntaxError):
            # Not a literal in string form (already numeric, or text such as
            # '074' that is not valid Python syntax): let float() decide.
            return float(raw)
        try:
            # Sequence-like literal: keep its first element.
            return parsed[0]
        except TypeError:
            # Scalar literal: not subscriptable, convert directly.
            return float(parsed)

    # dropna(): a missing key selects no rows, so there is nothing to clean for it.
    for inchikey in df['InChIKey:'].dropna().unique().tolist():
        same_compound = df['InChIKey:'] == inchikey
        first_mw = df.loc[same_compound, 'MW:'].iloc[0]
        df.loc[same_compound, ['MW:']] = _parse_mw(first_mw)
    return df

def filter_small_score(df, threshold):
    """Return the rows of `df` whose 'Score:' value is greater than or equal to `threshold`."""
    meets_threshold = df['Score:'] >= threshold
    return df[meets_threshold]

def retrieve_ms2_dict(df):
    """Map each 'MS2:' value of `df` to the 'Score:' of the same row.

    When an 'MS2:' value occurs more than once, the score of its last row wins.
    """
    return dict(zip(df['MS2:'], df['Score:']))

def append_to_list(df):
    """Collect the distinct entries of df['Name:'] into a one-entry Series.

    Entries may be scalars or lists of names; list entries are flattened.
    Names are kept in first-seen order and each name appears once.

    Fixes over the previous version:
      * a distinct name that happened to be a substring of the first name was
        silently dropped (``item not in <str>`` is a substring test);
      * a scalar followed by a list stored ``None`` (the return value of
        ``list.append``) or raised TypeError;
      * list entries taken from `df` were extended in place, which modified
        the caller's data.

    Parameters:
        df: DataFrame with a 'Name:' column.

    Returns:
        pandas.Series with a single key 'Name:' holding
          * the name itself when exactly one distinct scalar name was seen,
          * otherwise a list of the distinct names;
        an empty Series when `df` has no rows.
    """
    collected = []     # distinct names, in first-seen order
    saw_list = False   # a list input always yields a list output, as before
    for item in df['Name:']:
        if isinstance(item, list):
            saw_list = True
            candidates = item
        else:
            candidates = [item]
        for name in candidates:
            if name not in collected:
                collected.append(name)

    output_dict = {}
    if len(collected) == 1 and not saw_list:
        output_dict['Name:'] = collected[0]
    elif collected or saw_list:
        output_dict['Name:'] = collected
    return pd.Series(output_dict)


# Half-width of the window (in the units of the 'MS2:' column) inside which two MS2
# values are treated as the same fragment.
search_range = 1
# Minimum 'Score:' a row needs in order to be kept (applied to every input file).
threshold = 30
output_folder = os.path.join(os.getcwd(), algo_3_output_folder, 'non_combined_output')

if not (os.path.exists(output_folder)):
    os.makedirs(output_folder)

## get list of csv file names (bare file names, relative to `cleaned_folder`)
cwd = os.getcwd()
os.chdir(cleaned_folder)
try:
    listdir = glob.glob('*.csv')
finally:
    # Always restore the working directory, even if listing fails.
    os.chdir(cwd)


def _count_ms2_matches(ms2, spectra, tolerance, skip_margin=0.7):
    """Count how often `ms2` is matched / not matched by a collection of spectra.

    Parameters:
        ms2: the fragment value being tested.
        spectra: Series whose values are dicts keyed by MS2 value (one dict per
            'NIST#:' group, as produced by `retrieve_ms2_dict`). The keys of each
            dict are expected in ascending order, which the callers obtain by
            sorting on ['NIST#:', 'MS2:'] before grouping; the early `break`
            below relies on it.
        tolerance: a key matches when |ms2 - key|, rounded to 2 decimals, is
            less than or equal to this value.
        skip_margin: a spectrum whose largest key lies more than this far below
            `ms2` is counted as unmatched without scanning it (0.7 is the value
            used by the original code).

    Returns:
        (matched, unmatched).
        NOTE(review): `matched` grows by one for *every* key inside the window,
        so one spectrum can contribute more than one match; `unmatched` grows by
        at most one per spectrum.
    """
    matched, unmatched = 0, 0
    for _, ms2_dict in spectra.items():
        if (ms2 - max(ms2_dict.keys())) > skip_margin:
            unmatched += 1
            continue

        nothing_matched_yet = True
        for key in ms2_dict.keys():
            diff = round(ms2 - key, 2)
            if abs(diff) <= tolerance:
                nothing_matched_yet = False
                matched += 1
            elif diff < 0:
                # Keys are ascending: once a key is beyond the window, no later key can match.
                if nothing_matched_yet:
                    unmatched += 1
                break
    return matched, unmatched


# from below, precursormz === mw

for filename in tqdm(listdir):
    # The leading number of the cleaned file name identifies the matching raw file
    # `<number>.csv`; `<number - 1>.csv`, when present, supplies extra interfering rows.
    cleaned_file = os.path.join(cleaned_folder, filename)
    precursormz = int(filename.split('_')[0])
    raw_file = os.path.join(read_folder, str(precursormz) + '.csv')
    raw_interfering_file = os.path.join(read_folder, str(precursormz - 1) + '.csv')

    cleaned_df = pd.read_csv(cleaned_file)
    cleaned_df = filter_small_score(cleaned_df, threshold)
    cleaned_df = cleaned_df.sort_values(by=['InChIKey:', 'MS2:'])
    cleaned_df = clean_MW(cleaned_df)  # clean up the precursor ion / MW, make it a Python datatype

    raw_df = pd.read_csv(raw_file)
    raw_df = raw_df[columns_to_be_included]
    raw_df = filter_small_score(raw_df, threshold)
    raw_df = raw_df.sort_values(by=['InChIKey:', 'MS2:'])
    raw_df = clean_MW(raw_df)

    try:
        raw_interfering_df = pd.read_csv(raw_interfering_file)
    except FileNotFoundError:
        # No neighbouring file: only the other compounds of `raw_df` can interfere.
        raw_interfering_df = pd.DataFrame()
    else:
        raw_interfering_df = filter_small_score(raw_interfering_df, threshold)
        raw_interfering_df = raw_interfering_df[columns_to_be_included]

    output_file = os.path.join(output_folder, str(precursormz) + '_algo3.csv')
    # dropna(): a missing InChIKey selects no rows and would fail at `.iloc[0]` below.
    inchikeys = cleaned_df['InChIKey:'].dropna().unique().tolist()
    header_flag = True  # True until the first block of rows has been written to `output_file`

    for inchikey in inchikeys:
        # All cleaned MS2 rows of one compound.
        target_item_df = cleaned_df[cleaned_df['InChIKey:'] == inchikey].copy()
        curr_precursormz = target_item_df.iloc[0]['MW:']

        # The compound's own raw spectra: one {MS2: Score} dict per 'NIST#:'.
        raw_target_item_df = raw_df[raw_df['InChIKey:'] == inchikey].sort_values(by=['NIST#:', 'MS2:'])
        raw_target_item_s = raw_target_item_df.groupby(['NIST#:']).apply(retrieve_ms2_dict)

        # Every other compound of the raw file plus the neighbouring file, restricted to
        # MW >= (target MW - 1). pd.concat replaces DataFrame.append, removed in pandas 2.0.
        raw_interfering_items_df = pd.concat(
            [raw_df[raw_df['InChIKey:'] != inchikey], raw_interfering_df],
            ignore_index=True,
        )
        raw_interfering_items_df = clean_MW(raw_interfering_items_df)
        raw_interfering_items_df = raw_interfering_items_df[raw_interfering_items_df['MW:'] >= (curr_precursormz - 1)]

        if raw_interfering_items_df.shape[0] < 1:
            # NOTE(review): behaviour kept as in the original. This overwrites `output_file`
            # (discarding rows already written for earlier compounds of this file) and stops
            # processing the remaining compounds. Confirm this is intended; it is harmless
            # only when the file holds a single compound.
            target_item_df.to_csv(output_file, index=False)
            break

        raw_interfering_items_df = raw_interfering_items_df.sort_values(by=['NIST#:', 'MS2:'])
        raw_interfering_items_s = raw_interfering_items_df.groupby(['NIST#:']).apply(retrieve_ms2_dict)

        # For each selected MS2 value of the target compound:
        #   TP - peaks of the compound's own spectra lying within `search_range` of it
        #   FN - own spectra that have no peak within `search_range`
        #   FP - peaks of interfering spectra lying within `search_range` of it
        #   TN - interfering spectra that have no peak within `search_range`
        output_dict = {}
        for ms2 in target_item_df['MS2:'].array:
            TP, FN = _count_ms2_matches(ms2, raw_target_item_s, search_range)
            FP, TN = _count_ms2_matches(ms2, raw_interfering_items_s, search_range)
            output_dict[ms2] = {
                'TP': TP,
                'FN': FN,
                'FP': FP,
                'TN': TN,
                'Accuracy': (TP + TN) / (TP + TN + FP + FN),
                'Sensitivity': TP / (TP + FN),
                'Specificity': TN / (FP + TN),
            }

        output_df = pd.DataFrame.from_dict(output_dict, orient='index')
        output_df = target_item_df.join(output_df, on='MS2:')
        output_df = output_df.sort_values(by=['Specificity'])

        # The first write truncates any file left over from an earlier run (so re-running
        # the cell does not duplicate rows) and carries the header; later writes append.
        output_df.to_csv(
            output_file,
            index=False,
            mode='w' if header_flag else 'a',
            header=header_flag,
        )
        header_flag = False  # only include header for the first time

In [None]:
# Combine all per-file outputs into one CSV file.
#
# The folders get their own names here instead of re-using `read_folder` / `output_folder`:
# the per-file analysis cell reads its raw input from `read_folder`, so overwriting that
# name made the analysis cell look in the wrong folder when it was re-run afterwards.
non_combined_folder = os.path.join(os.getcwd(), algo_3_output_folder, 'non_combined_output')
combined_folder = os.path.join(os.getcwd(), algo_3_output_folder)

cwd = os.getcwd()
os.chdir(non_combined_folder)
try:
    # Sorted so that the row order of the combined file does not depend on the
    # order in which the filesystem happens to list the files.
    listdir = sorted(glob.glob('*.csv'))
finally:
    # Always restore the working directory, even if listing fails.
    os.chdir(cwd)

output_filename = os.path.join(combined_folder, 'combined_output_algo_3.csv')

combined_csv = pd.concat(
    [pd.read_csv(os.path.join(non_combined_folder, f)) for f in tqdm(listdir)],
    ignore_index=True,
    sort=False,
)
combined_csv.to_csv(output_filename, index=False)