In [7]:
import os
import glob
from tqdm import tqdm
import pandas as pd
import ast

In [117]:
# --- names of the files and folders used by this notebook ---
algo_3_output_folder = 'algorithm_3_output'
retrieved_filename = 'retrieved_result.csv'
extraction_keys_filename = 'environstd_inchi.csv'
# NOTE(review): 'ANSI' is only available as a codec alias on Windows builds of
# Python; confirm the real code page of the file before running elsewhere.
extraction_keys_file_encoding = 'ANSI'

foldername = 'Mona_experimental_ms'  # top-level data folder, resolved against the working directory
cleaned_folder_name = 'cleaned_data'  # sub-folder of `foldername` that is read below

# --- resolved paths ---
read_folder = os.path.join(os.getcwd(), foldername)
cleaned_folder = os.path.join(read_folder, cleaned_folder_name)  # folder that we will be reading data from

# --- extraction keys: whitespace-stripped 'Inchikey' and 'Name' columns ---
df = pd.read_csv(
    os.path.join(os.getcwd(), extraction_keys_filename),
    encoding=extraction_keys_file_encoding,
)
inchikey_list, name_list = (
    df[column].str.strip().array for column in ('Inchikey', 'Name')
)

environstd_inchi.csv


In [8]:
# algorithm 3

### this code block might take around 25 minutes to run

def clean_precursormz(df):
    """Normalise the 'PrecursorMZ' column to one scalar value per InChIKey.

    For every distinct, non-missing 'InChIKey' the 'PrecursorMZ' value of the
    first matching row is parsed and written back to *all* rows of that key:

    * a string holding a list/tuple literal (e.g. ``'[100.5, 200.1]'``)
      contributes its first element;
    * anything else (a number, or a string holding a plain number) is
      converted with ``float``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'InChIKey' and 'PrecursorMZ'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.

    Raises
    ------
    ValueError
        If a value is neither a list/tuple literal nor convertible by ``float``.
    """
    # Rows without an InChIKey can never match the equality mask below, so
    # they are skipped instead of failing on an empty selection.
    for inchikey in df['InChIKey'].dropna().unique().tolist():
        rows = df['InChIKey'] == inchikey  # computed once and reused
        raw_value = df.loc[rows, 'PrecursorMZ'].iloc[0]

        try:
            parsed = ast.literal_eval(raw_value)
        except (TypeError, ValueError, SyntaxError):
            # Non-string input (already numeric) or a string that is not a
            # valid Python literal, e.g. '074'; float() decides below.
            parsed = raw_value

        if isinstance(parsed, (list, tuple)):
            value = parsed[0]
        else:
            value = float(parsed)

        df.loc[rows, ['PrecursorMZ']] = value
    return df


def retrieve_ms2_dict(df):
    """Return a dict mapping each 'MS2' value of ``df`` to its 'Score'.

    Entries follow row order; when an 'MS2' value occurs more than once the
    score of the last such row is kept.
    """
    return df.set_index('MS2')['Score'].to_dict()

def append_to_list(df):
    """Collect the distinct entries of the 'Name' column, in encounter order.

    Cells may hold either a single value or a list of values; list cells are
    flattened. Membership is tested by equality against the names collected so
    far (never by substring), and the lists stored in ``df`` are not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Name' column.

    Returns
    -------
    pandas.Series
        * empty when ``df`` has no rows;
        * ``{'Name': value}`` when exactly one distinct value was seen and no
          cell was a list;
        * ``{'Name': [values...]}`` otherwise.
    """
    collected = []
    saw_list = False

    for item in df['Name']:
        if isinstance(item, list):
            saw_list = True
            candidates = item
        else:
            candidates = [item]
        for candidate in candidates:
            if candidate not in collected:
                collected.append(candidate)

    output_dict = {}
    if saw_list or len(collected) > 1:
        output_dict['Name'] = collected
    elif collected:
        # a single plain value is returned as-is, not wrapped in a list
        output_dict['Name'] = collected[0]
    return pd.Series(output_dict)

# Half-width of the matching window. It is applied both to PrecursorMZ
# differences (selection of interfering rows) and to MS2 differences
# (overlap test between a target value and an interfering value).
search_range = 0.7
output_folder = os.path.join(os.getcwd(), algo_3_output_folder, 'non_combined_output')

os.makedirs(output_folder, exist_ok=True)

## get list of csv file names (base names only); globbing on the full path
## avoids temporarily changing the process working directory
listdir = [
    os.path.basename(path)
    for path in glob.glob(os.path.join(glob.escape(cleaned_folder), '*.csv'))
]

for filename in tqdm(listdir):
    readf = os.path.join(cleaned_folder, filename)
    # The leading integer of the file name selects the neighbouring
    # '<n-1>_cleaned.csv' file, whose rows are added to the interfering rows.
    precursormz = int(filename.split('_')[0])
    interfering_file = os.path.join(cleaned_folder, str(precursormz - 1) + '_cleaned.csv')
    df = pd.read_csv(readf)
    df.sort_values(by=['InChIKey', 'MS2'], inplace=True)
    df = clean_precursormz(df)

    try:
        df2 = pd.read_csv(interfering_file)
    except FileNotFoundError:
        # no neighbouring file: only rows of the current file can interfere
        df2 = pd.DataFrame()

    output_file = os.path.join(output_folder, str(precursormz) + '_algo4.csv')
    inchikeys = df['InChIKey'].unique().tolist()
    TP, FN = 1, 0  # fixed for every row written by this cell
    # Per-compound results are collected here and written once per input file,
    # so re-running the cell replaces the file instead of appending to it.
    output_frames = []

    for inchikey in inchikeys:
        target_item_df = df[df['InChIKey'] == inchikey].copy()
        curr_precursormz = target_item_df.iloc[0]['PrecursorMZ']

        # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0
        interfering_items_df = pd.concat(
            [df[df['InChIKey'] != inchikey], df2],
            ignore_index=True,
            sort=False,
        )
        # NOTE(review): this re-cleans the combined frame for every compound and
        # dominates the run time; hoisting it is only safe if no InChIKey occurs
        # in both files with different PrecursorMZ values - confirm before changing.
        interfering_items_df = clean_precursormz(interfering_items_df)
        interfering_items_df = interfering_items_df[
            interfering_items_df['PrecursorMZ'] >= (curr_precursormz - search_range)
        ]

        if interfering_items_df.shape[0] < 1:
            # Nothing can interfere with this compound: keep its rows without
            # metric columns and carry on, so results of other compounds in
            # the same file are neither overwritten nor skipped.
            output_frames.append(target_item_df)
            continue

        interfering_items_df = interfering_items_df.sort_values(by=['InChIKey', 'MS2'])
        # one {MS2: Score} dict per interfering InChIKey
        interfering_items_s = interfering_items_df.groupby(['InChIKey']).apply(retrieve_ms2_dict)

        target_ms2_array = target_item_df['MS2'].array
        output_dict = {}

        for ms2 in target_ms2_array:
            # FP is incremented once per overlapping interfering value,
            # TN at most once per interfering compound.
            FP, TN = 0, 0
            # Series.items replaces Series.iteritems, removed in pandas >= 2.0
            for _, ms2_dict in interfering_items_s.items():
                TN_flag = True

                # every value of this compound lies more than the window below ms2
                if (ms2 - max(ms2_dict.keys())) > search_range:
                    TN += 1
                    continue

                # keys are in ascending MS2 order because the rows were sorted
                # by MS2 before grouping
                for key in ms2_dict.keys():
                    diff = round(ms2 - key, 2)
                    if abs(diff) <= search_range:
                        TN_flag = False
                        FP += 1
                    elif diff < 0:
                        # key is above the window; all later keys are larger still
                        if TN_flag:
                            TN += 1
                        break

            negatives = FP + TN
            output_dict[ms2] = {
                'TP': TP,
                'FN': FN,
                'TN': TN,
                'FP': FP,
                'Accuracy': (TP + TN) / (TP + TN + FP + FN),
                'Sensitivity': TP / (TP + FN),
                # undefined (NaN) when nothing was counted as FP or TN
                'Specificity': TN / negatives if negatives else float('nan'),
            }

        output_df = pd.DataFrame.from_dict(output_dict, orient='index')
        output_df = target_item_df.join(output_df, on='MS2')

        output_df.sort_values(by=['Specificity'], inplace=True)
        output_frames.append(output_df)

    if output_frames:
        # concat aligns columns, so rows without metric columns get empty cells
        # under a single header
        pd.concat(output_frames, ignore_index=True, sort=False).to_csv(output_file, index=False)

100%|██████████████████████████████████████████████████████████████████████████████| 1046/1046 [25:35<00:00,  1.47s/it]


In [9]:
# combine all output into same csv file

read_folder = os.path.join(os.getcwd(), algo_3_output_folder, 'non_combined_output')
output_folder = os.path.join(os.getcwd(), algo_3_output_folder)

# Base names of the per-file outputs. Globbing on the full path avoids
# changing the process working directory, and sorting makes the row order of
# the combined file independent of the filesystem's listing order.
listdir = sorted(
    os.path.basename(path)
    for path in glob.glob(os.path.join(glob.escape(read_folder), '*.csv'))
)

# written to the parent folder, so it is never picked up by the glob above
output_filename = os.path.join(output_folder, 'combined_output_algo_3.csv')

combined_csv = pd.concat(
    [pd.read_csv(os.path.join(read_folder, f)) for f in tqdm(listdir)],
    ignore_index=True,
    sort=False,
)
combined_csv.to_csv(output_filename, index=False)

100%|█████████████████████████████████████████████████████████████████████████████| 1046/1046 [00:07<00:00, 133.06it/s]
