In [1]:
# Stretch the notebook container to the full browser width so wide tables are
# not squeezed. `display` and `HTML` come from the public `IPython.display`
# module: importing `display` from `IPython.core.display` has been deprecated
# since IPython 7.14 and fails on recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import glob
from tqdm import tqdm #show progress of the loop
import pandas as pd
import ast

In [8]:
# Configuration for the algorithm-3 run: folder/file names, encodings and the
# column subset used by the cells below. All paths are resolved against the
# current working directory.
algo_3_output_folder = 'GCMS_output'  # top-level folder that receives the algorithm-3 output
retrieved_filename = 'retrieved_result.csv'  # the retrieved (pre-clean) data
extraction_keys_filename = 'environstd_inchi.csv' # CSV of extraction keys; its 'Inchikey' and 'Name' columns are read below
extraction_keys_file_encoding = 'ANSI' # encoding used to read the extraction keys CSV
# NOTE(review): Python only registers the 'ansi' codec alias on Windows - confirm before running on another OS.

foldername = 'Mona_experimental_ms'  # folder holding the per-precursor raw CSV files and the cleaned-data subfolder
cleaned_folder_name = 'cleaned_data'    # subfolder of `foldername` where the cleaned data is stored

read_folder = os.path.join(os.getcwd(), foldername)  # raw '<precursor>.csv' files are read from here
cleaned_folder = os.path.join(read_folder, cleaned_folder_name) # folder the cleaned CSV files are read from

# Load the extraction keys. The CSV must sit in the working directory, otherwise
# pandas raises FileNotFoundError (as in the captured output of this cell).
df = pd.read_csv(os.path.join(os.getcwd(), extraction_keys_filename), encoding=extraction_keys_file_encoding)
inchikey_list = df['Inchikey'].str.strip().array  # whitespace-stripped InChIKeys
name_list = df['Name'].str.strip().array  # whitespace-stripped compound names
# Columns kept from the raw CSV files before they are processed.
columns_to_be_included = ['Name', 'DB#', 'InChIKey', 'ExactMass', 'Precursor_type', 'PrecursorMZ', 'Instrument_type', 'Instrument', 'Collision_energy', 'MW', 'MS2', 'Score']

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\jupyter_workspace\\datacleaning\\GC-MS\\environstd_inchi.csv'

In [5]:
# algorithm 3

### this code block might take around 25 minutes to run

def _parse_precursormz(raw_value):
    """Turn one raw PrecursorMZ cell into a single value.

    Strings are parsed with `ast.literal_eval`, so a cell such as
    "[74.0, 75.0]" yields its first element. Anything that is not a
    list/tuple literal is converted with `float()`.
    """
    parsed = raw_value
    if isinstance(raw_value, str):
        try:
            parsed = ast.literal_eval(raw_value)
        except (ValueError, SyntaxError):
            # Not a Python literal (e.g. "074"); let float() have the final say.
            parsed = raw_value
    if isinstance(parsed, (list, tuple)):
        # An empty list carries no precursor value at all.
        return parsed[0] if parsed else float('nan')
    return float(parsed)


def clean_precursormz(df):
    """Normalise the 'PrecursorMZ' column to one value per InChIKey.

    For every distinct InChIKey the PrecursorMZ of its first row is parsed
    and written to all rows sharing that key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'InChIKey' and 'PrecursorMZ' columns. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, for call chaining.

    Raises
    ------
    ValueError
        If a PrecursorMZ cell can be interpreted neither as a literal nor as
        a float.
    """
    for inchikey in df['InChIKey'].unique().tolist():
        if pd.isna(inchikey):
            # A missing key never compares equal to anything, so there is no
            # row to read from or write to; skip it instead of failing.
            continue
        same_key = df['InChIKey'] == inchikey
        first_value = df.loc[same_key, 'PrecursorMZ'].iloc[0]
        df.loc[same_key, 'PrecursorMZ'] = _parse_precursormz(first_value)
    return df

def filter_small_score(df, threshold):
    """Return the rows of `df` whose 'Score' is at least `threshold`."""
    keep_row = df['Score'] >= threshold
    return df[keep_row]

def retrieve_ms2_dict(df):
    """Map each 'MS2' value of `df` to its 'Score' and return the dict.

    When an MS2 value occurs more than once, the last score wins.
    """
    score_by_ms2 = df.set_index('MS2')['Score']
    return score_by_ms2.to_dict()

def append_to_list(df):
    """Collect the distinct values of the 'Name' column, in first-seen order.

    Cells may hold a single name or a list of names; list cells are
    flattened. Membership is tested per name, so a name that is merely a
    substring of an earlier one (e.g. 'Benz' after 'Benzene') is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Name' column. The frame and its cells are not
        modified.

    Returns
    -------
    pandas.Series
        Empty when `df` has no rows. Otherwise a one-entry Series keyed
        'Name' that holds the single name as a scalar when only one distinct
        scalar name was seen, or a list of names in every other case.
    """
    output_dict = {}
    unique_names = []
    saw_list_cell = False
    saw_any_cell = False

    for item in df['Name']:
        saw_any_cell = True
        if isinstance(item, list):
            saw_list_cell = True
            candidates = item
        else:
            candidates = [item]
        for name in candidates:
            if name not in unique_names:
                unique_names.append(name)

    if saw_any_cell:
        if saw_list_cell or len(unique_names) > 1:
            output_dict['Name'] = unique_names
        else:
            output_dict['Name'] = unique_names[0]
    return pd.Series(output_dict)

# Two values count as a match when they differ by at most `search_range`
# (the difference is rounded to two decimals before the comparison).
search_range = 0.7
output_folder = os.path.join(os.getcwd(), algo_3_output_folder, 'non_combined_output')

# exist_ok makes this safe to re-run and removes the check-then-create race
os.makedirs(output_folder, exist_ok=True)

## get list of csv file names
# The folder is globbed by path instead of chdir-ing into it, so the process
# working directory is never changed. `listdir` still holds bare file names
# because later code derives the precursor m/z from the name.
cwd = os.getcwd()
if not os.path.isdir(cleaned_folder):
    raise FileNotFoundError('cleaned data folder not found: {}'.format(cleaned_folder))
listdir = [
    os.path.basename(path)
    for path in glob.glob(os.path.join(glob.escape(cleaned_folder), '*.csv'))
]

# rows with a Score below this value are dropped before any processing
threshold = 10

def _count_spectrum_matches(ms2, spectra, tolerance):
    """Count matches and misses of one target MS2 value against spectra.

    Parameters
    ----------
    ms2 : float
        The target MS2 value.
    spectra : iterable of dict
        One dict per reference spectrum, keyed by MS2 value in ascending
        order (as produced by `retrieve_ms2_dict` on a frame sorted by MS2).
    tolerance : float
        Maximum absolute difference for two MS2 values to count as a match.

    Returns
    -------
    (int, int)
        `hits` is incremented once for every reference peak within the
        tolerance; `misses` once for every spectrum without such a peak.
    """
    hits, misses = 0, 0
    for peaks in spectra:
        # Every peak lies too far below the target: nothing can match.
        if (ms2 - max(peaks)) > tolerance:
            misses += 1
            continue

        matched = False
        for peak in peaks:
            diff = round(ms2 - peak, 2)
            if abs(diff) <= tolerance:
                matched = True
                hits += 1
            elif diff < 0:
                # Peaks are ascending, so all remaining ones are too high.
                if not matched:
                    misses += 1
                break
    return hits, misses


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


for filename in tqdm(listdir):
    # File names look like '<precursor>_cleaned.csv'; the integer prefix picks
    # the raw file and the neighbouring (precursor - 1) raw file.
    cleaned_file = os.path.join(cleaned_folder, filename)
    precursormz = int(filename.split('_')[0])
    raw_file = os.path.join(read_folder, str(precursormz) + '.csv')
    raw_interfering_file = os.path.join(read_folder, str(precursormz - 1) + '.csv')

    # sort_values (not inplace) returns a fresh frame, so clean_precursormz
    # never writes into a slice of the frame that was just read.
    cleaned_df = pd.read_csv(cleaned_file)
    cleaned_df = filter_small_score(cleaned_df, threshold).sort_values(by=['InChIKey', 'MS2'])
    cleaned_df = clean_precursormz(cleaned_df)

    raw_df = pd.read_csv(raw_file)[columns_to_be_included]
    raw_df = filter_small_score(raw_df, threshold).sort_values(by=['InChIKey', 'MS2'])
    raw_df = clean_precursormz(raw_df)

    # The neighbouring raw file is optional.
    try:
        raw_interfering_df = pd.read_csv(raw_interfering_file)
    except FileNotFoundError:
        raw_interfering_df = pd.DataFrame()
    else:
        raw_interfering_df = filter_small_score(raw_interfering_df, threshold)
        raw_interfering_df = raw_interfering_df[columns_to_be_included]

    output_file = os.path.join(output_folder, str(precursormz) + '_algo3.csv')
    # Results are gathered per compound and written once per file. Appending
    # to the CSV compound by compound duplicated content whenever the cell was
    # re-run on an existing output folder.
    per_compound_frames = []

    for inchikey in cleaned_df['InChIKey'].dropna().unique().tolist():
        target_item_df = cleaned_df[cleaned_df['InChIKey'] == inchikey].copy()
        curr_precursormz = target_item_df.iloc[0]['PrecursorMZ']

        # Reference spectra of the target compound, one dict per DB# entry.
        raw_target_item_df = raw_df[raw_df['InChIKey'] == inchikey].sort_values(by=['DB#', 'MS2'])
        target_spectra = [retrieve_ms2_dict(group) for _, group in raw_target_item_df.groupby('DB#')]

        # Every other compound of this file plus the neighbouring file.
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        raw_interfering_items_df = pd.concat(
            [raw_df[raw_df['InChIKey'] != inchikey], raw_interfering_df],
            ignore_index=True,
        )
        raw_interfering_items_df = clean_precursormz(raw_interfering_items_df)
        raw_interfering_items_df = raw_interfering_items_df[
            raw_interfering_items_df['PrecursorMZ'] >= (curr_precursormz - search_range)
        ]

        if raw_interfering_items_df.shape[0] < 1:
            # Nothing to compare against: keep the target rows without
            # metrics. This used to overwrite the whole output file and stop
            # the loop, losing earlier compounds and skipping later ones.
            per_compound_frames.append(target_item_df)
            continue

        raw_interfering_items_df = raw_interfering_items_df.sort_values(by=['DB#', 'MS2'])
        interfering_spectra = [
            retrieve_ms2_dict(group) for _, group in raw_interfering_items_df.groupby('DB#')
        ]

        output_dict = {}
        for ms2 in target_item_df['MS2'].array:
            TP, FN = _count_spectrum_matches(ms2, target_spectra, search_range)
            FP, TN = _count_spectrum_matches(ms2, interfering_spectra, search_range)
            output_dict[ms2] = {
                'TP': TP,
                'FN': FN,
                'FP': FP,
                'TN': TN,
                # NaN instead of ZeroDivisionError when a denominator is 0
                'Accuracy': _safe_ratio(TP + TN, TP + TN + FP + FN),
                'Sensitivity': _safe_ratio(TP, TP + FN),
                'Specificity': _safe_ratio(TN, FP + TN),
            }

        output_df = pd.DataFrame.from_dict(output_dict, orient='index')
        output_df = target_item_df.join(output_df, on='MS2')
        per_compound_frames.append(output_df.sort_values(by=['Specificity']))

    if per_compound_frames:
        file_output_df = pd.concat(per_compound_frames, ignore_index=True, sort=False)
        file_output_df.to_csv(output_file, index=False)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
100%|██████████| 1046/1046 [38:35<00:00,  2.21s/it] 


In [6]:
# combine all output into same csv file

# Dedicated names are used for the folders and the file list. Re-binding
# `read_folder`, `output_folder` or `listdir` here would make the algorithm-3
# cell read from the wrong folder if it were re-run after this cell.
algo3_parts_folder = os.path.join(os.getcwd(), algo_3_output_folder, 'non_combined_output')
algo3_combined_folder = os.path.join(os.getcwd(), algo_3_output_folder)

# Glob by path so the process working directory is never changed.
algo3_part_files = glob.glob(os.path.join(glob.escape(algo3_parts_folder), '*.csv'))
if not algo3_part_files:
    # pd.concat on an empty list only reports "No objects to concatenate".
    raise FileNotFoundError('no algorithm-3 output files found in: {}'.format(algo3_parts_folder))

output_filename = os.path.join(algo3_combined_folder, 'combined_output_algo_3.csv')

# The combined file lives one level above the per-precursor files, so a
# re-run does not pick it up as an input.
combined_csv = pd.concat(
    [pd.read_csv(path) for path in tqdm(algo3_part_files)],
    ignore_index=True,
    sort=False,
)
combined_csv.to_csv(output_filename, index=False)

100%|██████████| 1046/1046 [00:09<00:00, 115.79it/s]
