
### Please ensure that this script is in the same directory as the raw text file (GCMS-NIST20-example.txt, see `txt_filename` below)
Run the following code blocks in sequence; some blocks may take up to half an hour to finish (depending on computer speed).

In [2]:
import os
import glob
from tqdm import tqdm  #instant showing the progress of loops 
import csv
import pandas as pd
import numpy as np

In [1]:
# Stretch the notebook container to the full browser width so that wide
# DataFrames are readable.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
# Column names of the CSV files written by the parsing step.  Each name is
# also used as the lookup key into the parsed record, so it must match the
# field name exactly as it appears in the raw text file.
# NOTE: every entry needs its own trailing comma -- adjacent string literals
# are silently concatenated by Python into a single column name.
header = [
    'Name',
    'Formula',
    'MW',
    'Exact Mass',
    'NIST#',
    'InChIKey',
    'RI_estimated',
    'RI_predicted',
    'MS',
    'Score',
    #ms intensity
]

In [4]:
# ---- input / output locations ----
# Raw data text file that is parsed by the next cell.
txt_filename = 'GCMS-NIST20-example.txt'
# Folder that receives the cleaned per-record CSV output.
foldername = 'GCMS_output'

# ---- per-record parser state (re-initialised after every record) ----
# InChIKey of the current record; a record without one is excluded.
inchikey = ''
# Integer molecular weight parsed from the record's "MW: " line.
mw = ''
# Path of the CSV file the current record will be appended to.
filename = ''
# Raw text of the current record, written to error.txt when no output
# filename could be determined for it.
string = ''
# Field name -> value mapping of the current record.
item_dict = dict()
# True when the same field name occurs more than once in the current record.
repeating_key = False
# Whether the CSV file being appended to already exists (decides whether the
# header row still has to be written).
file_exist = True
# True when the current record has to be skipped.
ignore = False

# Create the output folder on first use.
if os.path.exists(foldername) is False:
    os.makedirs(foldername)

In [4]:
### This code block might take around 8 minutes to run

# Parse the raw text file record by record.
#
# A record consists of "Key: value" lines followed by peak lines holding two
# whitespace-separated values (stored under 'MS' and 'Score').  A record is
# considered finished at a blank line, at end of file, or when the next
# "Name: " line is encountered.  Each finished record is then handled as
# follows:
#   * no InChIKey               -> skipped
#   * a field name is repeated  -> appended to repeating_key.csv (for debugging)
#   * MW could be parsed        -> appended to <foldername>/<MW>.csv, one CSV
#                                  row per peak
#   * otherwise                 -> raw text appended to error.txt
with open(txt_filename, mode='r', encoding='utf-8-sig') as readf:
    while True:
        row = readf.readline()
        line = row.rstrip('\n')
        at_eof = (row == '')  # readline() returns '' only at end of file

        # A "Name: " line while a record is already being collected means the
        # previous record ended without a separating blank line.
        starts_next = line.startswith('Name: ') and bool(string)

        if at_eof or not line.strip() or starts_next:
            # Only flush when something was collected (consecutive blank lines
            # or a trailing newline must not produce output).
            if string:
                # We ignore the current item if InChIKey is missing
                if not inchikey:
                    ignore = True

                if not ignore:
                    # if there are multiple fields with same name in the current element,
                    # we output them into repeating_key.csv for further debugging purpose
                    if repeating_key:
                        filename = os.path.join(os.getcwd(), 'repeating_key.csv')

                    if filename:
                        file_exist = os.path.isfile(filename)

                        with open(filename, 'a', encoding='utf-8-sig', newline='') as writef:
                            writer = csv.writer(writef)
                            if not file_exist:
                                writer.writerow(header)

                            # One CSV row per peak; a record without peaks
                            # simply contributes no rows.  The peak lists are
                            # captured by zip() before 'MS'/'Score' are
                            # overwritten with the scalar values of each peak.
                            peaks = zip(item_dict.get('MS', []), item_dict.get('Score', []))
                            for ms, score in peaks:
                                item_dict['MS'] = ms
                                item_dict['Score'] = score
                                writer.writerow(item_dict.get(col, '') for col in header)
                    else:
                        # No usable "MW: " line was found, so the output
                        # filename is unknown: keep the raw record for inspection.
                        with open('error.txt', 'a', encoding='utf-8-sig') as writef:
                            writef.write(string)
                            writef.write('\n')

            # reinitialize everything after handling an item
            ignore = False
            file_exist = True
            repeating_key = False
            item_dict = {}
            string = ''
            filename = ''
            inchikey, mw = '', ''

            if at_eof:
                break  # break out of the loop when we reach EOF
            if not starts_next:
                continue  # blank separator line: nothing else to parse
            # otherwise fall through: the "Name: " line opens the next record

        # exclude these fields (the peak-list heading may come without a
        # trailing space, so it is matched by prefix only)
        if line.lstrip().startswith(('Synonyms:', 'Values and Intensities')):
            continue

        # store the current element in string and dictionary data structure
        string += row
        key, sep, val = line.partition(': ')
        if sep:
            key = key.strip()

            # Store InChIKey if it exists, used for InChIKey existence check later on
            if key == 'InChIKey':
                inchikey = val
            # NIST data: the integer MW determines the output filename
            elif key == 'MW' and not filename:
                try:
                    mw = int(float(val))
                except ValueError:
                    # Leave filename empty so the record ends up in error.txt
                    # instead of aborting the whole run.
                    print('ValueError: ', row)
                else:
                    filename = os.path.join(os.getcwd(), foldername, str(mw) + '.csv')

            if key in item_dict:
                # Repeated field: collect all values in one flat list.
                previous = item_dict[key]
                if isinstance(previous, list):
                    previous.append(val)
                else:
                    item_dict[key] = [previous, val]
                repeating_key = True
            else:
                item_dict[key] = val
        else:
            # Peak line: expected to hold exactly two values.
            parts = line.split()
            if len(parts) != 2:
                print('ValueError: ', row)
                continue
            ms, score = parts
            item_dict.setdefault('MS', []).append(ms)
            item_dict.setdefault('Score', []).append(score)
        

In [5]:

# def df_check_precursorMZ_MS(df, output_df, problematic_df):
#     precursorMZ = df.iloc[0]['PrecursorMZ']
#     if (abs(precursorMZ - df.iloc[0]['MS']) >= 1):
#         output_df = output_df.append(df.iloc[0], ignore_index=True)
#     else:
#         problematic_df = problematic_df.append(df.iloc[0], ignore_index=True) 
#     return output_df, problematic_df


# def s_check_precursorMZ_MS2(series, output_df):
#     precursorMZ = series['PrecursorMZ']
#     if (type(precursorMZ) == list):
#         precursorMZ = precursorMZ[0]
#     if (abs(precursorMZ - series['MS2']) >= 1):
#         output_df = output_df.append(series, ignore_index=True)
#     return output_df
    
def _merge_value(existing, new):
    """Return `existing` extended by `new`, switching to a list once values differ."""
    if isinstance(existing, list):
        if new not in existing:
            existing.append(new)
        return existing
    if existing != new:
        return [existing, new]
    return existing


def clean_data(df, filename, min_score=200, mz_tolerance=1):
    """Merge neighbouring peaks of each compound into single rows.

    Rows with ``Score`` below ``min_score`` are dropped.  The remaining rows
    are processed per ``InChIKey`` in ascending ``MS`` order: a row joins the
    current group when its ``MS`` lies within ``mz_tolerance`` of the group's
    representative ``MS`` (the ``MS`` of the highest-scoring row seen so far
    in that group).  Each group yields one output row whose ``MS`` is the
    representative ``MS`` and whose ``Score`` is the sum of all scores in the
    group.  For the columns ``Name``, ``ExactMass`` and ``MW`` (when present)
    differing values inside a group are collected into a list.

    Args:
        df: DataFrame with at least the columns ``InChIKey``, ``MS`` and
            ``Score`` (``MS`` and ``Score`` numeric).
        filename: Currently unused; kept so existing callers keep working.
        min_score: Minimum ``Score`` a row needs to be kept.
        mz_tolerance: Maximum absolute ``MS`` difference for two peaks to be
            grouped together.

    Returns:
        A new DataFrame sorted by ``InChIKey`` and ``Score`` in descending
        order.  It contains the standard output columns followed by any
        additional columns of ``df``.
    """
    output_columns = ['Name', 'InChIKey', 'ExactMass', 'NIST#', 'Formula',
                      'MW', 'MS', 'Score', 'RI_estimated', 'RI_predicted']
    # Additional input columns are kept after the standard ones.
    extra_columns = [col for col in df.columns if col not in output_columns]
    # Only merge descriptive columns that actually exist in the input.
    merge_columns = [col for col in ('Name', 'ExactMass', 'MW') if col in df.columns]

    # NIST GC-MS data: keep only rows with a sufficiently high score.
    df = df[df['Score'] >= min_score]
    df = df.sort_values(by=['InChIKey', 'MS'], ascending=True)

    merged_rows = []
    for _, working_df in df.groupby('InChIKey', sort=False):
        records = working_df.to_dict('records')

        # `current` is the representative row of the group being built.
        current = dict(records[0])
        best_ms = current['MS']          # MS of the highest-scoring row so far
        best_score = current['Score']    # score of that row
        total_score = current['Score']   # accumulated score of the group

        for record in records[1:]:
            if abs(best_ms - record['MS']) <= mz_tolerance:
                # Same group: accumulate the score, keep the MS with the
                # highest individual score, remember differing field values.
                total_score += record['Score']
                if record['Score'] > best_score:
                    best_score = record['Score']
                    best_ms = record['MS']
                for column in merge_columns:
                    current[column] = _merge_value(current[column], record[column])
            else:
                # Group finished: emit it and start a new one at this row.
                current['MS'] = best_ms
                current['Score'] = total_score
                merged_rows.append(current)

                current = dict(record)
                best_ms = record['MS']
                best_score = record['Score']
                total_score = record['Score']

        # Emit the last (possibly only) group of this InChIKey.
        current['MS'] = best_ms
        current['Score'] = total_score
        merged_rows.append(current)

    output_df = pd.DataFrame(merged_rows, columns=output_columns + extra_columns)
    output_df = output_df.sort_values(by=['InChIKey', 'Score'], ascending=False)
    return output_df