# Goal: Convert the 21,166 mass spectra and our list of CAS numbers with their respective functional groups into 2 files: one with all of the mass spectra, and another showing all of the CAS numbers that we have data for and which functional groups those molecules have.

This process will be done in the same way that was shown in the Preprocess_and_Plot_Mass_Spectra_EXAMPLE notebook. There will be less of a walkthrough in this notebook but the process will be the same.

## Here are the necessary imports to run the code

In [1]:
#IMPORTS 
import os
import glob
import csv
import pandas as pd
import numpy as np

In [2]:
# Root folder that holds target.csv and the data folders; replace the
# placeholder with the location on your machine before running
path = r"PATH"

### First we can define the different functional groups we will be looking at. This will allow us to make the code look at the functional groups one at a time and will make the process faster to code.

In [3]:
# Functional-group column names, in the order they are processed
top_dir = [
    'nitrile',
    'ketone',
    'ether',
    'ester',
    'carboxylic_acid',
    'aromatic',
    'amine',
    'amide',
    'alkyne',
    'alkane',
    'alkene',
    'alcohol',
    'nitro',
    'alkyl_halide',
    'acyl_halide',
    'methyl',
    'aldehyde',
    'n_containing',
    'o_containing',
    'a_containing',
]

### Then we will use glob to pull in all of the mass spectra from the folder on the computer

This will also allow us to look at which CAS numbers don't have associated mass spectrometry data so that they can be removed from the CAS list.

In [4]:
# Folder on this computer that holds the csv mass-spec files
data_path = path + '\\' + 'Full_Dataset'
# File type to collect from that folder
extension = 'csv'
# Work from inside the data folder so the relative pattern below resolves there
os.chdir(data_path)
# File names (relative to the data folder) that match the chosen extension
result = glob.glob('*.' + extension)
# Full paths of every csv file found in the data folder
all_files = glob.glob(data_path + "/*.csv")

Now we can read in our Target file. This tells us which functional groups are present in each of the different molecules via the CAS number. This file also currently tells us how many of each functional group are present, but we only need the presence or absence for now, so we will go ahead and remove that extra information to avoid confusion.

In [5]:
# Read in the target table: one row per CAS number, one column per functional group
target = pd.read_csv(path + '\\' + "target.csv")
# The file stores how many times each group occurs; change every non-zero
# count to 1 so each column only records presence (1) or absence (0)
for f in top_dir:
    target.loc[target[f] != 0, f] = 1
# Remove CAS numbers that do not have associated mass spectrometry data.
# The existence check is evaluated for every row first and the rows are then
# filtered in a single step: dropping rows one at a time inside a loop copies
# the whole DataFrame on every drop and depends on the default row labels.
source = path + '\\' + "Full_Dataset"
kind = ".csv"
has_spectrum = np.asarray(
    [os.path.exists(source + "\\" + str(n) + kind) for n in target["cas"]],
    dtype=bool,
)
# Keep only the rows whose data file exists and renumber them 0..N-1 so the
# rows are not called later (by position) with gaps while sorting training data
target = target.loc[has_spectrum].reset_index(drop=True)
# take a peek and make sure it looks correct
target.head()

Unnamed: 0,cas,alkane,methyl,alkene,alkyne,alcohol,amine,nitrile,aromatic,alkyl_halide,...,ketone,aldehyde,carboxylic_acid,ether,acyl_halide,amide,nitro,n_containing,o_containing,a_containing
0,100016,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,1,1,1
1,100027,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,1,1,1,1
2,1000493,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,100061,0,1,0,0,0,0,0,1,0,...,1,0,0,1,0,0,0,0,1,1
4,100072,0,1,0,0,0,0,0,1,1,...,0,0,0,1,1,0,0,0,1,1


### Now we can export this DataFrame into a csv for our training

In [6]:
# Save the cleaned target table next to the input data, without the row index
target.to_csv(
    path_or_buf=path + '\\' + 'target_corrected.csv',
    index=False,
)

In [10]:
# Mass range of interest: every spectrum is laid out on this grid (1-500 inclusive)
counter = list(range(1, 501))
# y_fill is the intensity used for every mass on the grid without a listed peak
y_fill = 0.0
# Location of the mass spec data
data_path = path + '\\' + 'Full_Dataset'
# Finished columns keyed by CAS number. The DataFrame is assembled from this
# in one step after the loop, because adding thousands of columns to a
# DataFrame one at a time fragments it and slows every further insert.
spectra_by_cas = {}
for f in target["cas"]:
    # Generate path to find that CAS number's data file
    path_i = data_path + "\\" + str(f) + ".csv"
    # Confirm that the file exists; CAS numbers without data are skipped
    if not os.path.exists(path_i):
        continue
    # pull data into a temporary dataframe (first column: mass, second: intensity)
    hold = pd.read_csv(path_i, header=0, engine='python')
    mass = hold.iloc[:, 0].tolist()
    y = hold.iloc[:, 1].tolist()

    # normalize the mass spec intensity with respect to the most intense peak
    # in the file (including peaks that fall outside the mass range)
    y_max = max(y)

    # Start with y_fill at every mass on the grid, then write in the listed
    # peaks. Peaks whose mass is not on the grid are left out, and a mass
    # listed more than once keeps its last value, so every column has exactly
    # one row per grid mass and rows stay aligned with the 'mass' column.
    spectrum = dict.fromkeys(counter, y_fill)
    for m, intensity in zip(mass, y):
        if m in spectrum:
            spectrum[m] = intensity / y_max
    # Store the intensities in ascending mass order
    spectra_by_cas[f] = [spectrum[m] for m in counter]

# Combine all of the columns into the total data dataframe
all_spectra = pd.DataFrame(spectra_by_cas)
# Add mass column to make the data file easier to interpret.
all_spectra.insert(0, 'mass', counter)

### Now we can export this DataFrame into a csv so we can reduce the computational time by not re-sorting all of the csv files every time we run the notebook.

In [11]:
# Save the combined spectra table (one column per CAS number) without the row index
all_spectra.to_csv(
    path_or_buf=path + '\\' + 'full_dataset.csv',
    index=False,
)

### Next we can pull in our experimental data to do the same process to that data as well!

In [12]:
# Folder on this computer that holds the experimental csv files
exp_path = path + '\\' + 'experimental_data'
# File type to collect from that folder
extension = 'csv'
# Work from inside the experimental folder so the relative pattern below resolves there
os.chdir(exp_path)
# File names (relative to the experimental folder) that match the chosen extension
result = glob.glob('*.' + extension)
# Full paths of every csv file found in the experimental folder
experimental_files = glob.glob(exp_path + "/*.csv")

In [13]:
# Show the list of experimental files found above to check it before processing
experimental_files

['C:\\Users\\Nicole\\OneDrive - The Ohio State University\\Desktop\\Current Work\\Papers\\MS ML Paper Rewrite\\Code\\experimental_data\\2Furanmethanol.CSV',
 'C:\\Users\\Nicole\\OneDrive - The Ohio State University\\Desktop\\Current Work\\Papers\\MS ML Paper Rewrite\\Code\\experimental_data\\Limonene.CSV',
 'C:\\Users\\Nicole\\OneDrive - The Ohio State University\\Desktop\\Current Work\\Papers\\MS ML Paper Rewrite\\Code\\experimental_data\\Pyridine.CSV']

In [15]:
# Mass range of interest: every experimental spectrum is laid out on this grid (1-400 inclusive)
# NOTE(review): the library spectra are laid out on 1-500; confirm that 1-400 is intended here
counter = list(range(1, 401))
# y_fill is the intensity used for every mass on the grid without a listed peak
y_fill = 0.0
# Finished columns keyed by compound name; the DataFrame is built once after the loop
spectra_by_name = {}
for file_path in experimental_files:
    # Column name is the file name without its folder and without its
    # extension, whatever the letter case of the extension is
    name = os.path.splitext(os.path.basename(file_path))[0]
    # pull data into a temporary dataframe (first column: mass, second: intensity)
    hold = pd.read_csv(file_path, header=0, engine='python')
    mass = hold.iloc[:, 0].tolist()
    y = hold.iloc[:, 1].tolist()
    # normalize the mass spec intensity with respect to the most intense peak
    # in the file (including peaks that fall outside the mass range)
    y_max = max(y)

    # Start with y_fill at every mass on the grid, then write in the listed
    # peaks. Peaks whose mass is not on the grid are left out, and a mass
    # listed more than once keeps its last value, so every column has exactly
    # one row per grid mass and rows stay aligned with the 'mass' column.
    spectrum = dict.fromkeys(counter, y_fill)
    for m, intensity in zip(mass, y):
        if m in spectrum:
            spectrum[m] = intensity / y_max
    # Store the intensities in ascending mass order
    spectra_by_name[name] = [spectrum[m] for m in counter]

# Combine all of the columns into the total data dataframe
experimental_spectra = pd.DataFrame(spectra_by_name)
# Add mass column to make the data file easier to interpret.
experimental_spectra.insert(0, 'mass', counter)
# Display only: set_index returns a new DataFrame, so experimental_spectra
# itself keeps 'mass' as an ordinary column
experimental_spectra.set_index('mass')

Unnamed: 0_level_0,2Furanmethanol,Limonene,Pyridine
mass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
5,0.0,0.0,0.0
...,...,...,...
396,0.0,0.0,0.0
397,0.0,0.0,0.0
398,0.0,0.0,0.0
399,0.0,0.0,0.0


In [16]:
# Save the experimental spectra table.
# NOTE(review): unlike the other exports in this notebook, the row index is
# written too (pandas default), which adds an unnamed first column to the
# file - confirm this is intended before changing it.
experimental_spectra.to_csv(
    path_or_buf=path + '\\' + 'experimental_spectra.csv',
    index=True,
)