In [53]:
import pandas as pd
import numpy as np
from glob import glob

This notebook takes all the test and training samples and adds labels using the data avialable in the GCMS and EGAMS contests. The function takes the filenames which correspond to the sample id and then add labels which are in two sets:
1. label: "basalt, carbonate, chloride, iron_oxide, oxalate, oxychlorine, phyllosilicate, silicate, sulfate, sulfide"
2. label_1: "aromatic, hydrocarbon, carboxylic acid, nitrogen_bearing_compound, chlorine_bearing_compound, sulfur_bearing_compound, alcohol, other_oxygen_bearing_compound, mineral

The generated dataframe includes four columns such that:
column_1 : index_id (starting from 1 to the number of samples)
column_2 : sample_id from GSFC
column_3 : sample data with columns representing 'time' at which the measurement is made, 'mass' of the module, and 'intensity' of the sample released
column_4 : label set 1
column_5 : label_1 set 2

In [54]:
# change the file names, and label names for the dataset you want to use
dir_samples      = '/home/arushi/Desktop/FDL_2024/Data_from_Goddard/GCMS_challenge/train_features-003/'
dir_label_name   = '/home/arushi/Desktop/FDL_2024/Data_from_Goddard/GCMS_challenge/GCMS_challenge-20240617T192949Z-002/GCMS_challenge/'

In [55]:
# test to see if we get the sample id from the filename correctly
file_names  = glob(dir_samples + 'train_features/' + '*.csv')
label_file  = dir_label_name + 'train_labels.csv'

In [56]:
# this cell is just for previweing how the output data would be structured
num = 1

data_file           = np.loadtxt(file_names[0], skiprows=1, delimiter=",")
df = pd.read_csv(label_file) 
sample_id           = file_names[0].split('/')[-1].split('.')[0]
df[df['sample_id'] == file_names[0].split('/')[-1].split('.')[0]].iloc[0][1:].to_dict()
my_dict =  {'Indx': num, 'sample_id': sample_id, 'Data': {'time': [data_file[:, 0]], \
                                                          'mass': [data_file[:, 1]], \
                                                          'intensity': [data_file[:, 2]]}, \
           'label': df[df['sample_id'] == file_names[0].split('/')[-1].split('.')[0]].iloc[0][1:].to_dict()}
my_dict

{'Indx': 1,
 'sample_id': 'S0048',
 'Data': {'time': [array([ 0.0422  ,  0.0422  ,  0.0422  , ..., 36.015633, 36.015633,
          36.015633])],
  'mass': [array([ 50.279495,  51.120758,  52.053543, ..., 531.709717, 533.026001,
          534.64978 ])],
  'intensity': [array([12810., 24765., 16562., ...,  1059.,   983.,  4678.])]},
 'label': {'aromatic': 1,
  'hydrocarbon': 0,
  'carboxylic_acid': 0,
  'nitrogen_bearing_compound': 0,
  'chlorine_bearing_compound': 0,
  'sulfur_bearing_compound': 0,
  'alcohol': 0,
  'other_oxygen_bearing_compound': 0,
  'mineral': 0}}

In [57]:
def get_data_cube (filename, num):
    '''
    This function takes the mass spectrometer data from the sample and generates 
    a dictionary with the columns specifying the index, sample_id, mass spec data, 
    and lables. We do this to create a merged data from all the samples with their
    labels attached
    
    Input Parameters
        ----------
        filename : str
            The name of the filename (same as the sample id)
        num : int
            The index field which counts the samples as we loop through them
            
    Output
        ----------
        dictionary item containing the formatted data so that we can read into the panda
        dataframe and eventually make a data frame containing all the sampels.

    '''
    
    # read the data as an numpy array because later we want to merge it
    data_file           = np.loadtxt(filename, skiprows=1, delimiter=",")
    sample_id           = filename.split('/')[-1].split('.')[0]
    dict_out            = {}
    egams               = True # set this to True if the data is from GCMS

    df = pd.read_csv(label_file)             
    indx = df['sample_id'] == sample_id
    
    if (len(indx[indx]) > 0 and not egams):
        dict_out =  {'Indx': num, 'sample_id': sample_id, 'Data': {'time': [data_file[:, 0]], \
                                                                    'mass': [data_file[:, 1]], \
                                                                    'intensity': [data_file[:, 2]]}, \
                    'label': df[df['sample_id'] == sample_id].iloc[0][1:].to_dict()}
    elif (len(indx[indx]) > 0 and not egams):
        dict_out =  {'Indx': num, 'sample_id': sample_id, 'Data': {'time': [data_file[:, 0]], \
                                                                   'temp': [data_file[:, 1]], \
                                                                   'mass_over_charge': [data_file[:, 2]], \
                                                                   'abundance': [data_file[:, 3]]}, \
                    'label': df[df['sample_id'] == sample_id].iloc[0][1:].to_dict()}     

            
    return dict_out    

In [58]:
pd_data_out  =  pd.DataFrame(get_data_cube(file_names[0], 0), index=[0])

for i in range(len(file_names)):
    pd_data_out.loc[i] = get_data_cube(file_names[i], i)
    

IndexError: index 3 is out of bounds for axis 1 with size 3

In [52]:
pd_data_out.to_csv('train_data_gcms.csv', sep=',', index=True, encoding='utf-8')

In [23]:
len(pd_data_out)

447