In [None]:
'''Script to extract and sort data of mini ruedi measurments from .txt files for BLANKS'''

In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import plotly.express as px
import os
from datetime import datetime


from pathlib import Path

pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [210]:
#sample group directory that should contain all blanks
directory = "/Users/eart0477/Documents/mac_jupyter_files/yellowstone2022/Yellowstone22/raw_data/all_blanks/"

In [211]:
#sort files into categories from a folder. 
txt_files = []
for file in os.listdir(directory):
    if file.endswith(".txt"):
        txt_files.append(os.path.join(file))
        
sample_list = []
standard_list = []
blank_list = []

for string in txt_files:
    if "SAMPLE" in string:
        sample_list.append(string)
    elif "STANDARD" in string:
        standard_list.append(string)
    elif "BLANK" in string:
        blank_list.append(string)


In [212]:
blank_list

['2022-09-06_04-09-03_BLANK.txt',
 '2022-09-16_03-33-09_BLANK.txt',
 '2022-09-15_04-29-55_BLANK.txt',
 '2022-09-17_04-18-42_BLANK.txt',
 '2022-09-13_03-30-47_BLANK.txt',
 '2022-09-21_05-02-31_BLANK.txt',
 '2022-09-19_07-05-44_BLANK.txt',
 '2022-09-05_05-37-04_BLANK.txt',
 '2022-09-20_04-00-59_BLANK.txt',
 '2022-09-08_18-50-36_BLANK.txt']

In [213]:
def processing_function(file_path):   
    
    mz_values = set()

    with open(file_path) as f:



    #extract all mz values        
        for line in f:
            if 'mz=' in line and 'concentration=' not in line:
                mz_start = line.index('mz=') + 3
                space_index = line.index(' ', mz_start)
                mz = line[mz_start:space_index]
                mz_values.add(mz)

    mz_array = list(mz_values)


    #Scan through txt to extract detector and intesity values of every line and store in a nested list grouped by mz

    combined_lists = {mz: {'F': {'PEAK': [], 'ZERO': []}, 'M': {'PEAK': [], 'ZERO': []}} for mz in mz_array}

    with open(file_path) as f:
        for line in f:
            if 'mz=' in line and 'concentration=' not in line:
                mz_start = line.index('mz=') + 3
                space_index = line.index(' ', mz_start)
                mz = line[mz_start:space_index]

                detector_start = line.index('detector=') + 9
                space_index = line.index(' ', detector_start)
                detector = line[detector_start:space_index]

                intensity_start = line.index('intensity=') + 10
                space_index = line.index(' ', intensity_start)
                intensity = line[intensity_start:space_index]

                reading = 'PEAK' if 'PEAK' in line else 'ZERO'

                combined_lists[mz][detector][reading].append(intensity)

    #make a dataframe 

    df = pd.DataFrame(columns=['mz', 'detector', 'reading', 'intensity'])

    for mz in combined_lists:
        for detector in combined_lists[mz]:
            for reading in combined_lists[mz][detector]:
                for intensity in combined_lists[mz][detector][reading]:
                    df = df.append({'mz': mz, 'detector': detector, 'reading': reading, 'intensity': intensity}, ignore_index=True)


    #make sure intensity is numeric                
    df['intensity'] = pd.to_numeric(df['intensity'])

    
     #find sample_name  
    with open(file_path) as f:
        
        # Extract the base filename and extension from the file path
        filename, ext = os.path.splitext(os.path.basename(file_path))

        # Remove spaces from the filename
        file_name = filename.replace(' ', '')
        # find sample_name
        for line in f:
            if 'SAMPLENAME:' in line:
                name_start = line.index('SAMPLENAME:') + 12
                space_index = line.index(' ', name_start)
                sample_name = line[name_start:space_index].strip()
                break
            else:
                sample_name = None 

        # if 'SAMPLENAME:' not found, scan the file again to find 'ANALYSISTYPE:'
        if sample_name is None:
                sample_name = file_name
               
            

       


        return sample_name, df, file_name




In [214]:
def find_mean(sample_name, df):     
    #find mean and std of measurments

        # group the data frame by the 'mz', 'detector' columns
        grouped = df.groupby(['mz', 'detector', 'reading'])

        # calculate the mean and standard deviation of the 'intensity' column for each group
        result = grouped['intensity'].agg(['mean', 'std'])

        # reset the index of the result data frame to make the 'mz' and 'detector' columns columns again
        result = result.reset_index()
        
        peak_F = result[(result['detector'] == 'F') & (result['reading'] == 'PEAK')]



        #Selecting background on M detector
        peak_M = result[(result['detector'] == 'M') & (result['reading'] == 'PEAK')]
        
        final = pd.concat([peak_F, peak_M])
        


        # Initialize an empty dictionary to store the reformatted data
        data_dict = {}

        # Loop through the rows of the original dataframe
        for i, row in final.iterrows():
            # Extract the mz value
            mz = row['mz']

            # Construct column names for mean and std
            mean_col = f"{mz}_F_mean"
            std_col = f"{mz}_F_std"

            # Extract mean and std values
            mean_val = row['mean']
            std_val = row['std']

            # Add the mean and std values to the data_dict
            data_dict[mean_col] = mean_val
            data_dict[std_col] = std_val

        # Create a new dataframe from the data_dict
        new_df = pd.DataFrame([data_dict], index=[sample_name])

      
        
        return final

In [218]:
# Define a list of column names
cols = ['date','sample_name' ,'4_F_mean', '4_F_std', '4_M_mean', '4_M_std', '28_F_mean', '28_F_std', '32_F_mean', '32_F_std',
        '36_M_mean', '36_M_std','40_F_mean', '40_F_std', '40_M_mean', '40_M_std',
        '44_F_mean', '44_F_std',  '84_M_mean', '84_M_std', '14_F_mean', '14_F_std', '15_F_mean', '15_F_std', '16_F_mean', '16_F_std',
        '18_F_mean', '18_F_std',]

# Create an empty dataframe with the specified columns
full_df = pd.DataFrame(columns=cols)

# Iterate through a list of standards
standard_count = 0        
for i in blank_list: 
    standard_count += 1
    path = directory + i
        
    # Run all processing functions for each sample
    sample_name, df,file_name = processing_function(path)
    result = find_mean(sample_name, df)
        
    # Initialize an empty dictionary to store the reformatted data
    data_dict = {}
    
    

    # Loop through the rows of the original dataframe
    for i, row in result.iterrows():
        # Extract the mz value
        mz = row['mz']
        detector = row['detector']

        # Construct column names for mean and std
        mean_col = f"{mz}_{detector}_mean"
        std_col = f"{mz}_{detector}_std"

        # Extract mean and std values
        mean_val = row['mean']
        std_val = row['std']

        # Add the mean and std values to the data_dict
        data_dict[mean_col] = mean_val
        data_dict[std_col] = std_val
        data_dict['sample_name']=sample_name
        
        
    

    # Create a new dataframe from the data_dict
    new_df = pd.DataFrame([data_dict], index=[file_name])

    # Append the new row to the full_df dataframe
    full_df = full_df.append(new_df)





The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a

In [221]:
for i, index in full_df.iterrows():
    dt_obj = datetime.strptime(i, '%Y-%m-%d_%H-%M-%S_BLANK')
    full_df.loc[i, 'date'] = dt_obj
    full_df.loc[i, 'day'] = dt_obj.day

full_df.sort_values(by='date', inplace=True)


In [223]:
# Save the dataframe
filename = 'all_yellowstone_blanks.xlsx'
full_df.to_excel(filename)


In [222]:
full_df

Unnamed: 0,date,sample_name,4_F_mean,4_F_std,4_M_mean,4_M_std,28_F_mean,28_F_std,32_F_mean,32_F_std,36_M_mean,36_M_std,40_F_mean,40_F_std,40_M_mean,40_M_std,44_F_mean,44_F_std,84_M_mean,84_M_std,14_F_mean,14_F_std,15_F_mean,15_F_std,16_F_mean,16_F_std,18_F_mean,18_F_std,20_M_mean,20_M_std,20_F_mean,20_F_std,36_F_mean,36_F_std,44_M_mean,44_M_std,day
2022-09-05_05-37-04_BLANK,2022-09-05 05:37:04,2022-09-05_05-37-04_BLANK,7.5e-15,6.203628e-15,-4.5066e-12,1.254995e-14,5.032e-14,5.552657e-15,5.914e-14,4.286374e-15,,,2.12e-15,6.669108e-15,,,1.2056e-13,5.498454e-15,-4.60572e-12,1.115889e-14,1.162e-14,5.687442e-15,7.64e-15,4.867546e-15,6.626e-14,8.992386e-15,1.67306e-12,1.340518e-13,,,5.44e-15,5.083601e-15,4.86e-15,5.13303e-15,7.256639e-10,1.92212e-11,5.0
2022-09-06_04-09-03_BLANK,2022-09-06 04:09:03,2022-09-06_04-09-03_BLANK,-3.63e-14,8.045806e-15,-1.60438e-12,1.690047e-14,2.752e-14,9.992347e-15,7.638e-14,4.599674e-15,6.79705e-12,4.662604e-13,-2.708e-14,8.707296e-15,6.2369e-13,1.117106e-13,8.906e-14,7.833135e-15,-1.60764e-12,4.381347e-14,-3.06e-14,3.761649e-15,-2.322e-14,1.173784e-14,1.0426e-13,1.442248e-14,4.4503e-12,2.185292e-13,3.314747e-11,1.507076e-12,,,,,,,6.0
2022-09-08_18-50-36_BLANK,2022-09-08 18:50:36,2022-09-08_18-50-36_BLANK,-5.163333e-14,2.050203e-15,-1.649067e-12,1.326483e-14,-9.166667e-15,1.923131e-14,3.26e-14,2.884441e-15,1.804883e-12,6.203758e-14,-4.64e-14,5.597321e-15,-8.454333e-13,6.043745e-14,3.14e-14,9.777014e-15,-1.659917e-12,2.094525e-14,-4.82e-14,2.622975e-15,-4.373333e-14,7.993956e-15,2.483333e-14,6.951499e-15,2.492767e-12,8.690468e-14,1.73168e-11,6.176277e-13,,,,,,,8.0
2022-09-13_03-30-47_BLANK,2022-09-13 03:30:47,2022-09-13_03-30-47_BLANK,8.2e-15,7.529276e-15,-1.96506e-12,2.903038e-14,2.796e-14,7.823235e-15,8.77e-14,1.556776e-14,-3.5455e-13,1.492944e-13,1.178e-14,7.842959e-15,-1.72811e-12,3.760617e-14,6.876e-14,1.162618e-14,-2.10079e-12,9.093363e-15,1.632e-14,7.224403e-15,1.186e-14,4.242994e-15,5.67e-14,1.531388e-14,1.51972e-12,1.214569e-13,1.07546e-11,7.77298e-13,,,,,,,13.0
2022-09-15_04-29-55_BLANK,2022-09-15 04:29:55,2022-09-15_04-29-55_BLANK,-7.124e-14,3.167491e-15,-2.88884e-12,2.519368e-14,2.754e-14,1.12438e-14,8.3e-14,9.021364e-15,7.2083e-13,1.372966e-13,-7.02e-14,7.395607e-15,-9.0986e-13,1.690398e-13,2.389e-13,3.356538e-14,-2.96599e-12,3.973537e-14,-7.068e-14,3.396616e-15,-7.312e-14,3.153886e-15,1.2708e-13,1.19326e-14,6.77454e-12,2.916737e-13,5.712746e-11,2.474412e-12,,,,,,,15.0
2022-09-16_03-33-09_BLANK,2022-09-16 03:33:09,2022-09-16_03-33-09_BLANK,-7.636e-14,7.681992e-15,-3.42568e-12,2.443759e-14,-3.758e-14,3.505282e-15,4.606e-14,3.367195e-15,-1.33101e-12,1.194732e-13,-7.496e-14,6.027271e-15,-2.57878e-12,9.609537e-14,6.222e-14,1.190071e-14,-3.56027e-12,2.17603e-14,-7.654e-14,5.210374e-15,-7.224e-14,3.951329e-15,5.034e-14,1.59915e-14,4.18266e-12,1.95636e-13,3.422771e-11,1.859098e-12,,,,,,,16.0
2022-09-17_04-18-42_BLANK,2022-09-17 04:18:42,2022-09-17_04-18-42_BLANK,4.96e-15,5.085568e-15,-3.89554e-12,2.699906e-14,1.544e-14,4.364974e-15,8.114e-14,1.524264e-14,-3.00843e-12,6.743075e-14,4.6e-15,3.957272e-15,-3.59956e-12,3.733175e-14,5.404e-14,6.003582e-15,-3.99395e-12,2.343152e-14,3.14e-15,5.737857e-15,3.58e-15,4.120316e-15,4.314e-14,6.614605e-15,1.43008e-12,1.031763e-13,1.081744e-11,4.965943e-13,,,,,,,17.0
2022-09-19_07-05-44_BLANK,2022-09-19 07:05:44,2022-09-19_07-05-44_BLANK,4.12e-15,6.154023e-15,-2.85448e-12,1.492622e-14,1.634e-14,8.186452e-15,9.116e-14,8.194388e-15,-2.02551e-12,5.676256e-14,2.18e-15,2.88652e-15,-2.53413e-12,2.827131e-14,5.386e-14,5.016772e-15,-2.94434e-12,1.036626e-14,5.94e-15,2.53239e-15,5.7e-15,6.757588e-15,5.598e-14,1.426401e-14,1.59106e-12,1.0585e-13,1.363566e-11,5.976595e-13,,,,,,,19.0
2022-09-20_04-00-59_BLANK,2022-09-20 04:00:59,2022-09-20_04-00-59_BLANK,-7.962e-14,8.326584e-15,-2.49636e-12,2.387544e-14,1.3044e-13,1.495069e-14,1.058e-14,3.077661e-15,1.28627e-12,1.165509e-13,-7.504e-14,1.748714e-15,7.04958e-12,5.42308e-13,4.5362e-13,4.492051e-14,-1.60434e-12,4.994671e-14,-7.14e-14,1.79722e-15,-7.048e-14,4.882827e-15,1.1672e-13,1.510123e-14,6.04846e-12,2.330284e-13,5.961019e-11,2.084026e-12,,,,,,,20.0
2022-09-21_05-02-31_BLANK,2022-09-21 05:02:31,2022-09-21_05-02-31_BLANK,-2.66e-15,2.56963e-15,-1.83516e-12,1.432639e-14,4.02e-15,8.136768e-15,3.902e-14,7.781838e-15,-1.05023e-12,7.060663e-14,-4.62e-15,5.245188e-15,-1.59322e-12,2.44338e-14,2.692e-14,6.759956e-15,-1.914e-12,1.137805e-14,4.82e-15,4.896631e-15,-1.04e-15,3.783913e-15,2.174e-14,6.23402e-15,6.9196e-13,4.260403e-14,5.14122e-12,5.387606e-13,,,,,,,21.0
