<a href="https://colab.research.google.com/github/TillVollmer5/mass_spectroscopy/blob/main/Literature_match_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
import matplotlib.pyplot as plt
import numpy as np
from google.colab import drive
import pandas as pd
import seaborn as sns

drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [50]:
Auto_5_gp_df = pd.read_csv('/content/drive/My Drive/unknown_screening_data/Auto5_PUF_120k_185_TF_corrected.csv')
QC_df = pd.read_csv('/content/drive/My Drive/unknown_screening_data/QC_PUF4_2_120k_159_TF_corrected.csv')

In [51]:
Auto_5_gp_df['Ion Type'] = 'pm'
Auto_5_gp_df['Sample Name'] = 'Auto_5'

In [59]:
print(Auto_5_gp_df)

                                        Component Name  Retention Time  \
0                         2,6-Octadiene, 2,4-dimethyl-          10.704   
1                       trans-1,2-Diethyl cyclopentane          10.724   
2                           Nonane, 5-methyl-5-propyl-          10.755   
3                 Cycloheptasiloxane, tetradecamethyl-          10.867   
4    (2,6,6-Trimethylcyclohex-1-enylmethanesulfonyl...          10.908   
..                                                 ...             ...   
254                    2-Pentene, 5-(pentyloxy)-, (E)-          31.235   
255                               Triacontane, 1-iodo-          31.368   
256                          Pentane, 2,2,4-trimethyl-          31.667   
257            Benzenamine, 4-octyl-N-(4-octylphenyl)-          33.542   
258  Benzenepropanoic acid, 3,5-bis(1,1-dimethyleth...          33.695   

     Reference m/z        Area     Height         TIC Formula (mol ion)  \
0        83.085503   519824310   684

In [53]:
def area_ratio(df):
    """
    Calculates ratio of area to the internal standard and saves it in a new column
    """
    benzyl_benzoate_area = df.loc[df['Component Name'] == 'Benzyl Benzoate', 'Area'].values[0]
    df['Area Ratio'] = df['Area'] / benzyl_benzoate_area

    return df

def filter_dataframe(df):
    """
    Removing the results below the probability threshold that is set below
    """
    filtered_df = df[df['Total Score'] >= 60]
    return filtered_df

def rearrange_by_area_ratio(df):
    """
    Rearrearanging the rows in descending fashion from the peak area ratio
    """
    if 'Area Ratio' not in df.columns:
        raise ValueError("DataFrame must have a 'Area Ratio' column")

    # Sort the DataFrame based on the 'Area Ratio' column in descending order
    df_sorted = df.sort_values(by='Area Ratio', ascending=False)

    # Reset the index to reflect the new order
    df_sorted = df_sorted.reset_index(drop=True)

    return df_sorted

def find_matching_row(sample_df, ref_df, threshold):
    """
    Function removing the rows that are in the blank under the condition given
    """
    result_df = pd.DataFrame(columns=sample_df.columns)
    matched_indices = set()

    for _, sample_row in sample_df.iterrows():
        matching_rows = find_matching_rows(sample_row, ref_df, threshold)
        if not matching_rows.empty:
            best_match = find_best_match(sample_row, matching_rows)
            if best_match is not None and best_match['Area Ratio'] >= 0.05:
                matched_indices.add(best_match.name)

    result_df = sample_df[~sample_df.index.isin(matched_indices)]
    return result_df

def find_comon_row(sample_df, ref_df, threshold):
    """
    Function showing what compounds were in the sample and blank that matched
    """
    result_df = pd.DataFrame(columns=sample_df.columns)

    for _, sample_row in sample_df.iterrows():
        matching_rows = find_matching_rows(sample_row, ref_df, threshold)
        if not matching_rows.empty:
            best_match = find_best_match(sample_row, matching_rows)
            if best_match is not None and best_match['Area Ratio'] >= 0.05:
                result_df = result_df.append(sample_row, ignore_index=True)

    return result_df

def find_matching_rows(sample_row, ref_df, threshold):
    """
    Retention time threshold matching
    """
    matching_rows = ref_df[
        (ref_df['Retention Time'] >= sample_row['Retention Time'] - threshold) &
        (ref_df['Retention Time'] <= sample_row['Retention Time'] + threshold)
    ]
    return matching_rows

def find_best_match(sample_row, matching_rows):
    """
    Compound comparison for the exclusion parameters
    """
    best_match = None
    min_diff = float('inf')

    for _, ref_row in matching_rows.iterrows():
        diff = compare_strings(sample_row['Formula (mol ion)'], ref_row['Formula (mol ion)'])
        if diff < min_diff:
            min_diff = diff
            best_match = ref_row

    return best_match

def compare_strings(str1, str2):
    # Implement your string comparison logic here
    # You can use fuzzy string matching or any other suitable method
    # For simplicity, let's assume an exact match for now
    return 0 if str1 == str2 else 1

In [54]:
Auto_5_gp_df = area_ratio(Auto_5_gp_df)
QC_df = area_ratio(QC_df)
#print(Auto_5_gp_df.head())
#print(QC_df.head())

In [55]:
threshold_value = 0.05 # Set your desired threshold value

result_df = find_matching_row(Auto_5_gp_df, QC_df, threshold_value)
print(result_df)
result_df.to_csv('/content/drive/My Drive/unknown_screening_data/Auto5_PUF_120k_blank_corrected.csv')

                                        Component Name  Retention Time  \
0                         2,6-Octadiene, 2,4-dimethyl-          10.704   
1                       trans-1,2-Diethyl cyclopentane          10.724   
2                           Nonane, 5-methyl-5-propyl-          10.755   
3                 Cycloheptasiloxane, tetradecamethyl-          10.867   
4    (2,6,6-Trimethylcyclohex-1-enylmethanesulfonyl...          10.908   
..                                                 ...             ...   
252  Phenol, 2,4-bis(1,1-dimethylethyl)-, phosphite...          31.184   
253  1-((3-chlorophenyl)sulfonyl)piperazine, N-acetyl-          31.225   
255                               Triacontane, 1-iodo-          31.368   
256                          Pentane, 2,2,4-trimethyl-          31.667   
258  Benzenepropanoic acid, 3,5-bis(1,1-dimethyleth...          33.695   

     Reference m/z        Area     Height         TIC Formula (mol ion)  \
0        83.085503   519824310   684

In [56]:
comon_df = find_comon_row(Auto_5_gp_df, QC_df, threshold_value)
print(comon_df)
comon_df.to_csv('/content/drive/My Drive/unknown_screening_data/Auto5_PUF_120k_blank_matches.csv')

  result_df = result_df.append(sample_row, ignore_index=True)


                                       Component Name  Retention Time  \
0                           Silane, dodecyltriethoxy-          12.380   
1                            Butylated Hydroxytoluene          13.668   
2                             2,4-Di-tert-butylphenol          13.719   
3     Terephthalic acid, tridec-2-yn-1-yl ethyl ester          13.770   
4                                          1-Docosene          14.323   
5                                   o-Hydroxybiphenyl          14.867   
6   1,4-Bis[2-[N-[6-methoxy-8-quinolyl]amino]propi...          14.898   
7                            Benzene, (1-ethylbutyl)-          15.504   
8                                 Eicosane, 2-methyl-          16.152   
9                      Benzene, 1,1'-oxybis[4-methyl-          16.245   
10  1,4-Methanobenzocyclodecene, 1,2,3,4,4a,5,8,9,...          16.275   
11                            Undecane, 3,9-dimethyl-          16.409   
12            Succinic acid, di(3-methylphenyl) est

In [57]:
rearranged_df = rearrange_by_area_ratio(result_df)
print(rearranged_df)
rearranged_df.to_csv('/content/drive/My Drive/unknown_screening_data/Auto5_PUF_120k_blank_rearranged.csv')

                                        Component Name  Retention Time  \
0                             Butylated Hydroxytoluene          13.668   
1    Phenol, 2,4-bis(1,1-dimethylethyl)-, phosphite...          31.184   
2                              2,4-Di-tert-butylphenol          13.719   
3    Benzenepropanoic acid, 3,5-bis(1,1-dimethyleth...          33.695   
4                   Carbonic acid, undecyl vinyl ester          11.236   
..                                                 ...             ...   
240  Methyl 2-ethyl-1,2,3,4-tetrahydro-.alpha.-(met...          23.218   
241                                       Thioxanthene          20.189   
242                        Phenanthrene, 3,6-dimethyl-          22.455   
243                 cyclohexane, [1,1'-biphenyl]-4-yl-          22.497   
244  Glycyl-L-valine, N-dimethylaminomethylene-, me...          24.327   

     Reference m/z         Area      Height          TIC Formula (mol ion)  \
0       205.158646  18178762411  

In [58]:
filtered_df = filter_dataframe(rearranged_df)
print(filtered_df)
filtered_df.to_csv('/content/drive/My Drive/unknown_screening_data/Auto5_PUF_120k_blank_filtered.csv')

                                        Component Name  Retention Time  \
0                             Butylated Hydroxytoluene          13.668   
1    Phenol, 2,4-bis(1,1-dimethylethyl)-, phosphite...          31.184   
2                              2,4-Di-tert-butylphenol          13.719   
3    Benzenepropanoic acid, 3,5-bis(1,1-dimethyleth...          33.695   
4                   Carbonic acid, undecyl vinyl ester          11.236   
..                                                 ...             ...   
240  Methyl 2-ethyl-1,2,3,4-tetrahydro-.alpha.-(met...          23.218   
241                                       Thioxanthene          20.189   
242                        Phenanthrene, 3,6-dimethyl-          22.455   
243                 cyclohexane, [1,1'-biphenyl]-4-yl-          22.497   
244  Glycyl-L-valine, N-dimethylaminomethylene-, me...          24.327   

     Reference m/z         Area      Height          TIC Formula (mol ion)  \
0       205.158646  18178762411  