<a href="https://colab.research.google.com/github/TillVollmer5/mass_spectroscopy/blob/main/Literature_match_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from google.colab import drive
import pandas as pd
import seaborn as sns

# Make Google Drive visible under /content/drive so the CSV exports can be read.
# force_remount=True requests a fresh mount even if Drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Single place to edit if the screening exports move to another Drive folder.
DATA_DIR = '/content/drive/My Drive/unknown_screening_data'

# Sample table and the QC table it is compared against further down.
Auto_5_gp_df = pd.read_csv(f'{DATA_DIR}/Auto5_PUF_120k_185_TF_corrected.csv')
QC_df = pd.read_csv(f'{DATA_DIR}/QC_PUF4_2_120k_159_TF_corrected.csv')

In [None]:
# Label every row of the sample table with a constant ion type and sample name.
Auto_5_gp_df = Auto_5_gp_df.assign(**{'Ion Type': 'pm', 'Sample Name': 'Auto_5'})

In [None]:
# Bare last expression so the notebook renders the preview as a table
# instead of wrapped plain text.
Auto_5_gp_df.head()

                                      Component Name  Retention Time  \
0                       2,6-Octadiene, 2,4-dimethyl-          10.704   
1                     trans-1,2-Diethyl cyclopentane          10.724   
2                         Nonane, 5-methyl-5-propyl-          10.755   
3               Cycloheptasiloxane, tetradecamethyl-          10.867   
4  (2,6,6-Trimethylcyclohex-1-enylmethanesulfonyl...          10.908   

   Reference m/z       Area     Height        TIC Formula (mol ion)  \
0      83.085503  519824310   68441127  136144784            C10H18   
1      97.101181  186944428   33988386  111464602             C9H18   
2      43.054165  749753403  135061658  647096634            C13H28   
3     281.051147  829942379  150727858  760637037       C14H42O7Si7   
4      81.069908  178765554   33558875   97384256         C16H22O2S   

      CAS No.   SI  RSI  ...  Total Score  Selected Column Type  \
0  63843-03-8  776  856  ...         72.2                  None   
1    9

In [None]:
def area_ratio(df, internal_standard='Benzyl Benzoate'):
    """
    Divide every peak area by the area of the internal standard.

    The result is stored in an 'Area Ratio' column. The frame is modified in
    place (the column is added or overwritten) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Component Name' and 'Area' columns.
    internal_standard : str, optional
        Value of 'Component Name' that identifies the internal standard row.
        Defaults to 'Benzyl Benzoate'. If several rows carry that name, the
        first one is used.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, now carrying the 'Area Ratio' column.

    Raises
    ------
    IndexError
        If no row is named ``internal_standard``.
    """
    standard_areas = df.loc[df['Component Name'] == internal_standard, 'Area']
    if standard_areas.empty:
        # Keep the exception type a bare positional lookup would raise, but
        # say which compound is missing instead of "index 0 is out of bounds".
        raise IndexError(
            f"Internal standard {internal_standard!r} not found in 'Component Name'"
        )

    df['Area Ratio'] = df['Area'] / standard_areas.iloc[0]

    return df

def find_matching_row(sample_df, ref_df, threshold, min_area_ratio=0.05):
    """
    Return the sample rows that are NOT matched by a row of the blank/reference.

    For each sample row, reference rows inside the retention-time window are
    collected with ``find_matching_rows`` and one of them is chosen with
    ``find_best_match``. The sample row is dropped when that chosen reference
    row has an 'Area Ratio' of at least ``min_area_ratio``.

    Parameters
    ----------
    sample_df : pandas.DataFrame
        Sample table; iterated row by row.
    ref_df : pandas.DataFrame
        Blank/reference table; must carry an 'Area Ratio' column.
    threshold : float
        Half-width of the retention-time window passed to ``find_matching_rows``.
    min_area_ratio : float, optional
        Minimum 'Area Ratio' of the chosen reference row for the sample row
        to be removed. Defaults to 0.05.

    Returns
    -------
    pandas.DataFrame
        The rows of ``sample_df`` that were not removed, with their original
        index labels.
    """
    matched_indices = set()

    for sample_index, sample_row in sample_df.iterrows():
        matching_rows = find_matching_rows(sample_row, ref_df, threshold)
        if matching_rows.empty:
            continue

        best_match = find_best_match(sample_row, matching_rows)
        if best_match is not None and best_match['Area Ratio'] >= min_area_ratio:
            # Record the SAMPLE row's label. ``best_match.name`` is a label of
            # ref_df, and filtering sample_df with it drops unrelated rows.
            matched_indices.add(sample_index)

    return sample_df[~sample_df.index.isin(matched_indices)]

def find_comon_row(sample_df, ref_df, threshold, min_area_ratio=0.01):
    """
    Return the sample rows that have a qualifying match in the blank/reference.

    For each sample row, reference rows inside the retention-time window are
    collected with ``find_matching_rows`` and one of them is chosen with
    ``find_best_match``. The sample row is kept when that chosen reference
    row has an 'Area Ratio' of at least ``min_area_ratio``.

    Parameters
    ----------
    sample_df : pandas.DataFrame
        Sample table; iterated row by row.
    ref_df : pandas.DataFrame
        Blank/reference table; must carry an 'Area Ratio' column.
    threshold : float
        Half-width of the retention-time window passed to ``find_matching_rows``.
    min_area_ratio : float, optional
        Minimum 'Area Ratio' of the chosen reference row for the sample row
        to be reported. Defaults to 0.01.
        NOTE(review): find_matching_row compares against 0.05 for the same
        check; confirm the difference is intentional.

    Returns
    -------
    pandas.DataFrame
        Matching rows of ``sample_df`` in their original order, re-indexed
        0..n-1, with the same columns (and column dtypes) as ``sample_df``.
        Empty, but with the same columns, when nothing matches.
    """
    matched_positions = []

    for position, (_, sample_row) in enumerate(sample_df.iterrows()):
        matching_rows = find_matching_rows(sample_row, ref_df, threshold)
        if matching_rows.empty:
            continue

        best_match = find_best_match(sample_row, matching_rows)
        if best_match is not None and best_match['Area Ratio'] >= min_area_ratio:
            matched_positions.append(position)

    # One positional slice instead of DataFrame.append per row: append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    return sample_df.iloc[matched_positions].reset_index(drop=True)

def find_matching_rows(sample_row, ref_df, threshold):
    """
    Select reference rows whose retention time is close to the sample's.

    A reference row qualifies when its 'Retention Time' lies within
    ``threshold`` of ``sample_row['Retention Time']``, both bounds included.
    Returns the qualifying subset of ``ref_df`` (possibly empty).
    """
    centre = sample_row['Retention Time']
    lower, upper = centre - threshold, centre + threshold
    in_window = ref_df['Retention Time'].between(lower, upper)
    return ref_df[in_window]

def find_best_match(sample_row, matching_rows):
    """
    Choose the candidate row whose component name differs least from the sample's.

    Every row of ``matching_rows`` is scored with ``compare_strings`` against
    ``sample_row['Component Name']``; the row with the lowest score is
    returned, and on ties the earliest row wins. Returns ``None`` when
    ``matching_rows`` has no rows.
    """
    candidates = [ref_row for _, ref_row in matching_rows.iterrows()]
    scores = [
        compare_strings(sample_row['Component Name'], candidate['Component Name'])
        for candidate in candidates
    ]

    winner = None
    lowest = float('inf')
    for candidate, score in zip(candidates, scores):
        # Strict comparison keeps the first of several equally good candidates.
        if score < lowest:
            winner, lowest = candidate, score

    return winner

def compare_strings(str1, str2):
    """
    Score how different two names are; lower means closer.

    Placeholder metric based on exact equality only: 0 when the two values
    compare equal, 1 otherwise. Fuzzy string matching or any other suitable
    method can be substituted here.
    """
    if str1 == str2:
        return 0
    return 1

In [None]:
# Add the 'Area Ratio' column (area relative to the internal standard) to both tables.
Auto_5_gp_df, QC_df = map(area_ratio, (Auto_5_gp_df, QC_df))

In [None]:
# Half-width of the retention-time window used for matching, in the same units
# as the 'Retention Time' column. Set your desired threshold value here.
threshold_value = 0.3

# Sample rows left over after screening against the QC table. The matched rows
# are computed and shown in the following cell, so they are not computed here too.
result_df = find_matching_row(Auto_5_gp_df, QC_df, threshold_value)
result_df

                                        Component Name  Retention Time  \
0                         2,6-Octadiene, 2,4-dimethyl-          10.704   
1                       trans-1,2-Diethyl cyclopentane          10.724   
2                           Nonane, 5-methyl-5-propyl-          10.755   
3                 Cycloheptasiloxane, tetradecamethyl-          10.867   
4    (2,6,6-Trimethylcyclohex-1-enylmethanesulfonyl...          10.908   
..                                                 ...             ...   
252  Phenol, 2,4-bis(1,1-dimethylethyl)-, phosphite...          31.184   
253  1-((3-chlorophenyl)sulfonyl)piperazine, N-acetyl-          31.225   
255                               Triacontane, 1-iodo-          31.368   
256                          Pentane, 2,2,4-trimethyl-          31.667   
258  Benzenepropanoic acid, 3,5-bis(1,1-dimethyleth...          33.695   

     Reference m/z        Area     Height         TIC Formula (mol ion)  \
0        83.085503   519824310   684

  result_df = result_df.append(sample_row, ignore_index=True)


In [None]:
# Sample rows that also have a qualifying match in the QC table; the bare last
# expression renders them as a table instead of wrapped plain text.
comon_df = find_comon_row(Auto_5_gp_df, QC_df, threshold_value)
comon_df

  result_df = result_df.append(sample_row, ignore_index=True)


                                       Component Name  Retention Time  \
0                            1H-Indene, 1-ethylidene-          11.532   
1                            Butylated Hydroxytoluene          13.668   
2                             2,4-Di-tert-butylphenol          13.719   
3   1-(3,4-Dihydro-2H-quinolin-1-yl)-2-(4-methyl-4...          13.903   
4   Benzoic acid, 2,4,6-trimethyl-, 2,4,6-trimethy...          13.964   
5                    Propanal benzyl isopropyl acetal          13.985   
6                       Phenol, 2,4,6-tri-tert-butyl-          14.600   
7                                   o-Hydroxybiphenyl          14.867   
8   Phenol, 2,6-bis(1,1-dimethylethyl)-4-(1-methyl...          15.144   
9   1,2-Cyclohexanedicarboxylic acid, di(3-pentyl)...          16.512   
10                    2-(Methylmercapto)benzothiazole          16.542   
11                                         Hexadecane          16.707   
12                         2,6-Diisopropylnaphthale