# How to use RT filter

## Step 1: predict RT from SMILEs

Example file: 01968_1_L_HC18_M1_openo_result_20220714.csv.bglabeled.csv

**Notice:** you need to clean the smiles comlumn by removing blanks.

Copy the cleaned csv file to wormwood and run:

```bash
docker run -v $PWD:/data shunyang2018/gnn -f /data/clean_01968_1_L_HC18_M1_openo_result_20220714.csv.bglabeled.csv -sc query_smiles -tp c18 -ip data/

```
You can get the prediction.csv file at the same folder

## Step 2: combine predRT with annotaions

Put **prediction.csv** and **example file** at the data folder with this jupyer notebook.

Run next block:

In [36]:
import pandas as pd
from rdkit import Chem
import re
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
def smi2inchikey(smi):
    try:
        i = Chem.MolToInchiKey(Chem.MolFromSmiles(smi))[:14]# first block
        return i
    except:
        return 'mol_error'


In [42]:
def combine(csv_path):
    csv = pd.read_csv(csv_path)
    predRT = pd.read_csv('data/prediction.csv')
    print(f'csv has {csv.shape[0]} rows; \npredRT has {predRT.shape[0]} rows')
    print('match RTs with 2D-inchikey...')
    csv['inchikey'] = csv['query_smiles'].apply(smi2inchikey)
    predRT['inchikey'] = predRT['inchikey'].str[:14]
    csv = csv.merge(predRT, on='inchikey', how='left')
    csv.columns = [*csv.columns[:-1], 'predrt']
    csv['rt_window'] = csv.apply(lambda x: abs(x['predrt']-x['RetentionTime']) <= 0.6, axis=1)
    n = csv['rt_window'].sum()
    print(f'{n} annotations is within the predRT window, \n{round(n/csv.shape[0],3)*100}% of all annotaions')
    csv.to_csv(csv_path.replace('.csv', '_rt.csv'))
    
    
    
    
csv_path = 'data/clean_01968_1_L_HC18_M1_openo_result_20220714.csv.bglabeled.csv'
combine(csv_path)

csv has 1784 rows; 
predRT has 1783 rows
match RTs with 2D-inchikey...
1995 annotations is within the predRT window, 
30.5% of all annotaions


In [41]:
csv = pd.read_csv('data/clean_01968_1_L_HC18_M1_openo_result_20220714.csv.bglabeled.csv')
predRT = pd.read_csv('data/prediction.csv')
print(f'csv has {csv.shape[0]} rows; \npredRT has {predRT.shape[0]} rows')

csv has 1784 rows; 
predRT has 1783 rows


In [38]:
print('match RTs with 2D-inchikey...')
csv['inchikey'] = csv['query_smiles'].apply(smi2inchikey)
predRT['inchikey'] = predRT['inchikey'].str[:14]
csv = csv.merge(predRT, on='inchikey', how='left')
csv.columns = [*csv.columns[:-1], 'predrt']
csv.head()

match RTs with 2D-inchikey...


Unnamed: 0,cdresult_file,cuc_id,brightseed_id,compound,len,library_source,molecular_formula,predicted_score,preferred_name,query_smiles,...,Mass,RetentionTime,Intensity,Area,FileID,StudyFileID,IdentifyingNodeNumber,inchikey,canonical_smiles,predrt
0,01968_1_L_HC18_M1_P_CIDHCD_6232022_SAMPLE_01.c...,2,11005824,01968_1_L_HC18_M1_1,1,brightseed,C26H50NO7P,0.454545,,CCCCCC=CCC=CCCCCCCCC(=O)OCC(COP(=O)([O-])OCC[N...,...,520.339356,6.758627,1442356000.0,4718738000.0,1435,F1,9,SPJFYYJXNPEZDW,CCCCCC=CCC=CCCCCCCCC(=O)OCC(O)COP(=O)([O-])OCC...,6.897665
1,01968_1_L_HC18_M1_P_CIDHCD_6232022_SAMPLE_01.c...,2,11005824,01968_1_L_HC18_M1_1,1,brightseed,C26H50NO7P,0.454545,,CCCCCC=CCC=CCCCCCCCC(=O)OCC(COP(=O)([O-])OCC[N...,...,520.339356,6.758627,1442356000.0,4718738000.0,1435,F1,9,SPJFYYJXNPEZDW,CCCCCC=CCC=CCCCCCCCC(=O)OCC(O)COP(=O)([O-])OCC...,6.897665
2,01968_1_L_HC18_M1_P_CIDHCD_6232022_SAMPLE_01.c...,2,11005824,01968_1_L_HC18_M1_1,1,brightseed,C26H50NO7P,0.454545,,CCCCCC=CCC=CCCCCCCCC(=O)OCC(COP(=O)([O-])OCC[N...,...,520.339356,6.758627,1442356000.0,4718738000.0,1435,F1,9,SPJFYYJXNPEZDW,CCCCCC=CCC=CCCCCCCCC(=O)OCC(O)COP(=O)([O-])OCC...,6.897665
3,01968_1_L_HC18_M1_P_CIDHCD_6232022_SAMPLE_01.c...,2,11005824,01968_1_L_HC18_M1_1,1,brightseed,C26H50NO7P,0.454545,,CCCCCC=CCC=CCCCCCCCC(=O)OCC(COP(=O)([O-])OCC[N...,...,520.339356,6.758627,1442356000.0,4718738000.0,1435,F1,9,SPJFYYJXNPEZDW,CCCCCC=CCC=CCCCCCCCC(=O)OCC(O)COP(=O)([O-])OCC...,6.897665
4,01968_1_L_HC18_M1_P_CIDHCD_6232022_SAMPLE_01.c...,2,11005824,01968_1_L_HC18_M1_1,1,brightseed,C26H50NO7P,0.454545,,CCCCCC=CCC=CCCCCCCCC(=O)OCC(COP(=O)([O-])OCC[N...,...,520.339356,6.758627,1442356000.0,4718738000.0,1437,F3,9,SPJFYYJXNPEZDW,CCCCCC=CCC=CCCCCCCCC(=O)OCC(O)COP(=O)([O-])OCC...,6.897665


## Step 3: use predRT as a filter with 0.6 min window

In [39]:
csv['rt_window'] = csv.apply(lambda x: abs(x['predrt']-x['RetentionTime']) <= 0.6, axis=1)
n = csv['rt_window'].sum()
print(f'{n} annotations is within the predRT window, \n{round(n/csv.shape[0],3)*100}% of all annotaions')

1995 annotations is within the predRT window, 
30.5% of all annotaions


In [40]:
csv.to_csv('rt_result.csv')
csv.head()

Unnamed: 0,cdresult_file,cuc_id,brightseed_id,compound,len,library_source,molecular_formula,predicted_score,preferred_name,query_smiles,...,RetentionTime,Intensity,Area,FileID,StudyFileID,IdentifyingNodeNumber,inchikey,canonical_smiles,predrt,rt_window
0,01968_1_L_HC18_M1_P_CIDHCD_6232022_SAMPLE_01.c...,2,11005824,01968_1_L_HC18_M1_1,1,brightseed,C26H50NO7P,0.454545,,CCCCCC=CCC=CCCCCCCCC(=O)OCC(COP(=O)([O-])OCC[N...,...,6.758627,1442356000.0,4718738000.0,1435,F1,9,SPJFYYJXNPEZDW,CCCCCC=CCC=CCCCCCCCC(=O)OCC(O)COP(=O)([O-])OCC...,6.897665,True
1,01968_1_L_HC18_M1_P_CIDHCD_6232022_SAMPLE_01.c...,2,11005824,01968_1_L_HC18_M1_1,1,brightseed,C26H50NO7P,0.454545,,CCCCCC=CCC=CCCCCCCCC(=O)OCC(COP(=O)([O-])OCC[N...,...,6.758627,1442356000.0,4718738000.0,1435,F1,9,SPJFYYJXNPEZDW,CCCCCC=CCC=CCCCCCCCC(=O)OCC(O)COP(=O)([O-])OCC...,6.897665,True
2,01968_1_L_HC18_M1_P_CIDHCD_6232022_SAMPLE_01.c...,2,11005824,01968_1_L_HC18_M1_1,1,brightseed,C26H50NO7P,0.454545,,CCCCCC=CCC=CCCCCCCCC(=O)OCC(COP(=O)([O-])OCC[N...,...,6.758627,1442356000.0,4718738000.0,1435,F1,9,SPJFYYJXNPEZDW,CCCCCC=CCC=CCCCCCCCC(=O)OCC(O)COP(=O)([O-])OCC...,6.897665,True
3,01968_1_L_HC18_M1_P_CIDHCD_6232022_SAMPLE_01.c...,2,11005824,01968_1_L_HC18_M1_1,1,brightseed,C26H50NO7P,0.454545,,CCCCCC=CCC=CCCCCCCCC(=O)OCC(COP(=O)([O-])OCC[N...,...,6.758627,1442356000.0,4718738000.0,1435,F1,9,SPJFYYJXNPEZDW,CCCCCC=CCC=CCCCCCCCC(=O)OCC(O)COP(=O)([O-])OCC...,6.897665,True
4,01968_1_L_HC18_M1_P_CIDHCD_6232022_SAMPLE_01.c...,2,11005824,01968_1_L_HC18_M1_1,1,brightseed,C26H50NO7P,0.454545,,CCCCCC=CCC=CCCCCCCCC(=O)OCC(COP(=O)([O-])OCC[N...,...,6.758627,1442356000.0,4718738000.0,1437,F3,9,SPJFYYJXNPEZDW,CCCCCC=CCC=CCCCCCCCC(=O)OCC(O)COP(=O)([O-])OCC...,6.897665,True
