# Script to separate covalent and non-covalent fragment .mol files 

# In [14]:
import os
from rdkit import Chem
import re
import pandas as pd

def _write_fragment(filepath, frag_name, out_dir):
    '''
    Read the .mol file at filepath, set its "_Name" property to frag_name and
    write it to <out_dir>/<frag_name>.mol, creating out_dir if it is missing.

    Files that RDKit cannot parse are skipped with a warning instead of
    aborting the whole run.
    '''
    import warnings

    mol = Chem.MolFromMolFile(filepath, sanitize=True)
    if mol is None:
        # MolFromMolFile signals a parse/sanitization failure by returning None
        warnings.warn('Could not parse {}; skipping'.format(filepath))
        return
    mol.SetProp("_Name", frag_name)
    os.makedirs(out_dir, exist_ok=True)
    w = Chem.SDWriter('{}/{}.mol'.format(out_dir, frag_name))
    try:
        w.write(mol)
    finally:
        # Closing flushes the record to disk and releases the file handle
        w.close()


def getMolFiles(in_directory, mpro_sum_df):
    '''
    in_directory: Is the directory path to the files found in the Fragprep folder
                  on the DLS server
    mpro_sum_df : Is a Pandas df of the XChem summary found at 
                  https://www.diamond.ac.uk/covid-19/for-scientists/Main-protease-structure-and-XChem/Downloads.html
                  It must provide the 'Dataset' and 'Site' columns.
    
    The function walks through the in_directory and finds .mol files and filters/writes them 
    into covalent & non-covalent data directories by comparing the fragment name and Site information in
    the XChem summary.

    The fragment name is the file name with a trailing '_0.mol' removed. A fragment is
    written to out_data/non-cov_frags when every summary row for it has Site "A - active",
    and to out_data/cov_frags when every row has Site "B - active - covalent".
    Fragments that do not appear in the summary at all are skipped.

    Returns None.
    '''
    site_to_out_dir = (
        ("A - active", 'out_data/non-cov_frags'),
        ("B - active - covalent", 'out_data/cov_frags'),
    )
    for subdir, dirs, files in os.walk(in_directory):
        for file in files:
            if not file.endswith(".mol"):
                continue
            filepath = os.path.join(subdir, file)
            # Escape the dot and anchor at the end so only the real suffix is removed
            frag_name = re.sub(r'_0\.mol$', '', file)
            sites = mpro_sum_df.loc[mpro_sum_df['Dataset'] == frag_name, 'Site']
            if sites.empty:
                # all() over an empty selection is True, which would file an
                # unlisted fragment under BOTH categories; skip it instead.
                continue
            for site_label, out_dir in site_to_out_dir:
                if (sites == site_label).all():
                    _write_fragment(filepath, frag_name, out_dir)

# In [15]:
# Load the Mpro XChem hits summary spreadsheet into a DataFrame
mpro_sum_df = pd.read_excel(
    io="in_data/Mpro full XChem screen - hits summary - ver-2020-05-16.xlsx",
)

# In [16]:
# Sort the fragments into the covalent and non-covalent output folders
getMolFiles(
    in_directory="in_data/Mpro_frags",
    mpro_sum_df=mpro_sum_df,
)