In [1]:
# Authors: Samantha Tetef and Vikram Kashyap
# Date: Winter 2020-2021 to Summer 2021

In [2]:
import sys, subprocess
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../moldl/')
import moldl

import importlib
from pathlib import Path

importlib.reload(moldl)
from moldl import *

In [3]:
def PhosphorusSorter(mol):
    """Classify a molecule by the bonding environment of its phosphorus atom.

    The first atom returned by ``mol.getAtoms(element=15)`` is taken as the
    phosphorus centre. Each of its bonds is tallied by partner element and
    bond order, and the tallies are compared against a fixed list of
    signatures.

    Counting rules:
      * A bond to oxygen (element 8) is tallied as a double or single P-O
        bond, and every hydrogen bonded to that oxygen adds one to the
        hydroxyl count.
      * A bond to any non-oxygen partner is tallied as a generic "R" bond.
        This includes hydrogen and sulfur partners.
      * Hydrogen (element 1) and sulfur (element 16) partners are also
        tallied in their own counters.

    Args:
        mol: object providing ``getAtoms(element=...)`` and
            ``getBonds(atom=...)``; bonds provide ``partnerof(atom)`` and
            ``order``, atoms provide ``element``.

    Returns:
        The class name as a string, or ``None`` when no signature matches
        (this includes the (1, 1, 0, 2) signature when the hydroxyl count is
        neither 0 nor 1).

    Raises:
        IndexError: if ``mol.getAtoms(element=15)`` returns an empty sequence.
    """
    center = mol.getAtoms(element=15)[0]

    o_double = o_single = hydroxyls = 0   # P=O, P-O, and H atoms on those O
    s_double = s_single = 0               # P=S, P-S
    r_double = r_single = 0               # any non-oxygen partner
    hydrogens = 0                         # H bonded directly to P

    for bond in mol.getBonds(atom=center):
        neighbor = bond.partnerof(center)

        if neighbor.element == 8:
            if bond.order == 2:
                o_double += 1
            if bond.order == 1:
                o_single += 1
            # Count hydrogens sitting on this oxygen.
            for o_bond in mol.getBonds(atom=neighbor):
                if o_bond.partnerof(neighbor).element == 1:
                    hydroxyls += 1
            continue

        # Everything that is not oxygen counts as a generic R bond,
        # hydrogen and sulfur included.
        if bond.order == 2:
            r_double += 1
        if bond.order == 1:
            r_single += 1

        if neighbor.element == 1:
            hydrogens += 1
        elif neighbor.element == 16:
            if bond.order == 2:
                s_double += 1
            if bond.order == 1:
                s_single += 1

    # Sulfur-containing classes (Types 9, 8, 7) are tested first, keyed on
    # (P=O, P-O, P=S, P-S).
    sulfur_classes = {
        (0, 2, 1, 0): 'methylphosphonothioate',  # Type 9
        (0, 3, 1, 0): 'phosphorothioate',        # Type 8
        (0, 2, 1, 1): 'dithiophosphate',         # Type 7
    }
    sulfur_key = (o_double, o_single, s_double, s_single)
    if sulfur_key in sulfur_classes:
        return sulfur_classes[sulfur_key]

    # Remaining classes are keyed on (P=O, P-O, double R, single R).
    key = (o_double, o_single, r_double, r_single)

    # Signatures whose label also depends on the OH or H count.
    if key == (1, 2, 0, 1):  # Type 5
        return 'phosphonic_acid' if hydroxyls == 2 else 'phosphonate'
    if key == (1, 1, 0, 2):  # Type 4
        if hydroxyls == 0:
            return 'phosphinate'
        if hydroxyls == 1:
            return 'phosphenic_acid'
        return None
    if key == (0, 0, 0, 3):  # Type 2
        return 'phosphane' if hydrogens == 2 else 'trialkyl_phosphine'

    # Signatures that map straight to a label.
    simple_classes = {
        (1, 3, 0, 0): 'phosphate',        # Type 6
        (0, 3, 0, 0): 'phosphite_ester',  # Type 5
        (1, 1, 0, 0): 'hypophosphite',    # Type 5
        (0, 2, 0, 1): 'phosphonite',      # Type 4
        (1, 0, 0, 3): 'phosphine_oxide',  # Type 3
        (0, 1, 0, 2): 'phosphinite',      # Type 3
        (0, 0, 1, 1): 'phosphaalkene',    # Type 2
        (0, 0, 1, 3): 'phosphorane',      # Type 1
    }
    return simple_classes.get(key)

In [4]:
# Open the molecule database stored under Database/.
db = MolDB('Database/')

""" 
            In terminal, use the command: 
ls ProcessedData/*_xes.dat > processed_cids.list
            to add more CIDs to database
"""
# Each line of processed_cids.list is a path like 'ProcessedData/<cid>_xes.dat';
# strip the directory prefix and the suffix so only the CID string remains.
with Path('processed_cids.list').open() as f:
    filedata = f.read().replace('_xes.dat', '').replace('ProcessedData/', '')
processed_cids = set(filedata.splitlines())

# Add to the moldl database every processed-spectrum CID it does not hold yet.
# `iterator` counts the molecules added so far and is printed as an
# in-place progress counter (carriage return, no newline).
iterator = 0
for cid in processed_cids:
    if cid in db.molecules:
        continue
    db.addMolecule(Molecule.from_cid(cid))
    print(f'{iterator}\r', end="")
    iterator += 1

124

In [5]:
# Group the processed CIDs using PhosphorusSorter. The result is iterated
# below as a mapping from category to the CIDs in that category.
# NOTE(review): presumably the category is PhosphorusSorter's return value, so
# unmatched molecules would end up in 'None.list' -- confirm against MolDB.sort.
sortedids = db.sort(PhosphorusSorter, processed_cids)

listdir = Path('Categories/')
# Create the output directory when it is missing; without this, opening a file
# for writing raises FileNotFoundError on a fresh checkout.
listdir.mkdir(parents=True, exist_ok=True)

# Write one '<category>.list' file per category, one CID per line.
for category in sortedids:
    with (listdir/(str(category)+'.list')).open('w') as f:
        f.writelines(str(cid)+'\n' for cid in sortedids[category])