In [1]:
# Authors: Samantha Tetef and Vikram Kashyap
# Date: Winter 2020-2021 to Summer 2021

In [2]:
import sys, subprocess
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../moldl/')
import moldl

import importlib
from pathlib import Path

importlib.reload(moldl)
from moldl import *

In [3]:
def PhosphorusSorter(mol):
    """Sort molecule based on phosphorus bonding environment.

    The bonds of the molecule's first phosphorus atom are tallied by partner
    element and bond order, and the tallies are matched against a fixed,
    ordered list of rules (first match wins).

    Parameters
    ----------
    mol : object
        Molecule exposing ``getAtoms(element=...)`` and ``getBonds(atom=...)``.
        Bonds must provide ``partnerof(atom)`` and ``order``; atoms must
        provide ``element`` (compared against atomic numbers 1, 8, 15, 16).

    Returns
    -------
    str or None
        One of 'dithiophosphate', 'phosphate', 'phosphonate',
        'phosphite_ester', 'hypophosphite', 'phosphinate', 'phosphonite',
        'phosphine_oxide', 'phosphinite', 'trialkyl_phosphine',
        'phosphaalkene', 'phosphorane', or ``None`` when no rule matches.

    Notes
    -----
    * Only the first atom returned by ``getAtoms(element=15)`` is inspected;
      the ``[0]`` lookup fails if nothing is returned (presumably an
      ``IndexError`` from a list -- TODO confirm against moldl).
    * Every partner that is neither oxygen nor sulfur (carbon, nitrogen,
      hydrogen, ...) is tallied in the ``*_carbon`` counters.
    * Bonds whose order is neither 1 nor 2 are ignored.
    """
    # get phosphorus atom (only the first one returned is classified)
    p = mol.getAtoms(element=15)[0]

    # functional and R group counters
    doubleo, singleo, ohgroup = 0, 0, 0  # P=O, P-O-(no H), P-O-H
    double_sulfur, single_sulfur = 0, 0  # P=S, P-S
    # Nitrogen partners are not tallied separately (they land in the
    # catch-all counters below), so this stays 0; it is kept because the
    # hypophosphite rule refers to it.
    nitrogen = 0
    single_carbon, double_carbon = 0, 0  # catch-all: anything but O or S

    for b in mol.getBonds(atom=p):
        partner = b.partnerof(p)

        if partner.element == 16:
            # if sulfur
            if b.order == 2:
                double_sulfur += 1
            elif b.order == 1:
                single_sulfur += 1

        elif partner.element == 8:
            # if oxygen
            if b.order == 2:
                doubleo += 1
            elif b.order == 1:
                # Each single-bonded oxygen is counted exactly once: as an OH
                # group when it carries at least one hydrogen, otherwise as a
                # plain P-O-R oxygen.
                has_hydrogen = any(
                    obond.partnerof(partner).element == 1
                    for obond in mol.getBonds(atom=partner)
                )
                if has_hydrogen:
                    ohgroup += 1
                else:
                    singleo += 1

        else:
            # if anything else (must be part of the same chain so that
            # sulfur partners are not counted a second time here)
            if b.order == 2:
                double_carbon += 1
            elif b.order == 1:
                single_carbon += 1

    # Type 7
    if (singleo, double_sulfur, single_sulfur) == (2, 1, 1):
        return 'dithiophosphate'

    # Every remaining category is sulfur-free.
    if double_sulfur + single_sulfur != 0:
        return None

    # Type 6
    if doubleo == 1 and singleo + ohgroup == 3:
        return 'phosphate'

    # Type 5 (P-OH and P-OR oxygens are pooled, so the acid form is
    # reported as 'phosphonate' too)
    if (doubleo, single_carbon) == (1, 1) and singleo + ohgroup == 2:
        return 'phosphonate'
    if doubleo == 0 and singleo == 3:
        return 'phosphite_ester'
    if (doubleo, singleo) == (1, 1) and single_carbon + nitrogen == 0:
        return 'hypophosphite'

    # Type 4 (the OH-bearing acid form is intentionally not matched here)
    if (doubleo, singleo, ohgroup, single_carbon) == (1, 1, 0, 2):
        return 'phosphinate'
    if (doubleo, single_carbon) == (0, 1) and singleo + ohgroup == 2:
        return 'phosphonite'

    # Type 3
    if (doubleo, singleo, ohgroup, single_carbon) == (1, 0, 0, 3):
        return 'phosphine_oxide'
    if (doubleo, singleo, ohgroup, single_carbon) == (0, 1, 0, 2):
        return 'phosphinite'

    # Types 2 and 1: no oxygen at all, distinguished by the catch-all counts
    if (doubleo, singleo, ohgroup) == (0, 0, 0):
        if (single_carbon, double_carbon) == (3, 0):
            return 'trialkyl_phosphine'
        if (single_carbon, double_carbon) == (1, 1):
            return 'phosphaalkene'
        if (single_carbon, double_carbon) == (3, 1):
            return 'phosphorane'

    # no rule matched
    return None

In [4]:
# Open the molecule database rooted at Database/.
db = MolDB('Database/')

# processed_cids.list has to be generated beforehand in a terminal:
#     ls ProcessedData/*_xes.dat > processed_cids.list
# Stripping the directory prefix and the file suffix leaves one bare CID per line.
listing = Path('processed_cids.list').read_text()
for fragment in ('_xes.dat', 'ProcessedData/'):
    listing = listing.replace(fragment, '')
processed_cids = set(listing.splitlines())

# Add to the moldl database every processed-spectrum CID it does not hold yet,
# overwriting a single progress line with the running count of additions.
added = 0
for cid in processed_cids:
    if cid in db.molecules:
        continue
    db.addMolecule(Molecule.from_cid(cid))
    print(f'{added}\r', end="")
    added += 1

In [5]:
# Classify every processed CID with PhosphorusSorter. The result is iterated
# by category and indexed by category to get that category's CIDs
# (presumably a dict of category -> CIDs -- TODO confirm against moldl).
sortedids = db.sort(PhosphorusSorter, processed_cids)

listdir = Path('Categories/')
# Create the output directory if it is missing (e.g. on a fresh checkout);
# otherwise the first open('w') below fails with FileNotFoundError.
listdir.mkdir(parents=True, exist_ok=True)
for category in sortedids:
    # One '<category>.list' file per category, one CID per line.
    with (listdir/(str(category)+'.list')).open('w') as f:
        f.writelines(str(cid)+'\n' for cid in sortedids[category])