# Chemical formula parser

## Classes

In [19]:
class Atom:
    """
    A simple attempt to represent an atom.

    Attributes
    ----------
    symbol : str, optional
        The chemical symbol of the atom. Default is the empty string.
    number : int, optional
        The number of atoms. Default is equal to 1.

    Methods
    -------
    getSymbol():
        Return the atom symbol.
    getNumber():
        Return the atom number.
    """

    def __init__(self, symbol: str = "", number: int = 1) -> None:
        """Initialize the atom's attributes."""
        self.symbol = symbol
        self.number = number

    def __repr__(self) -> str:
        """Return a readable representation, e.g. Atom(symbol='Fe', number=2)."""
        return "Atom(symbol={!r}, number={!r})".format(self.symbol, self.number)

    def getSymbol(self) -> str:
        """Return the atom symbol."""
        return self.symbol

    def getNumber(self) -> int:
        """Return the atom number."""
        return self.number

In [20]:
class Molecule:
    """
    A simple attempt to represent a molecule.

    Attributes
    ----------
    number : int, optional
        The number of molecules. Default is equal to 1.
    atoms : list, optional
        list of atoms that make the molecule.
    submolecules : list, optional
        list of submolecules that make the molecule.

    Methods
    -------
    addAtom(atom: Atom):
        Add an Atom object to the molecule.
    addSubmolecule(submolecule: Molecule):
        Add a submolecule (of type Molecule) to the list of submolecules.
    getAtoms():
        Return the molecule atoms.
    getAtomsAsDict():
        Return a dictionary of the atoms that make the molecule.
    """

    def __init__(self, number: int = 1, atoms: list = None, submolecules: list = None) -> None:
        """
        Initialize the molecule's attributes.

        Parameters
        ----------
        number : int, optional
            The number of molecules. Default is equal to 1.
        atoms : list, optional
            list of atoms that make the molecule. A new empty list is
            created when omitted; a list that is passed in is stored as is
            (not copied).
        submolecules : list, optional
            list of submolecules that make the molecule. Same handling as
            ``atoms``.
        """
        self.number = number
        # None is the "not given" marker so every instance gets its own list.
        self.atoms = atoms if atoms is not None else []
        self.submolecules = submolecules if submolecules is not None else []

    def __repr__(self) -> str:
        """Return a readable representation showing the three attributes."""
        return "Molecule(number={!r}, atoms={!r}, submolecules={!r})".format(
            self.number, self.atoms, self.submolecules
        )

    # The annotations below are strings so that defining this class does not
    # require ``Atom`` (or ``Molecule`` itself) to be already bound.
    def addAtom(self, atom: "Atom") -> None:
        """
        Add an Atom object to the molecule.

        Parameters
        ----------
        atom : Atom
            The atom to be added to the attribute atoms
        """
        self.atoms.append(atom)

    def addSubmolecule(self, submolecule: "Molecule") -> None:
        """
        Add a submolecule (of type Molecule) to the list of submolecules.

        Parameters
        ----------
        submolecule : Molecule
            The submolecule to be added to the attribute submolecules
        """
        self.submolecules.append(submolecule)

    def getAtoms(self) -> list:
        """
        Return the molecule attribute atoms.
        """
        return self.atoms

    def getAtomsAsDict(self) -> dict:
        """
        Return a dictionary of the atoms that make the molecule.

        Counts of the molecule's own atoms and, recursively, of its
        submolecules are summed per symbol and multiplied by ``number``.

        Keys : str
            the atom symbols, e.g. 'H'.
        Values : int
            the corresponding atom number, e.g. 1.
        """
        res = {}
        # Own atoms first, so their symbols come first in the result.
        for atom in self.atoms:
            res[atom.symbol] = res.get(atom.symbol, 0) + self.number * atom.number

        # Each submolecule already applies its own multiplier; apply ours on top.
        for submolecule in self.submolecules:
            for symbol, number in submolecule.getAtomsAsDict().items():
                res[symbol] = res.get(symbol, 0) + number * self.number
        return res

## Parsing method

In [21]:
import re

# see https://docs.python.org/3/library/re.html

# re.findall(pattern, string, flags=0)
# Return all non-overlapping matches of pattern in string, as a list of strings. The string is scanned left-to-right, and matches are returned in the order found. If one or more groups are present in the pattern, return a list of groups; this will be a list of tuples if the pattern has more than one group. Empty matches are included in the result.

## Patterns
# [] 
# Used to indicate a set of characters. Ranges of characters can be indicated by giving two characters and separating them by a '-', for example [a-z] will match any lowercase ASCII letter

# ? 
# Causes the resulting RE to match 0 or 1 repetitions of the preceding RE. ab? will match either 'a' or 'ab'.
# So [A-Z][a-z]? will match either [A-Z] or [A-Z][a-z], e.g. F or Fe (not 'e' alone)

# A|B
# where A and B can be arbitrary REs, creates a regular expression that will match either A or B.

# \d
# Matches any Unicode decimal digit (that is, any character in Unicode character category [Nd]). This includes [0-9], and also many other digit characters. If the ASCII flag is used only [0-9] is matched.

# +
# Causes the resulting RE to match 1 or more repetitions of the preceding RE. ab+ will match 'a' followed by any non-zero number of ‘b’s; it will not match just 'a'.

# .
# (Dot.) In the default mode, this matches any character except a newline.

In [5]:
# Match single uppercase letters only, so a two-letter symbol such as "Fe" yields just "F"
for formula in ("Fe2O3", "(NH4)2HPO4", "Mg2[CH4{NNi2(Li2O4)5}14]3"):
    print(re.findall('[A-Z]', formula))

['F', 'O']
['N', 'H', 'H', 'P', 'O']
['M', 'C', 'H', 'N', 'N', 'L', 'O']


In [8]:
# Match only two-letter groups: one uppercase letter followed by one lowercase letter
for formula in ("Fe2O3", "(NH4)2HPO4", "Mg2[CH4{NNi2(Li2O4)5}14]3"):
    print(re.findall('[A-Z][a-z]', formula))

['Fe']
[]
['Mg', 'Ni', 'Li']


In [10]:
# Match element symbols: an uppercase letter, optionally followed by one lowercase letter
for formula in ("Fe2O3", "(NH4)2HPO4", "Mg2[CH4{NNi2(Li2O4)5}14]3"):
    print(re.findall('[A-Z][a-z]?', formula))

['Fe', 'O']
['N', 'H', 'H', 'P', 'O']
['Mg', 'C', 'H', 'N', 'Ni', 'Li', 'O']


In [14]:
# We match upper letters or groups of upper and lower case letters (2 consecutive letters only)
# or decimal digits (one by one)
# Patterns are raw strings: '\d' in a plain literal is an invalid escape sequence.
print(re.findall(r'\d', "123"))
for formula in ("Fe2O3", "(NH4)2HPO4", "Mg2[CH4{NNi2(Li2O4)5}14]3"):
    print(re.findall(r'[A-Z][a-z]?|\d', formula))

['1', '2', '3']
['Fe', '2', 'O', '3']
['N', 'H', '4', '2', 'H', 'P', 'O', '4']
['Mg', '2', 'C', 'H', '4', 'N', 'Ni', '2', 'Li', '2', 'O', '4', '5', '1', '4', '3']


In [15]:
# We match upper letters or groups of upper and lower case letters (2 consecutive letters only)
# or runs of decimal digits (so "14" is one match instead of "1" and "4")
# Patterns are raw strings: '\d' in a plain literal is an invalid escape sequence.
print(re.findall(r'\d+', "123"))
for formula in ("Fe2O3", "(NH4)2HPO4", "Mg2[CH4{NNi2(Li2O4)5}14]3"):
    print(re.findall(r'[A-Z][a-z]?|\d+', formula))

['123']
['Fe', '2', 'O', '3']
['N', 'H', '4', '2', 'H', 'P', 'O', '4']
['Mg', '2', 'C', 'H', '4', 'N', 'Ni', '2', 'Li', '2', 'O', '4', '5', '14', '3']


In [17]:
# We match every character including brackets: the trailing '.' alternative
# catches any single character that is neither a symbol nor a number
# The pattern is a raw string: '\d' in a plain literal is an invalid escape sequence.
for formula in ("Fe2O3", "(NH4)2HPO4", "Mg2[CH4{NNi2(Li2O4)5}14]3"):
    print(re.findall(r'[A-Z][a-z]?|\d+|.', formula))

['Fe', '2', 'O', '3']
['(', 'N', 'H', '4', ')', '2', 'H', 'P', 'O', '4']
['Mg', '2', '[', 'C', 'H', '4', '{', 'N', 'Ni', '2', '(', 'Li', '2', 'O', '4', ')', '5', '}', '14', ']', '3']


## V1: No brackets

In [47]:
def parse_v1(molecule_str):
    """
    Parse a chemical formula without brackets and count its atoms.

    Parameters
    ----------
    molecule_str : str
        Formula made of element symbols (an uppercase letter optionally
        followed by one lowercase letter), each optionally followed by a
        count, e.g. "Fe2O3". Characters that are neither letters nor
        digits are ignored.

    Returns
    -------
    dict
        Maps each atom symbol to its total number, e.g. {'Fe': 2, 'O': 3}.

    Raises
    ------
    ValueError
        If a count appears with no atom waiting for it (e.g. "2H").
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    molecule_tokens = re.findall(r'[A-Z][a-z]?|\d+|.', molecule_str)

    final_molecule = Molecule()
    # Atom that has been read but not stored yet, because a count may follow.
    pending_atom = None

    for token in molecule_tokens:
        if token.isalpha():
            # A new symbol means the previous atom had no explicit count.
            if pending_atom is not None:
                final_molecule.addAtom(pending_atom)
            pending_atom = Atom(token)

        elif token.isdecimal():
            if pending_atom is None:
                raise ValueError(
                    "count {!r} in {!r} does not follow an atom".format(token, molecule_str)
                )
            # Number corresponds to the pending atom
            pending_atom.number = int(token)
            final_molecule.addAtom(pending_atom)
            pending_atom = None

    # Store the last atom when the formula does not end with a count; doing
    # it after the loop also covers trailing characters that are ignored.
    if pending_atom is not None:
        final_molecule.addAtom(pending_atom)

    return final_molecule.getAtomsAsDict()

In [48]:
# Try parse_v1 on two bracket-free formulas
for formula in ("Fe2O3", "C456H78910"):
    print(parse_v1(formula))

{'Fe': 2, 'O': 3}
{'C': 456, 'H': 78910}


## V2: Linear brackets (no nested brackets)

In [129]:
def parse_v2(molecule_str):
    """
    Parse a chemical formula with non-nested bracket groups and count its atoms.

    On top of plain symbols and counts, a group opened by one of "([{" and
    closed by one of ")]}" is collected into a submolecule. The number that
    follows the closing bracket becomes the submolecule's multiplier, and the
    submolecule is attached to the result at that point. Only one group is
    tracked at a time, and a group that is not followed by a number is never
    attached.

    A debug dump of the parsed structure is printed to stdout before
    returning.

    Parameters
    ----------
    molecule_str : str
        The formula, e.g. "Al2(SO4)3".

    Returns
    -------
    dict
        Maps each atom symbol to its total number,
        e.g. {'Al': 2, 'S': 3, 'O': 12}.

    Raises
    ------
    ValueError
        If a number appears before any atom or bracket group it could apply to.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    molecule_tokens = re.findall(r'[A-Z][a-z]?|\d+|.', molecule_str)
    last_index = len(molecule_tokens) - 1

    final_molecule = Molecule()
    atom = None          # most recently created atom
    submolecule = None   # most recently opened bracket group

    # True while `atom` has been created but not stored yet
    isPreviousAnAlpha = False
    # True once a closing bracket was seen and its multiplier is still expected
    isPreviousTokenAClosingParenthesis = False
    # True from an opening bracket until the group's multiplier is read
    isSubmolecule = False

    def current_target():
        """Return the molecule that atoms currently have to be added to."""
        return submolecule if isSubmolecule else final_molecule

    for ind, token in enumerate(molecule_tokens):

        if token.isalpha():
            # The previous atom had no explicit count: store it now.
            if isPreviousAnAlpha:
                current_target().addAtom(atom)

            atom = Atom(token)
            isPreviousAnAlpha = True

            # if we are at the end of the list, add atom
            if ind == last_index:
                final_molecule.addAtom(atom)

        elif token.isdecimal():
            c = int(token)
            isPreviousAnAlpha = False

            if isPreviousTokenAClosingParenthesis:
                if submolecule is None:
                    raise ValueError(
                        "multiplier {!r} in {!r} does not follow a bracket group".format(
                            token, molecule_str
                        )
                    )
                # Number corresponds to submolecule
                submolecule.number = c
                # at this point, we can add this submolecule to the final_molecule
                final_molecule.addSubmolecule(submolecule)
                isPreviousTokenAClosingParenthesis = False
                isSubmolecule = False
            else:
                if atom is None:
                    raise ValueError(
                        "count {!r} in {!r} does not follow an atom".format(token, molecule_str)
                    )
                # Number corresponds to atom
                atom.number = c
                current_target().addAtom(atom)

        elif token in "([{":
            # Store the atom that precedes the bracket before switching target.
            if isPreviousAnAlpha:
                current_target().addAtom(atom)
                isPreviousAnAlpha = False

            # create submolecule
            submolecule = Molecule()
            isSubmolecule = True

        elif token in ")]}":
            # Store the last atom of the group; the group itself is attached
            # when its multiplier is read.
            if isPreviousAnAlpha:
                current_target().addAtom(atom)
                isPreviousAnAlpha = False

            isPreviousTokenAClosingParenthesis = True

    ########################################################
    # DEBUG
    ########################################################
    print("\n### FINAL MOLECULE ###")

    print("-input")
    print("\t{}".format(molecule_str))

    print("\n-Final molecule info")
    print("\tfinal_molecule.number: {}".format(final_molecule.number))

    for i, elt in enumerate(final_molecule.getAtoms(), start=1):
        print("\tAtom #{}\t{}: {}".format(i, elt.getSymbol(), elt.getNumber()))

    print("\t-Submolecules info")
    for i, elt in enumerate(final_molecule.submolecules, start=1):
        print("\t\tSubmolecule.number: {}".format(elt.number))
        for j, sub_atom in enumerate(elt.getAtoms(), start=1):
            print("\t\tAtom #{}\t{}: {}".format(j, sub_atom.getSymbol(), sub_atom.getNumber()))
        print("\t\tSubmolecule #{}: {}".format(i, elt.getAtomsAsDict()))

    print("\nfinal_molecule: {}".format(final_molecule.getAtomsAsDict()))
    print("###\n")
    ########################################################

    return final_molecule.getAtomsAsDict()

In [130]:
# Parse a formula with one bracket group; the debug dump is printed before the result
result = parse_v2("Al2(SO4)3")
print(result)
# print(parse_v2("(H2O)3"))


### FINAL MOLECULE ###
-input
	Al2(SO4)3

-Final molecule info
	final_molecule.number: 1
	Atom #1	Al: 2
	-Submolecules info
		Submolecule.number: 3
		Atom #1	S: 1
		Atom #2	O: 4
		Submolecule #1: {'S': 3, 'O': 12}

final_molecule: {'Al': 2, 'S': 3, 'O': 12}
###

{'Al': 2, 'S': 3, 'O': 12}


## V3: Nested brackets

In [163]:
def parse_v3(molecule_str):
    """
    Parse a chemical formula with nested bracket groups and count its atoms.

    Open groups are kept on a stack (``list_submolecules``, innermost last).
    An opening bracket pushes a new submolecule; the number that follows a
    closing bracket becomes the multiplier of the innermost group, which is
    then attached to the group below it on the stack, or to the final
    molecule when it was the only one. A group is only attached when such a
    number is read.

    Debug information is printed to stdout: the open groups after every
    token, and a dump of the parsed structure before returning.

    Parameters
    ----------
    molecule_str : str
        The formula, e.g. "Mg2[CH4{NNi2(Li2O4)5}14]3".

    Returns
    -------
    dict
        Maps each atom symbol to its total number.

    Raises
    ------
    ValueError
        If a number appears before any atom or bracket group it could apply to.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    molecule_tokens = re.findall(r'[A-Z][a-z]?|\d+|.', molecule_str)
    last_index = len(molecule_tokens) - 1

    final_molecule = Molecule()
    # for nested submolecules: stack of the groups that are currently open
    list_submolecules = []
    atom = None  # most recently created atom

    # True while `atom` has been created but not stored yet
    isPreviousAnAlpha = False
    # True once a closing bracket was seen and its multiplier is still expected
    isPreviousTokenAClosingParenthesis = False

    def current_target():
        """Return the innermost open group, or the final molecule if none."""
        return list_submolecules[-1] if list_submolecules else final_molecule

    for ind, token in enumerate(molecule_tokens):

        if token.isalpha():
            # The previous atom had no explicit count: store it now.
            if isPreviousAnAlpha:
                current_target().addAtom(atom)

            atom = Atom(token)
            isPreviousAnAlpha = True

            # if we are at the end of the list, add atom
            if ind == last_index:
                final_molecule.addAtom(atom)

        elif token.isdecimal():
            c = int(token)
            isPreviousAnAlpha = False
            if isPreviousTokenAClosingParenthesis:
                if not list_submolecules:
                    raise ValueError(
                        "multiplier {!r} in {!r} does not follow a bracket group".format(
                            token, molecule_str
                        )
                    )
                # Number corresponds to the last submolecule in list_submolecules
                closed_submolecule = list_submolecules.pop()
                closed_submolecule.number = c
                # After the pop, the target is the upper submolecule, or
                # final_molecule if the closed group was the only one open.
                current_target().addSubmolecule(closed_submolecule)

                isPreviousTokenAClosingParenthesis = False
            else:
                if atom is None:
                    raise ValueError(
                        "count {!r} in {!r} does not follow an atom".format(token, molecule_str)
                    )
                # Number corresponds to atom
                atom.number = c
                current_target().addAtom(atom)

        elif token in "([{":
            # Store the atom that precedes the bracket before opening the group.
            if isPreviousAnAlpha:
                current_target().addAtom(atom)
                isPreviousAnAlpha = False

            # add new submolecule to list_submolecules
            list_submolecules.append(Molecule())

        elif token in ")]}":
            # Store the last atom of the group; the group itself is attached
            # when its multiplier is read.
            if isPreviousAnAlpha:
                current_target().addAtom(atom)
                isPreviousAnAlpha = False

            isPreviousTokenAClosingParenthesis = True

        # DEBUG
        print("Token: {}".format(token))
        for i, elt in enumerate(list_submolecules, start=1):
            print("Submolecule #{}: {}".format(i, elt.getAtomsAsDict()))

    ########################################################
    # DEBUG
    ########################################################
    print("\n### FINAL MOLECULE ###")

    print("-input")
    print("\t{}".format(molecule_str))

    print("\n-Final molecule info")
    print("\tfinal_molecule.number: {}".format(final_molecule.number))

    for i, elt in enumerate(final_molecule.getAtoms(), start=1):
        print("\tAtom #{}\t{}: {}".format(i, elt.getSymbol(), elt.getNumber()))

    print("\t-Submolecules info")
    for i, elt in enumerate(final_molecule.submolecules, start=1):
        print("\t\tSubmolecule.number: {}".format(elt.number))
        for j, sub_atom in enumerate(elt.getAtoms(), start=1):
            print("\t\tAtom #{}\t{}: {}".format(j, sub_atom.getSymbol(), sub_atom.getNumber()))
        print("\t\tSubmolecule #{}: {}".format(i, elt.getAtomsAsDict()))

    print("\nfinal_molecule: {}".format(final_molecule.getAtomsAsDict()))
    print("###\n")
    ########################################################

    return final_molecule.getAtomsAsDict()

In [164]:
# Parse a formula with three levels of nested brackets; debug lines are printed before the result
result = parse_v3("Mg2[CH4{NNi2(Li2O4)5}14]3")
print(result)

Token: Mg
Token: 2
Token: [
Submolecule #1: {}
Token: C
Submolecule #1: {}
Token: H
Submolecule #1: {'C': 1}
Token: 4
Submolecule #1: {'C': 1, 'H': 4}
Token: {
Submolecule #1: {'C': 1, 'H': 4}
Submolecule #2: {}
Token: N
Submolecule #1: {'C': 1, 'H': 4}
Submolecule #2: {}
Token: Ni
Submolecule #1: {'C': 1, 'H': 4}
Submolecule #2: {'N': 1}
Token: 2
Submolecule #1: {'C': 1, 'H': 4}
Submolecule #2: {'N': 1, 'Ni': 2}
Token: (
Submolecule #1: {'C': 1, 'H': 4}
Submolecule #2: {'N': 1, 'Ni': 2}
Submolecule #3: {}
Token: Li
Submolecule #1: {'C': 1, 'H': 4}
Submolecule #2: {'N': 1, 'Ni': 2}
Submolecule #3: {}
Token: 2
Submolecule #1: {'C': 1, 'H': 4}
Submolecule #2: {'N': 1, 'Ni': 2}
Submolecule #3: {'Li': 2}
Token: O
Submolecule #1: {'C': 1, 'H': 4}
Submolecule #2: {'N': 1, 'Ni': 2}
Submolecule #3: {'Li': 2}
Token: 4
Submolecule #1: {'C': 1, 'H': 4}
Submolecule #2: {'N': 1, 'Ni': 2}
Submolecule #3: {'Li': 2, 'O': 4}
Token: )
Submolecule #1: {'C': 1, 'H': 4}
Submolecule #2: {'N': 1, 'Ni': 2}
Submolecule #3: {'Li': 2, 'O': 4}
[... remaining output truncated ...]

## V4: useless brackets & number in front of formula

In [166]:
def parse_v4(molecule_str):
    """
    Parse a chemical formula and count its atoms.

    Supports nested bracket groups with or without a multiplier after the
    closing bracket (e.g. "((Fe3))", "CH4(O)", "()H2O") and a number in
    front of the whole formula (e.g. "3(H2O)").

    Open groups are kept on a stack whose bottom element is the final
    molecule. A closing bracket pops the innermost group and keeps it aside
    until the next token shows whether a multiplier follows; the group is
    then attached to the group that encloses it. Every enclosing group is
    therefore kept, also when several closing brackets end the formula
    (e.g. "(A(B))").

    Parameters
    ----------
    molecule_str : str
        The formula. Any of "([{" opens a group and any of ")]}" closes one;
        the bracket kinds are not required to match. Characters that are
        neither letters, digits nor brackets are ignored.

    Returns
    -------
    dict
        Maps each atom symbol to its total number, e.g. {'H': 6, 'O': 3}
        for "3(H2O)".

    Raises
    ------
    ValueError
        If a number has no atom or group to apply to, if a closing bracket
        has no matching opening bracket, or if a bracket is left open.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    molecule_tokens = re.findall(r'[A-Z][a-z]?|\d+|.', molecule_str)

    final_molecule = Molecule()
    # Stack of open groups, innermost last; final_molecule is always the bottom.
    open_groups = [final_molecule]
    # Atom read but not stored yet, because a count may follow.
    pending_atom = None
    # Group just closed but not attached yet, because a multiplier may follow.
    closed_group = None

    def attach_pending():
        """Store the waiting atom or closed group in the innermost open group."""
        nonlocal pending_atom, closed_group
        if pending_atom is not None:
            open_groups[-1].addAtom(pending_atom)
            pending_atom = None
        if closed_group is not None:
            open_groups[-1].addSubmolecule(closed_group)
            closed_group = None

    for ind, token in enumerate(molecule_tokens):

        if token.isalpha():
            # Whatever was waiting for a number did not get one.
            attach_pending()
            pending_atom = Atom(token)

        elif token.isdecimal():
            c = int(token)

            # Case first token is number. Example: "3(H2O)"
            if ind == 0:
                final_molecule.number = c
            elif closed_group is not None:
                # Number corresponds to the group that was just closed
                closed_group.number = c
                attach_pending()
            elif pending_atom is not None:
                # Number corresponds to atom
                pending_atom.number = c
                attach_pending()
            else:
                raise ValueError(
                    "number {!r} in {!r} does not follow an atom or a bracket group".format(
                        token, molecule_str
                    )
                )

        elif token in "([{":
            attach_pending()
            open_groups.append(Molecule())

        elif token in ")]}":
            # Covers both the last atom of the group and an inner group that
            # was closed just before (case no decimal after closing bracket).
            attach_pending()
            if len(open_groups) == 1:
                raise ValueError(
                    "closing bracket {!r} in {!r} has no matching opening bracket".format(
                        token, molecule_str
                    )
                )
            closed_group = open_groups.pop()

    # End of the formula: store the last atom or group, which had no number.
    attach_pending()
    if len(open_groups) > 1:
        raise ValueError("unclosed bracket in {!r}".format(molecule_str))

    return final_molecule.getAtomsAsDict()

In [167]:
# Exercise parse_v4 on brackets without multiplier and on a number in front of the formula
for formula in (
    "{((Fe3))}",
    "((Fe3))8",
    "8(Fe3)",
    "8((Fe3))",
    "3(H2O)",
    "()H2O",
    "4((O2)3)",
    "((Fe3))",
    "CH4(O)",
):
    print(parse_v4(formula))

{'Fe': 3}
{'Fe': 24}
{'Fe': 24}
{'Fe': 24}
{'H': 6, 'O': 3}
{'H': 2, 'O': 1}
{'O': 24}
{'Fe': 3}
{'C': 1, 'H': 4, 'O': 1}


## parser.py

In [172]:
# https://stackoverflow.com/questions/1450393/how-do-you-read-from-stdin
# readlines(): Read the whole content and return it as a list of lines (each line keeps its trailing newline)
# rstrip(): Remove trailing whitespace (including the newline) from the end of the string
# map(fun, iterable): Returns an iterator that applies the given function to each item of a given iterable (list, tuple etc.); wrap it in list() to get a list

In [None]:
# json
# https://docs.python.org/3/library/json.html
# json.dumps(): Serialize obj to a JSON formatted str (json.dump() is the variant that writes to a file-like stream)
## If sort_keys is true (default: False), then the output of dictionaries will be sorted by key.
## If indent is a non-negative integer or string, then JSON array elements and object members will be pretty-printed with that indent level.