Python-Skript, um aus einer Liste von SMILES herauszufinden, welche Periodensymbole wie oft vorkommen, um daraus ein Vokabular für das neuronale Netz zu generieren.

1.SMILES
2.DEEPSMILES

In [10]:
import re
from typing import Union
import rdkit
import numpy as np
from typing import Union, Dict

In [11]:
class Vocabulary:
    """
    Stores the tokens and their conversion to vocabulary indexes, along with their frequencies.

    Tokens (``str``) and indexes (``int``) live in one internal dictionary, each
    pair being stored in both directions, so lookups work either way.
    """

    def __init__(self, tokens: Union[Dict[str, int], None] = None, starting_id: int = 0) -> None:
        """
        Creates a vocabulary, optionally pre-filled from a token -> index mapping.

        :param tokens: Initial mapping; every entry starts with a frequency of 0.
        :param starting_id: First index that add() assigns to a new token.
        :raises ValueError: If two initial tokens share the same index.
        """
        self._tokens = {}
        self._frequencies = {}
        self._current_id = starting_id

        if tokens:
            for token, idx in tokens.items():
                self._add(token, idx)
                # Keep the next free index above every pre-assigned one.
                self._current_id = max(self._current_id, idx + 1)

    def __getitem__(self, token_or_id: Union[str, int]) -> Union[str, int]:
        """
        Returns the index of a token, or the token stored under an index.

        :raises KeyError: If the token or index is unknown.
        """
        return self._tokens[token_or_id]

    def add(self, token: str) -> int:
        """
        Adds a token to the vocabulary or updates its frequency if it already exists.

        :param token: Token to register or count.
        :return: The index of the token.
        :raises TypeError: If the token is not a string.
        """
        if not isinstance(token, str):
            raise TypeError("Token is not a string")
        if token in self._tokens:
            self._frequencies[token] += 1
            return self._tokens[token]

        idx = self._current_id
        self._add(token, idx)
        self._frequencies[token] = 1
        self._current_id += 1
        return idx

    def update(self, tokens: list) -> None:
        """
        Adds multiple tokens to the vocabulary, counting each occurrence.
        """
        for token in tokens:
            self.add(token)

    def __delitem__(self, token_or_id: Union[str, int]) -> None:
        """
        Deletes a token (given as token or as index) together with its frequency.

        :raises KeyError: If the token or index is unknown.
        """
        other_val = self._tokens[token_or_id]
        # Frequencies are keyed by token, so resolve which of the two values is the token.
        token = other_val if isinstance(token_or_id, int) else token_or_id
        del self._tokens[other_val]
        del self._tokens[token_or_id]
        del self._frequencies[token]

    def __contains__(self, token_or_id: Union[str, int]) -> bool:
        """
        Checks if a token or index is in the vocabulary.
        """
        return token_or_id in self._tokens

    def __len__(self) -> int:
        """
        Returns the number of unique tokens.
        """
        # Every token is stored twice (token -> index and index -> token).
        return len(self._tokens) // 2

    def tokens(self) -> list:
        """
        Returns the tokens from the vocabulary sorted by their frequencies in descending order.
        """
        return sorted(self._frequencies, key=self._frequencies.get, reverse=True)

    def frequencies(self) -> Dict[str, int]:
        """
        Returns a dictionary of token frequencies.

        A copy is returned so that callers cannot alter the stored counts by
        modifying the result.
        """
        return dict(self._frequencies)

    def _add(self, token: str, idx: int) -> None:
        """
        Adds a token and its index to the internal dictionaries.

        :raises ValueError: If the index or the token is already present.
        """
        if idx in self._tokens:
            raise ValueError("IDX already present in vocabulary")
        if token in self._tokens:
            # Re-registering a token would leave its old index entry dangling.
            raise ValueError("Token already present in vocabulary")
        self._tokens[token] = idx
        self._tokens[idx] = token
        self._frequencies[token] = 0  # Initialize frequency




class SMILESTokenizer:
    """
    Splits SMILES strings into tokens and joins token lists back into SMILES.
    """

    # Order in which the patterns below are applied during tokenization.
    REGEXP_ORDER = ["brackets", "2_ring_nums", "brcl"]
    # Patterns for the multi-character tokens; each has exactly one capturing
    # group so that re.split() keeps the matched text in its result.
    REGEXPS = {
        "brackets": re.compile(r"(\[[^\]]*\])"),
        "2_ring_nums": re.compile(r"(%\d{2})"),
        "brcl": re.compile(r"(Br|Cl)")
    }

    def tokenize(self, data : str, with_begin_and_end : bool=True) -> list:
        """
        Breaks a SMILES string into a list of tokens.

        Text matched by one of the patterns becomes a single token; everything
        else is split into single characters.

        :param data: SMILES string to tokenize.
        :param with_begin_and_end: If True, wrap the result in "^" and "$".
        :return: List of token strings.
        """
        # Each entry is (text, finished); finished entries are complete tokens
        # that later patterns must not split any further.
        pieces = [(data, False)]
        for name in self.REGEXP_ORDER:
            pattern = self.REGEXPS[name]
            refined = []
            for text, finished in pieces:
                if finished:
                    refined.append((text, True))
                    continue
                # split() alternates unmatched text (even positions) with
                # pattern matches (odd positions).
                for position, part in enumerate(pattern.split(text)):
                    refined.append((part, position % 2 == 1))
            pieces = refined

        tokens = []
        for text, finished in pieces:
            if finished:
                tokens.append(text)
            else:
                # Remaining text is emitted character by character.
                tokens.extend(text)

        return ["^", *tokens, "$"] if with_begin_and_end else tokens

    def untokenize(self, tokens : list) -> str:
        """
        Joins tokens back into a SMILES string.

        "^" tokens are skipped and everything from the first "$" on is ignored.
        """
        kept = []
        for token in tokens:
            if token == "$":
                break
            if token == "^":
                continue
            kept.append(token)
        return "".join(kept)


def create_vocabulary(smiles_list : list, tokenizer : SMILESTokenizer,
                      canonical : bool=True) -> Vocabulary:
    """
    Creates a vocabulary for the SMILES syntax.

    :param smiles_list: SMILES strings to collect tokens from. It is not modified.
    :param tokenizer: Tokenizer used to split every SMILES into tokens.
    :param canonical: If False, one randomized non-isomeric SMILES generated
                      with RDKit is added per input SMILES, and its tokens are
                      collected as well.
    :return: Vocabulary holding "$" and "^" followed by the sorted unique
             tokens; every token is added exactly once.
    """
    # Work on a copy so that the caller's list is never extended as a side effect.
    all_smiles = list(smiles_list)

    if not canonical:
        # Imported locally: a bare ``import rdkit`` does not by itself guarantee
        # that the ``rdkit.Chem`` submodule has been loaded.
        from rdkit import Chem

        randomized_smiles = []
        for smiles in all_smiles:
            molecule = Chem.MolFromSmiles(smiles)
            if molecule is None:
                # RDKit could not parse this SMILES; there is nothing to randomize.
                continue
            try:
                randomized = Chem.MolToSmiles(molecule,
                                              canonical=False,
                                              doRandom=True,
                                              isomericSmiles=False)
            except Exception:
                # Best effort: a molecule that cannot be written back is skipped.
                continue
            randomized_smiles.append(randomized)

        all_smiles.extend(randomized_smiles)

    tokens = set()
    for smi in all_smiles:
        tokens.update(tokenizer.tokenize(smi, with_begin_and_end=False))

    vocabulary = Vocabulary()
    vocabulary.update(["$", "^"] + sorted(tokens))  # end token is 0 (also counts as padding)
    return vocabulary

In [13]:
# The existing class definitions such as `Vocabulary` and `SMILESTokenizer` remain unchanged.

# Function that builds the vocabulary from the data in a text file
def create_vocabulary_from_file(file_path: str, tokenizer: SMILESTokenizer) -> Vocabulary:
    """
    Builds a vocabulary from a text file containing one SMILES per line.

    :param file_path: Path of the text file to read.
    :param tokenizer: Tokenizer used to split every line into tokens.
    :return: Vocabulary in which every token occurrence has been counted.
    """
    vocab = Vocabulary()

    with open(file_path, 'r') as handle:
        # Strip surrounding whitespace (line breaks included) and drop empty lines.
        stripped_lines = (raw.strip() for raw in handle)
        smiles_entries = [entry for entry in stripped_lines if entry]

    # Tokenize each SMILES without begin/end markers and count its tokens.
    for entry in smiles_entries:
        vocab.update(tokenizer.tokenize(entry, with_begin_and_end=False))

    return vocab

# Create a SMILESTokenizer instance
tokenizer = SMILESTokenizer()

# Path to the SMILES text file
file_path = 'C:\\Users\\SchockWav3\\Desktop\\Masterarbeit\\chembl_smiles.txt'

# Build the vocabulary from the text file
vocabulary = create_vocabulary_from_file(file_path, tokenizer)

# Print the number of unique tokens and the tokens sorted by descending frequency
print(f'There are {len(vocabulary)} unique tokens in the vocabulary.\n')
print(f'The unique tokens are: \n{vocabulary.tokens()}')


There are 142 unique tokens in the vocabulary.

The unique tokens are: 
['c', 'C', '(', ')', 'O', '1', '2', '=', 'N', '3', 'n', '[C@H]', '[C@@H]', 'Cl', 'F', '.', '4', 'S', '/', '-', '[O-]', '[C@]', '[nH]', '[C@@]', '[N+]', 'o', 's', '\\', '#', '5', '[Na+]', 'P', 'Br', '[n+]', '[Cl-]', '[2H]', '[Br-]', 'I', '[K+]', '[N-]', '[S+]', '[Ca+2]', '6', '[Mg+2]', '[I-]', '[Si]', 'B', '[18F]', '[OH-]', '[n-]', '[Al+3]', '[Al]', '[S@@+]', '[As]', '[123I]', '[Li+]', '[S-]', '[11CH3]', '[Zn+2]', '[Se]', '[Cl+]', '7', '[131I]', '[Mg]', '[I+]', '[K]', '[NH-]', '[PH]', '[B-]', '[125I]', '[N@+]', '[O+]', '[PH2]', '[Sr+2]', '[11C]', '[C-]', '[Ag+]', '[se]', '[P@]', '[Ba+2]', '[Cl+3]', '[Ca]', '[S-2]', '[85Sr+2]', '[O]', '[P+]', '[S@]', '[Te]', '[s+]', '[223Ra]', '[Ra]', '[Ag]', '[TeH]', '[3H]', '[22Na+]', '[Ba]', '[Kr]', '[Mg+]', '[18FH]', '[11C-]', '[AsH3]', '[81Kr]', '[75Se]', '[SrH2]', '[42K+]', '[124I]', '[Rb]', '[85SrH2]', '[LiH]', '[82Rb+]', '[129Xe]', '[CaH2]', '[H+]', '[C]', '[N]', '[F-]', '[Rb

In [14]:
# After the vocabulary has been created: print every token with its frequency
print(f"Total unique tokens: {len(vocabulary)}")
print("Tokens and their frequencies:")
for token, frequency in vocabulary.frequencies().items():
    print(f"{token}: {frequency}")


Total unique tokens: 142
Tokens and their frequencies:
C: 82635
(: 42117
): 42117
=: 17437
O: 29965
#: 498
N: 14270
c: 85593
1: 27224
2: 18214
Cl: 3088
n: 7103
.: 2649
S: 2131
[C@]: 1026
[C@H]: 4492
[C@@H]: 4258
3: 7312
4: 2308
[C@@]: 783
[nH]: 978
-: 1338
s: 552
[N+]: 753
[O-]: 1158
/: 1898
\: 530
5: 448
[Cl-]: 107
P: 404
o: 585
F: 2706
[n+]: 160
I: 78
[Na+]: 419
[2H]: 98
[N-]: 66
Br: 246
[Mg+2]: 38
[18F]: 26
6: 44
[As]: 11
[S+]: 60
[22Na+]: 1
[Ca+2]: 55
[PH]: 4
[S@@+]: 12
[Ba]: 1
[I+]: 6
[Al+3]: 14
[I-]: 33
[Br-]: 88
[K+]: 78
[Al]: 14
[Si]: 28
[Kr]: 1
[N@+]: 3
[123I]: 11
[Mg+]: 1
[18FH]: 1
[S-]: 8
[11C-]: 1
[O+]: 3
[131I]: 7
[AsH3]: 1
[K]: 5
[PH2]: 3
[P@]: 2
[Sr+2]: 3
[NH-]: 5
[Ba+2]: 2
[OH-]: 19
B: 27
[11CH3]: 8
[Cl+3]: 2
[n-]: 16
[Ca]: 2
[Zn+2]: 8
[S-2]: 2
[Li+]: 10
[81Kr]: 1
[75Se]: 1
[Se]: 8
[SrH2]: 1
[11C]: 3
[42K+]: 1
[C-]: 3
[85Sr+2]: 2
[O]: 2
[P+]: 2
[124I]: 1
[Rb]: 1
[85SrH2]: 1
[Ag+]: 3
[S@]: 2
[Te]: 2
[s+]: 2
[LiH]: 1
[82Rb+]: 1
[129Xe]: 1
[CaH2]: 1
[B-]: 4
[223Ra]: 2
[H+]

deepSMILES

In [32]:
import re
from typing import Union, Dict
import rdkit
import deepsmiles

class Vocabulary:
    """
    Stores the tokens and their conversion to vocabulary indexes, along with their frequencies.

    Tokens (``str``) and indexes (``int``) live in one internal dictionary, each
    pair being stored in both directions, so lookups work either way.
    """

    def __init__(self, tokens: Union[Dict[str, int], None] = None, starting_id: int = 0) -> None:
        """
        Creates a vocabulary, optionally pre-filled from a token -> index mapping.

        :param tokens: Initial mapping; every entry starts with a frequency of 0.
        :param starting_id: First index that add() assigns to a new token.
        :raises ValueError: If two initial tokens share the same index.
        """
        self._tokens = {}
        self._frequencies = {}
        self._current_id = starting_id

        if tokens:
            for token, idx in tokens.items():
                self._add(token, idx)
                # Keep the next free index above every pre-assigned one.
                self._current_id = max(self._current_id, idx + 1)

    def __getitem__(self, token_or_id: Union[str, int]) -> Union[str, int]:
        """
        Returns the index of a token, or the token stored under an index.

        :raises KeyError: If the token or index is unknown.
        """
        return self._tokens[token_or_id]

    def add(self, token: str) -> int:
        """
        Adds a token to the vocabulary or updates its frequency if it already exists.

        :param token: Token to register or count.
        :return: The index of the token.
        :raises TypeError: If the token is not a string.
        """
        if not isinstance(token, str):
            raise TypeError("Token is not a string")
        if token in self._tokens:
            self._frequencies[token] += 1
            return self._tokens[token]

        idx = self._current_id
        self._add(token, idx)
        self._frequencies[token] = 1
        self._current_id += 1
        return idx

    def update(self, tokens: list) -> None:
        """
        Adds multiple tokens to the vocabulary, counting each occurrence.
        """
        for token in tokens:
            self.add(token)

    def __delitem__(self, token_or_id: Union[str, int]) -> None:
        """
        Deletes a token (given as token or as index) together with its frequency.

        :raises KeyError: If the token or index is unknown.
        """
        other_val = self._tokens[token_or_id]
        # Frequencies are keyed by token, so resolve which of the two values is the token.
        token = other_val if isinstance(token_or_id, int) else token_or_id
        del self._tokens[other_val]
        del self._tokens[token_or_id]
        del self._frequencies[token]

    def __contains__(self, token_or_id: Union[str, int]) -> bool:
        """
        Checks if a token or index is in the vocabulary.
        """
        return token_or_id in self._tokens

    def __len__(self) -> int:
        """
        Returns the number of unique tokens.
        """
        # Every token is stored twice (token -> index and index -> token).
        return len(self._tokens) // 2

    def tokens(self) -> list:
        """
        Returns the tokens from the vocabulary sorted by their frequencies in descending order.
        """
        return sorted(self._frequencies, key=self._frequencies.get, reverse=True)

    def frequencies(self) -> Dict[str, int]:
        """
        Returns a dictionary of token frequencies.

        A copy is returned so that callers cannot alter the stored counts by
        modifying the result.
        """
        return dict(self._frequencies)

    def _add(self, token: str, idx: int) -> None:
        """
        Adds a token and its index to the internal dictionaries.

        :raises ValueError: If the index or the token is already present.
        """
        if idx in self._tokens:
            raise ValueError("IDX already present in vocabulary")
        if token in self._tokens:
            # Re-registering a token would leave its old index entry dangling.
            raise ValueError("Token already present in vocabulary")
        self._tokens[token] = idx
        self._tokens[idx] = token
        self._frequencies[token] = 0


class DeepSMILESTokenizer:
    """
    Converts SMILES to DeepSMILES and handles tokenization and untokenization
    of DeepSMILES strings.
    """

    # Placeholder returned by encode() when a SMILES string cannot be converted.
    UNKNOWN_TOKEN = "UNK"

    # Patterns for the multi-character tokens; each has exactly one capturing
    # group so that re.split() keeps the matched text in its result. They are
    # class attributes so they are compiled once instead of once per instance.
    REGEXPS = {
        "brackets": re.compile(r"(\[[^\]]*\])"),
        "2_ring_nums": re.compile(r"(%\d{2})"),
        "brcl": re.compile(r"(Br|Cl)")
    }
    # Order in which the patterns above are applied during tokenization.
    REGEXP_ORDER = ["brackets", "2_ring_nums", "brcl"]

    def __init__(self):
        """
        Creates the DeepSMILES converter with both the ring and the branch option enabled.
        """
        self.converter = deepsmiles.Converter(rings=True, branches=True)

    def encode(self, smiles: str) -> str:
        """
        Converts a SMILES string to DeepSMILES.

        :param smiles: SMILES string to convert.
        :return: The DeepSMILES string, or "UNK" if the conversion raised an
                 exception (the error is printed).
        """
        try:
            deep_smiles = self.converter.encode(smiles)
            return deep_smiles
        except Exception as e:
            print("Could not convert SMILES to DeepSMILES:", e)
            return self.UNKNOWN_TOKEN

    def tokenize(self, deep_smiles: str, with_begin_and_end: bool = True) -> list:
        """
        Tokenizes a DeepSMILES string.

        Text matched by one of the patterns becomes a single token; everything
        else is split into single characters. The "UNK" placeholder produced by
        encode() is kept as one token instead of being split into characters.

        :param deep_smiles: DeepSMILES string (or the "UNK" placeholder).
        :param with_begin_and_end: If True, wrap the result in "^" and "$".
        :return: List of token strings.
        """
        def split_by(data, regexps):
            # No patterns left: emit the remaining text character by character.
            if not regexps:
                return list(data)
            regexp = self.REGEXPS[regexps[0]]
            splitted = regexp.split(data)
            tokens = []
            for i, split in enumerate(splitted):
                if i % 2 == 0:
                    # Even positions are unmatched text: try the remaining patterns.
                    tokens += split_by(split, regexps[1:])
                else:
                    # Odd positions are pattern matches: keep them whole.
                    tokens.append(split)
            return tokens

        if deep_smiles == self.UNKNOWN_TOKEN:
            # Splitting the placeholder would add 'U', 'N' and 'K' as separate
            # tokens and distort the counts of real tokens such as 'N'.
            tokens = [self.UNKNOWN_TOKEN]
        else:
            tokens = split_by(deep_smiles, self.REGEXP_ORDER)
        if with_begin_and_end:
            tokens = ["^"] + tokens + ["$"]
        return tokens

    def untokenize(self, tokens: list) -> str:
        """
        Joins tokens back into a DeepSMILES string, dropping "^", "$" and "UNK" tokens.
        """
        skipped = ('^', '$', self.UNKNOWN_TOKEN)
        return ''.join(token for token in tokens if token not in skipped)


def create_vocabulary_from_file(file_path: str, tokenizer: DeepSMILESTokenizer) -> Vocabulary:
    """
    Builds a DeepSMILES vocabulary from a text file containing one SMILES per line.

    Every non-empty line is passed through ``tokenizer.encode`` and the result
    through ``tokenizer.tokenize``; the resulting tokens are counted in the
    vocabulary. For every 1000th entry the encoded string and its token list
    are printed as debugging output.

    :param file_path: Path of the text file to read.
    :param tokenizer: Tokenizer used for encoding and tokenizing.
    :return: Vocabulary in which every token occurrence has been counted.
    """
    vocab = Vocabulary()

    with open(file_path, 'r') as handle:
        # Strip surrounding whitespace (line breaks included) and drop empty lines.
        entries = [entry for entry in map(str.strip, handle) if entry]

    for position, smiles in enumerate(entries):
        report = position % 1000 == 0  # print debugging output for this entry?

        encoded = tokenizer.encode(smiles)
        if report:
            print(f"DeepSMILES [{position}]: {encoded}")  # debugging output

        pieces = tokenizer.tokenize(encoded, with_begin_and_end=False)
        if report:
            print(f"Token List [{position}]: {pieces}")  # debugging output

        vocab.update(pieces)

    return vocab

# Path to the SMILES text file
file_path = 'C:\\Users\\SchockWav3\\Desktop\\Masterarbeit\\chembl_smiles.txt'

# Create a DeepSMILESTokenizer instance
tokenizer = DeepSMILESTokenizer()

# Build the vocabulary from the text file
vocabulary = create_vocabulary_from_file(file_path, tokenizer)

# Print the number of unique tokens and the tokens sorted by descending frequency
print(f'There are {len(vocabulary)} unique tokens in the vocabulary.\n')
print(f'The unique tokens are: \n{vocabulary.tokens()}')


DeepSMILES [0]: CCC)CC=O)OCC#N))cccccOcccccc6)))))))c6)))))))))ccccCl)cc6
Token List [0]: ['C', 'C', 'C', ')', 'C', 'C', '=', 'O', ')', 'O', 'C', 'C', '#', 'N', ')', ')', 'c', 'c', 'c', 'c', 'c', 'O', 'c', 'c', 'c', 'c', 'c', 'c', '6', ')', ')', ')', ')', ')', ')', ')', 'c', '6', ')', ')', ')', ')', ')', ')', ')', ')', ')', 'c', 'c', 'c', 'c', 'Cl', ')', 'c', 'c', '6']
DeepSMILES [1000]: FccccCCCCNCCNCCNcccccc6)))))))))CC6)))))))))ccccF)cc6)))))))cc6
Token List [1000]: ['F', 'c', 'c', 'c', 'c', 'C', 'C', 'C', 'C', 'N', 'C', 'C', 'N', 'C', 'C', 'N', 'c', 'c', 'c', 'c', 'c', 'c', '6', ')', ')', ')', ')', ')', ')', ')', ')', ')', 'C', 'C', '6', ')', ')', ')', ')', ')', ')', ')', ')', ')', 'c', 'c', 'c', 'c', 'F', ')', 'c', 'c', '6', ')', ')', ')', ')', ')', ')', ')', 'c', 'c', '6']
DeepSMILES [2000]: CccccCO)CC)NCCCcccccc6))))))))))))cc6O
Token List [2000]: ['C', 'c', 'c', 'c', 'c', 'C', 'O', ')', 'C', 'C', ')', 'N', 'C', 'C', 'C', 'c', 'c', 'c', 'c', 'c', 'c', '6', ')', ')', ')', ')', ')

In [35]:
# After the vocabulary has been created: print every token with its frequency
print(f"Total unique tokens: {len(vocabulary)}")
print("Tokens and their frequencies:")
for token, frequency in vocabulary.frequencies().items():
    print(f"{token}: {frequency}")


Total unique tokens: 157
Tokens and their frequencies:
C: 82635
): 113298
=: 17437
O: 29965
#: 498
N: 14270
c: 85593
6: 16786
Cl: 3088
n: 7103
5: 4252
.: 2649
S: 2131
[C@]: 1026
[C@H]: 4492
[C@@H]: 4258
%10: 1950
[C@@]: 783
%14: 614
%17: 334
[nH]: 978
9: 1622
-: 1338
s: 552
[N+]: 753
[O-]: 1158
%16: 26
%12: 106
/: 1898
\: 530
%13: 235
%20: 33
[Cl-]: 107
P: 404
o: 585
F: 2706
[n+]: 160
8: 253
I: 78
[Na+]: 419
3: 426
[2H]: 98
%15: 200
%11: 264
[N-]: 66
7: 345
Br: 246
[Mg+2]: 38
[18F]: 26
%18: 43
[As]: 11
[S+]: 60
[22Na+]: 1
4: 249
[Ca+2]: 55
%25: 2
[PH]: 4
[S@@+]: 12
[Ba]: 1
[I+]: 6
[Al+3]: 14
[I-]: 33
[Br-]: 88
[K+]: 78
[Al]: 14
[Si]: 28
%21: 13
%19: 20
[Kr]: 1
[N@+]: 3
[123I]: 11
[Mg+]: 1
[18FH]: 1
[S-]: 8
[11C-]: 1
[O+]: 3
[131I]: 7
[AsH3]: 1
[K]: 5
[PH2]: 3
[P@]: 2
[Sr+2]: 3
[NH-]: 5
[Ba+2]: 2
[OH-]: 19
B: 27
[11CH3]: 8
[Cl+3]: 2
[n-]: 16
[Ca]: 2
[Zn+2]: 8
%24: 3
[S-2]: 2
[Li+]: 10
[81Kr]: 1
[75Se]: 1
[Se]: 8
[SrH2]: 1
[11C]: 3
[42K+]: 1
[C-]: 3
[85Sr+2]: 2
[O]: 2
[P+]: 2
[124I]: 1
[