In [1]:
!pip install --pre deepchem[tensorflow]

Collecting deepchem[tensorflow]
  Downloading deepchem-2.8.1.dev20240529195736-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting rdkit (from deepchem[tensorflow])
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-addons (from deepchem[tensorflow])
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons->deepchem[tensorflow])
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, rdkit, tensorflow-addons, deepchem
Successfully 

In [2]:
import deepchem as dc
import pandas as pd
import numpy as np
import csv
from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor
from sklearn.gaussian_process.kernels import  RationalQuadratic, WhiteKernel

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


In [3]:
# Mount Google Drive at /content/drive (Colab only); the saved models and CSV
# files referenced in this notebook are read from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#the model that predicts the states
def PredictState(Smile, Ts, descriptors):
    """Predict the state class of one molecule at one temperature.

    Parameters
    ----------
    Smile : str
        SMILES string of the molecule, handed to the RDKit featurizer.
    Ts : number
        Temperature value; appended as the last feature column.
    descriptors : list of str
        Names of the RDKit descriptors the classifier expects as features.

    Returns
    -------
    integer
        Position of the largest value in the model's prediction, i.e. the
        state the classifier considers most probable.
    """
    # Compute the requested descriptors; NaN and +inf are replaced by 0
    # (-inf keeps np.nan_to_num's default replacement).
    featurizer = dc.feat.RDKitDescriptors(descriptors=descriptors)
    raw_feats = featurizer.featurize(Smile)
    rdkit_feats = np.nan_to_num(raw_feats, copy=True, nan=0.0, posinf=0)

    # One sample row: all descriptors followed by the temperature.
    all_feats = np.append(rdkit_feats, Ts).reshape(1, -1)

    # Keep the single-row dataset in memory. DiskDataset.from_numpy would write
    # it to a fresh temporary directory on every call and never remove it.
    predict_dataset = dc.data.NumpyDataset(X=all_feats)

    #load the model
    # The estimator built here only fixes the model type; reload() is expected
    # to restore the fitted estimator stored in model_dir.
    kernel = 1 * RationalQuadratic()
    Class_model = dc.models.SklearnModel(
        GaussianProcessClassifier(kernel=kernel, n_restarts_optimizer=1),
        model_dir='/content/drive/MyDrive/Colab Notebooks/Master_Thesis/Predicting_T_dependency/Predicting_unknown_samples/SavedModels/SavedClassificationModel')
    Class_model.reload()

    #Predict the state
    prediction = Class_model.predict(predict_dataset)

    # The model outputs one probability per state; the state with the highest
    # probability is taken as "the" state of the molecule.
    state = np.argmax(prediction)

    return state

In [5]:
#the model that predicts the density of molecules classified as state 0
def PredictDensity_State0(Smile, Ts, descriptors):
    """Predict the density of one molecule with the saved state-0 regressor.

    Parameters
    ----------
    Smile : str
        SMILES string of the molecule, handed to the RDKit featurizer.
    Ts : number
        Temperature value; appended as the last feature column.
    descriptors : list of str
        Names of the RDKit descriptors the state-0 model expects as features.

    Returns
    -------
    The first predicted value, rounded to 3 decimals.
    """
    # Compute the requested descriptors; NaN and +inf are replaced by 0
    # (-inf keeps np.nan_to_num's default replacement).
    featurizer = dc.feat.RDKitDescriptors(descriptors=descriptors)
    raw_feats = featurizer.featurize(Smile)
    rdkit_feats = np.nan_to_num(raw_feats, copy=True, nan=0.0, posinf=0)

    # One sample row: all descriptors followed by the temperature.
    all_feats = np.append(rdkit_feats, Ts).reshape(1, -1)

    # Keep the single-row dataset in memory. DiskDataset.from_numpy would write
    # it to a fresh temporary directory on every call and never remove it.
    predict_dataset = dc.data.NumpyDataset(X=all_feats)

    #load the model
    # The estimator built here only fixes the model type; reload() is expected
    # to restore the fitted estimator stored in model_dir.
    kernel = 1 * RationalQuadratic() + WhiteKernel()
    State0_model = dc.models.SklearnModel(
        GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=1),
        model_dir='/content/drive/MyDrive/Colab Notebooks/Master_Thesis/Predicting_T_dependency/Predicting_unknown_samples/SavedModels/SavedModel_state0')
    State0_model.reload()

    #Predict the density
    prediction = State0_model.predict(predict_dataset)

    # Only one sample was submitted, so the first entry is its density.
    density = round(prediction[0], 3)

    return density

In [6]:
#the model that predicts the density of molecules classified as state 1
def PredictDensity_State1(Smile, Ts, descriptors):
    """Predict the density of one molecule with the saved state-1 regressor.

    Parameters
    ----------
    Smile : str
        SMILES string of the molecule, handed to the RDKit featurizer.
    Ts : number
        Temperature value; appended as the last feature column.
    descriptors : list of str
        Names of the RDKit descriptors the state-1 model expects as features.

    Returns
    -------
    The first predicted value, rounded to 3 decimals.
    """
    # Compute the requested descriptors; NaN and +inf are replaced by 0
    # (-inf keeps np.nan_to_num's default replacement).
    featurizer = dc.feat.RDKitDescriptors(descriptors=descriptors)
    raw_feats = featurizer.featurize(Smile)
    rdkit_feats = np.nan_to_num(raw_feats, copy=True, nan=0.0, posinf=0)

    # One sample row: all descriptors followed by the temperature.
    all_feats = np.append(rdkit_feats, Ts).reshape(1, -1)

    # Keep the single-row dataset in memory. DiskDataset.from_numpy would write
    # it to a fresh temporary directory on every call and never remove it.
    predict_dataset = dc.data.NumpyDataset(X=all_feats)

    #load the model
    # The estimator built here only fixes the model type; reload() is expected
    # to restore the fitted estimator stored in model_dir.
    kernel = 1 * RationalQuadratic() + WhiteKernel()
    State1_model = dc.models.SklearnModel(
        GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=1),
        model_dir='/content/drive/MyDrive/Colab Notebooks/Master_Thesis/Predicting_T_dependency/Predicting_unknown_samples/SavedModels/SavedModel_state1')
    State1_model.reload()

    #Predict the density
    prediction = State1_model.predict(predict_dataset)

    # Only one sample was submitted, so the first entry is its density.
    density = round(prediction[0], 3)

    return density

In [7]:
#the model that predicts the density of molecules classified as state 2
def PredictDensity_State2(Smile, Ts, descriptors):
    """Predict the density of one molecule with the saved state-2 regressor.

    Parameters
    ----------
    Smile : str
        SMILES string of the molecule, handed to the RDKit featurizer.
    Ts : number
        Temperature value; appended as the last feature column.
    descriptors : list of str
        Names of the RDKit descriptors the state-2 model expects as features.

    Returns
    -------
    The first predicted value, rounded to 3 decimals.
    """
    # Compute the requested descriptors; NaN and +inf are replaced by 0
    # (-inf keeps np.nan_to_num's default replacement).
    featurizer = dc.feat.RDKitDescriptors(descriptors=descriptors)
    raw_feats = featurizer.featurize(Smile)
    rdkit_feats = np.nan_to_num(raw_feats, copy=True, nan=0.0, posinf=0)

    # One sample row: all descriptors followed by the temperature.
    all_feats = np.append(rdkit_feats, Ts).reshape(1, -1)

    # Keep the single-row dataset in memory. DiskDataset.from_numpy would write
    # it to a fresh temporary directory on every call and never remove it.
    predict_dataset = dc.data.NumpyDataset(X=all_feats)

    #load the model
    # The estimator built here only fixes the model type; reload() is expected
    # to restore the fitted estimator stored in model_dir.
    kernel = 1 * RationalQuadratic() + WhiteKernel()
    State2_model = dc.models.SklearnModel(
        GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=1),
        model_dir='/content/drive/MyDrive/Colab Notebooks/Master_Thesis/Predicting_T_dependency/Predicting_unknown_samples/SavedModels/SavedModel_state2')
    State2_model.reload()

    #Predict the density
    prediction = State2_model.predict(predict_dataset)

    # Only one sample was submitted, so the first entry is its density.
    density = round(prediction[0], 3)

    return density

In [None]:
def main():
    """Predict state and density for every sample in UnknownSamples.csv.

    Steps:
      1. Load the RDKit descriptor names for the classifier and for each of
         the three density models from their CSV files ("descriptors" column).
      2. Read the samples file: a header row, then one row per sample with the
         SMILES string in column 0 and an integer temperature in column 1.
      3. For each sample, predict the state, then predict the density with the
         model belonging to that state, and print one summary line.

    The accuracy and uncertainty figures in the printed line are fixed values
    written into this function, not computed here.

    Raises
    ------
    IndexError
        If a non-blank sample row has fewer than two columns.
    ValueError
        If a temperature cannot be parsed as an integer.
    """
    base_dir = '/content/drive/MyDrive/Colab Notebooks/Master_Thesis/Predicting_T_dependency'
    feats_dir = base_dir + '/csv_files'
    samples_csv = base_dir + '/Predicting_unknown_samples/UnknownSamples.csv'

    def load_descriptors(filename):
        # Return the "descriptors" column of one feature-list CSV as a list.
        return pd.read_csv(feats_dir + '/' + filename)["descriptors"].to_list()

    #first load the right set of RDKit descriptors for each state/model
    print("Loading RDKit descriptors...")
    Classdescriptors = load_descriptors('classification_feats.csv')

    # state index -> (density predictor, its descriptor names, uncertainty text)
    density_models = {
        0: (PredictDensity_State0, load_descriptors('State0_feats.csv'), "0.562"),
        1: (PredictDensity_State1, load_descriptors('State1_feats.csv'), "135"),
        2: (PredictDensity_State2, load_descriptors('State2_feats.csv'), "19.2"),
    }

    #then load the samples that must be predicted
    samples = []
    # newline='' lets the csv module do its own newline handling.
    with open(samples_csv, 'r', newline='') as file:
        reader = csv.reader(file, delimiter=',')
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            samples.append((row[0], int(row[1])))

    # Now loop over the samples and make predictions for every one
    for molecule, temperature in samples:

        #start by predicting the state
        State = int(PredictState(molecule, temperature, Classdescriptors))

        if State not in density_models:
            # Report instead of silently dropping the sample.
            print(f"For {molecule} at {temperature}°C, State: {State} has no density model, no density predicted")
            continue

        predict_density, state_descriptors, uncertainty = density_models[State]
        dens = predict_density(molecule, temperature, state_descriptors)
        print(f"For {molecule} at {temperature}°C, State: {State} with 93.38% accuracy, Density: {dens} +- {uncertainty} kg/m^3")
# Run the prediction pipeline when this cell is executed.
main()

Loading RDKit descriptors...
For CCCCCCCCCCCCCCCCCCCCCCCCCCCC=C at 25°C, State: 0 (93.38% certainty), Density: 0.005 (+- 0.562 kg/m^3)
For CCCCCC(CCCCC(CCCCC)CCC(C)C)CCC(C)C at 25°C, State: 2 (93.38% certainty), Density: 973.139 (+- 19.2 kg/m^3)
For CCCCCC(CC)CCC(CCCCC)CCC(CC)CCCCC at 25°C, State: 2 (93.38% certainty), Density: 960.201 (+- 19.2 kg/m^3)
For CCCCCCCCCCCCCCCCCCCCCCCCCCCC=C at 500°C, State: 0 (93.38% certainty), Density: -0.131 (+- 0.562 kg/m^3)
For CCCCCC(CCCCC(CCCCC)CCC(C)C)CCC(C)C at 500°C, State: 0 (93.38% certainty), Density: 6.149 (+- 0.562 kg/m^3)
For CCCCCC(CC)CCC(CCCCC)CCC(CC)CCCCC at 500°C, State: 0 (93.38% certainty), Density: 5.635 (+- 0.562 kg/m^3)
