In [1]:
!pip install --pre deepchem[tensorflow]

Collecting deepchem[tensorflow]
  Downloading deepchem-2.8.1.dev20240517170323-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting rdkit (from deepchem[tensorflow])
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-addons (from deepchem[tensorflow])
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons->deepchem[tensorflow])
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, rdkit, tensorflow-addons, deepchem
Successfully i

In [2]:
import deepchem as dc
dc.__version__

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


'2.8.1.dev'

In [3]:
import pandas as pd
import numpy as np
import math
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

In [4]:
# Mount Google Drive at /content/drive; the CSV paths read in main() live
# under /content/drive/MyDrive. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
@ignore_warnings(category=ConvergenceWarning)
def main(n_rounds=10, seed=None):
  """Train and evaluate a Gaussian-process density model over repeated random splits.

  Pipeline:
    1. Read the list of RDKit descriptor names from ``State1_feats.csv``.
    2. Featurize the ``smiles`` column of ``Combined_dens_Dataset_WithState1.csv``
       with those descriptors, using the ``density`` column as the target.
    3. Append the ``temp`` column of the same CSV as one extra feature.
    4. For each round: random 50/25/25 train/valid/test split, fit a
       ``GaussianProcessRegressor`` (RationalQuadratic + WhiteKernel), and
       record the Pearson R^2 on every split plus the test RMSE.
    5. Print the scores averaged over all rounds.

  ConvergenceWarning raised by scikit-learn during fitting is suppressed by
  the decorator.

  Args:
    n_rounds: Number of split/fit/evaluate repetitions. Defaults to 10.
    seed: Optional integer. When given, round ``i`` uses ``seed + i`` for both
      the splitter and the GPR optimizer restarts, which makes the run
      reproducible. ``None`` (default) leaves both unseeded.

  Returns:
    None. Results are printed to stdout.

  Raises:
    ValueError: If ``n_rounds`` is smaller than 1, or if the number of
      featurized rows differs from the number of temperature rows.
  """
  # Imported once here rather than on every loop iteration; only the kernels
  # that are actually used are pulled in.
  from sklearn.gaussian_process import GaussianProcessRegressor
  from sklearn.gaussian_process.kernels import RationalQuadratic, WhiteKernel
  from sklearn.metrics import mean_squared_error

  if n_rounds < 1:
    raise ValueError("n_rounds must be at least 1")

  csv_dir = '/content/drive/MyDrive/Colab Notebooks/Master_Thesis/Predicting_T_dependency/csv_files/'
  feats_csv = csv_dir + 'State1_feats.csv'
  data_csv = csv_dir + 'Combined_dens_Dataset_WithState1.csv'

  # load the descriptors
  featsDf = pd.read_csv(feats_csv)
  descriptors = featsDf["descriptors"].to_list()

  # load the dataset (polymer smiles and their density)
  print("loading the data...")
  loader = dc.data.CSVLoader(["density"], feature_field="smiles", featurizer=dc.feat.RDKitDescriptors(descriptors = descriptors))
  Data = loader.create_dataset(data_csv)

  # Some RDKit descriptors return nan or +/-inf; make all of these 0.
  # neginf must be given explicitly: by default np.nan_to_num maps -inf to the
  # most negative finite float instead of 0.
  X = np.nan_to_num(Data.X, copy=True, nan=0.0, posinf=0.0, neginf=0.0)
  print("RDKit:",X.shape)

  # now load the simulation temperature
  df = pd.read_csv(data_csv)
  Ts = df["temp"].to_numpy()

  # The temperatures come from a second, independent read of the CSV, so the
  # rows only line up if the loader kept every row. Fail with a clear message
  # instead of an opaque shape error from column_stack.
  if len(Ts) != X.shape[0]:
    raise ValueError(
        "featurized rows (%d) and temperature rows (%d) differ; "
        "cannot align temperatures with descriptors" % (X.shape[0], len(Ts)))

  # combine the RDKit descriptors with the simulation temperature
  features = np.column_stack((X,Ts))
  print("With TS added:",features.shape)

  # add data to dataset
  Dataset = dc.data.DiskDataset.from_numpy(X=features, y=Data.y, w=Data.w, ids=Data.ids, tasks = ["density"])

  # initiate lists to keep the results
  train_r2scores = []
  valid_r2scores = []
  test_r2scores = []
  RMSE_scores = []

  # The splitter and metric carry no per-round state, so build them once.
  splitter = dc.splits.RandomSplitter()
  metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

  # now execute the rounds
  for i in range(n_rounds):
    print("round:", i, "out of", n_rounds - 1)

    # None keeps the original unseeded behaviour; otherwise every round gets
    # its own deterministic seed so the splits still differ between rounds.
    round_seed = None if seed is None else seed + i

    # split the dataset using the random splitter
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        Dataset, frac_train = 0.5, frac_valid= 0.25, frac_test = 0.25, seed=round_seed)
    print("Data is splitted into: train, valid, test")

    # create the GPR model & fit the model (a fresh model for every round)
    kernel = 1 * RationalQuadratic() + WhiteKernel()
    model = dc.models.SklearnModel(
        GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=5, random_state=round_seed))

    print("fitting model...")
    model.fit(train_dataset)
    print("model is fitted")

    # predict the test set
    predicted = model.predict(test_dataset)

    # calculate r2 scores; evaluate() returns a dict, of which the first
    # (only requested) value is the score
    train_r2score = model.evaluate(train_dataset, metric)
    valid_r2score = model.evaluate(valid_dataset, metric)
    test_r2score = model.evaluate(test_dataset, metric)

    testr2 = list(test_r2score.values())[0]
    validr2 = list(valid_r2score.values())[0]
    trainr2 = list(train_r2score.values())[0]

    # calculate RMSE score on the test set
    MSE_score = mean_squared_error(test_dataset.y,predicted)
    RMSE_score = math.sqrt(MSE_score)

    # add them to the lists
    train_r2scores.append(trainr2)
    valid_r2scores.append(validr2)
    test_r2scores.append(testr2)
    RMSE_scores.append(RMSE_score)

  # average the results and print to screen
  print("average training r2-score:",round(np.mean(train_r2scores),3))
  print("average valid r2-score:",round(np.mean(valid_r2scores),3) )
  print("average test r2-score:",round(np.mean(test_r2scores),3) )
  print("average test RMSE-score:",round(np.mean(RMSE_scores),4) )

  return None


In [8]:
# Run the full pipeline; progress and the averaged scores are printed below.
main()

loading the data...
RDKit: (693, 40)
With TS added: (693, 41)
round: 0 out of 9
Data is splitted into: train, valid, test
fitting model...
model is fitted
round: 1 out of 9
Data is splitted into: train, valid, test
fitting model...
model is fitted
round: 2 out of 9
Data is splitted into: train, valid, test
fitting model...
model is fitted
round: 3 out of 9
Data is splitted into: train, valid, test
fitting model...
model is fitted
round: 4 out of 9
Data is splitted into: train, valid, test
fitting model...
model is fitted
round: 5 out of 9
Data is splitted into: train, valid, test
fitting model...
model is fitted
round: 6 out of 9
Data is splitted into: train, valid, test
fitting model...
model is fitted
round: 7 out of 9
Data is splitted into: train, valid, test
fitting model...
model is fitted
round: 8 out of 9
Data is splitted into: train, valid, test
fitting model...
model is fitted
round: 9 out of 9
Data is splitted into: train, valid, test
fitting model...
model is fitted
average 