In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import torch.nn as nn
import torch
from torch.nn import MSELoss
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import r2_score, root_mean_squared_error
import pickle
from ray import train
from ray import tune
from ray.tune.schedulers import ASHAScheduler
import os
from functools import partial
from ray.tune.search.hyperopt import HyperOptSearch

2025-01-10 22:30:35,308	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.8.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-01-10 22:30:35,440	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.8.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
abs_path = os.path.abspath('.')+'\\table_with_desriptors.csv'

In [3]:
def load_data(abs_path = abs_path):
    descriptors = pd.read_csv(abs_path, index_col = 0)
    descriptors = descriptors[~descriptors['omega'].isnull()]
    y = descriptors['omega']
    descriptors = descriptors.drop(columns = ['SMILES', 'Tc', 'Pc', 'omega', 'mol'])
    quant = y.quantile(q = 0.95)
    mask = y < quant
    y = y[mask]
    descriptors = descriptors[mask]
   
    X_train, X_test, y_train, y_test = train_test_split(descriptors, y, test_size = 0.15, random_state=0)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.15, random_state = 0)
    columns_to_scale = list(X_train.columns[:156])
    ct = ColumnTransformer([('Scaler', MinMaxScaler(),columns_to_scale)], remainder= 'passthrough')
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)
    X_valid = ct.transform(X_valid)
    X_train_ds = TensorDataset(torch.tensor(X_train, dtype = torch.float32), torch.tensor(y_train.values, dtype = torch.float32))
    X_test_ds = TensorDataset(torch.tensor(X_test, dtype = torch.float32), torch.tensor(y_test.values, dtype = torch.float32))
    X_valid_ds = TensorDataset(torch.tensor(X_valid, dtype = torch.float32), torch.tensor(y_valid.values, dtype = torch.float32))
    return X_train_ds, X_valid_ds, X_test_ds, ct

In [4]:
res = load_data()
X_train, X_valid, X_test = res[0], res[1], res[2]

In [5]:
class MyModel(nn.Module):
    def __init__(self, l1 = 2204, l2 = 2204):
        super().__init__()
        self.linear_1 = nn.Linear(2204, l1)
        self.a1 = nn.CELU(alpha = 0.01)
        self.dropout_1 = nn.Dropout(p = 0.3)
        self.linear_2 = nn.Linear(l1, l2)
        self.a2 = nn.CELU(alpha = 0.01)
        self.dropout_2 = nn.Dropout(p = 0.3)
        self.linear_3 = nn.Linear(l2, 1)

    def forward(self, x):
        x = self.linear_1(x)
        x = self.a1(x)
        x = self.dropout_1(x)
        x = self.linear_2(x)
        x = self.a2(x)
        x = self.dropout_2(x)
        x = self.linear_3(x)
        return x


In [6]:
def train_model(config, abs_path = abs_path, is_tune = True):
    torch.manual_seed(1)
    model = MyModel(config['l1'], config['l2'])
    model.to('cuda:0')
    loss_fn = MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), config['lr'], weight_decay = 1e-4)
    data = load_data()
    trainset, validset, testset = data[0], data[1], data[2]
    column_transformer = data[3]
    if is_tune:
        X_train_dl = DataLoader(trainset, shuffle = True, batch_size = 24)
        X_valid_dl = DataLoader(validset, shuffle = True, batch_size = 24)
    else:
        X_train_dl = DataLoader(torch.utils.data.ConcatDataset([trainset, validset]), shuffle = True, batch_size = 24)
    for epoch in range(200):
        model.train()
        train_loss = 0
        for X_b, y_b in X_train_dl:
            X_b = X_b.to('cuda:0')
            y_b = y_b.to('cuda:0')
            optimizer.zero_grad()
            pred = model(X_b)
            loss = loss_fn(pred.squeeze(), y_b)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        model.eval()
        valid_loss = 0
        r2_valid = 0
        rmse_valid = 0
        valid_steps = 0
        if is_tune:
            for X_b, y_b in X_valid_dl:
                X_b = X_b.to('cuda:0')
                y_b = y_b.to('cuda:0')
                pred = model(X_b)
                loss = loss_fn(pred.squeeze(), y_b)
                valid_loss += loss.detach().cpu().numpy()
                r2_valid += r2_score(pred.squeeze().detach().cpu().numpy(), y_b.cpu().numpy())
                rmse_valid += root_mean_squared_error(pred.squeeze().detach().cpu().numpy(), y_b.cpu().numpy())
                valid_steps += 1
            
        if is_tune:
            train.report(
                {"loss": valid_loss / valid_steps, "r2":r2_valid / valid_steps, "rmse":rmse_valid / valid_steps})
    print('Finished training')
    if not is_tune:
        return model, column_transformer

In [7]:
max_num_epochs = 200
scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=5,
        reduction_factor=2,
)

In [8]:
config = {
    "l1": tune.qrandint(100, 3000, 100),
    "l2": tune.qrandint(100, 3000, 100),
    "lr": tune.loguniform(1e-4, 1e-2)}

In [9]:
hyperopt = HyperOptSearch(metric = 'loss', mode = 'min')

In [10]:
result = tune.run(partial(train_model), search_alg= hyperopt, config = config, num_samples = 50, scheduler = scheduler, resources_per_trial={"gpu": 1})

2025-01-10 22:30:59,191	INFO worker.py:1810 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
2025-01-10 22:31:02,827	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2025-01-10 22:31:02,829	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2025-01-10 22:51:08
Running for:,00:20:05.36
Memory:,13.6/31.9 GiB

Trial name,status,loc,l1,l2,lr,iter,total time (s),loss,r2,rmse
train_model_4979ee72,TERMINATED,127.0.0.1:9996,1500,500,0.00189489,200,118.399,0.0064176,0.703312,0.0795358
train_model_9b59712b,TERMINATED,127.0.0.1:19704,700,1000,0.000164888,200,113.792,0.00349067,0.864177,0.0570092
train_model_55701991,TERMINATED,127.0.0.1:18348,200,1900,0.00209532,5,3.26982,0.00575784,0.775925,0.0739432
train_model_9a447c21,TERMINATED,127.0.0.1:3644,2100,1300,0.00112651,10,7.79467,0.00568612,0.804213,0.0740547
train_model_173c807c,TERMINATED,127.0.0.1:12520,1600,1200,0.00320251,5,4.1351,0.00968606,0.784124,0.096908
train_model_f24e80cb,TERMINATED,127.0.0.1:19204,1500,300,0.000443712,5,3.89522,0.00588142,0.766374,0.0751456
train_model_a4b93e66,TERMINATED,127.0.0.1:19584,2700,900,0.000105252,200,145.528,0.00347051,0.875234,0.0568117
train_model_814c5e07,TERMINATED,127.0.0.1:6924,2000,900,0.00445917,5,4.27834,0.00783922,0.648862,0.0876426
train_model_bd91a5c5,TERMINATED,127.0.0.1:632,1000,1700,0.00803663,5,3.80478,0.00702321,0.611356,0.0817575
train_model_605ad371,TERMINATED,127.0.0.1:17148,500,500,0.00477107,5,3.41635,0.0111817,0.341739,0.104694


Trial name,loss,r2,rmse
train_model_0298399d,0.0064393,0.720147,0.0783241
train_model_03eb9393,0.0061874,0.751546,0.0768095
train_model_057d2cf9,0.00495173,0.818462,0.0677481
train_model_173c807c,0.00968606,0.784124,0.096908
train_model_1b12721f,0.00473166,0.816345,0.0667979
train_model_1c58c16c,0.00675041,0.810519,0.080492
train_model_1d88d041,0.00613697,0.774627,0.0772013
train_model_44678687,0.00548432,0.792225,0.0718124
train_model_4979ee72,0.0064176,0.703312,0.0795358
train_model_4af76339,0.0088906,0.449816,0.0934966


[36m(func pid=9996)[0m D:\bld\apache-arrow_1692865689659\work\cpp\src\arrow\filesystem\s3fs.cc:2829:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.  This could lead to a segmentation fault at exit
[36m(func pid=19704)[0m D:\bld\apache-arrow_1692865689659\work\cpp\src\arrow\filesystem\s3fs.cc:2829:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.  This could lead to a segmentation fault at exit
[36m(func pid=18348)[0m D:\bld\apache-arrow_1692865689659\work\cpp\src\arrow\filesystem\s3fs.cc:2829:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.  This could lead to a segmentation fault at exit
[36m(func pid=3644)[0m D:\bld\apache-arrow_1692865689659\work\cpp\src\arrow\filesystem\s3fs.cc:2829:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.  This could lead to a segmentation fault at exit
[36m(func pid=12520)[0m D:\bld\apache-arrow_1692865689659\work\cpp\src\arrow\filesystem\s3fs.cc:2829:  arrow

In [11]:
with open('run_result.pickle', 'wb') as output:
    pickle.dump(result, output)

In [18]:
with open('run_result.pickle', 'rb') as inp:
    result = pickle.load(inp)

In [12]:
best_config = result.get_best_config(metric = 'rmse', mode = 'min')

In [14]:
model_omega, column_transformer = train_model(config = best_config, is_tune = False)

Finished training


In [23]:
model_omega.eval()
pred = model_omega(X_test.tensors[0].to('cuda:0')).squeeze().detach().cpu().numpy()
print('R2 score is {}.'.format(r2_score(pred, X_test.tensors[1].cpu().numpy())))
print('RMSE score is {}.'.format(root_mean_squared_error(pred, X_test.tensors[1].cpu().numpy())))

R2 score is 0.7974129915237427.
RMSE score is 0.07805970311164856.


In [16]:
torch.save(model_omega, 'omega_model.pth')

In [17]:
with open('column_transformer_omega.pickle', 'wb') as output:
    pickle.dump(column_transformer, output)