In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import copy

from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

In [2]:
#load data
x_pretrain = pd.read_csv("pretrain_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
y_pretrain = pd.read_csv("pretrain_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
x_train = pd.read_csv("train_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
y_train = pd.read_csv("train_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
x_test = pd.read_csv("test_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1)

In [3]:
class Net(nn.Module):
    """
    The model class, which defines our feature extractor used in pretraining.
    """
    def __init__(self):
        super(Net, self).__init__()
        self.fc = torch.nn.Sequential(
            nn.Linear(1000, 800),
            nn.BatchNorm1d(800),
            nn.Tanh(),
                        
            nn.Linear(800, 666),
            nn.BatchNorm1d(666),
            nn.Tanh(),
                        
            nn.Linear(666, 1),
        )

    def forward(self, x):
        x = self.fc(x)
        return x

# model declaration
model = Net()
model.train()

Net(
  (fc): Sequential(
    (0): Linear(in_features=1000, out_features=800, bias=True)
    (1): BatchNorm1d(800, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Tanh()
    (3): Linear(in_features=800, out_features=666, bias=True)
    (4): BatchNorm1d(666, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): Tanh()
    (6): Linear(in_features=666, out_features=1, bias=True)
  )
)

In [4]:
#pretraining function
def pretraining(epochs_pretrain,optimizer):
    print("start pretraining")
    for epoch in range(epochs_pretrain):
        model.train()  # Set the model to training mode
        epoch_loss = 0.0

        # Create data loader for training set
        train_dataset = TensorDataset(x_ptr, y_ptr)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        for inputs, labels in train_loader:
            optimizer.zero_grad()  # Zero the gradients
            outputs = torch.squeeze(model(inputs))
            loss = torch.sqrt(loss_fn(outputs, labels))
            loss.backward()
            optimizer.step()
            #print(loss)

            epoch_loss += loss.item() * inputs.size(0)

        # Calculate average loss for the epoch
        avg_loss = epoch_loss / len(train_dataset)

        # Evaluation on the validation set
        model.eval()  # Set the model to evaluation mode
        val_outputs = torch.squeeze(model(x_val))
        val_loss = torch.sqrt(loss_fn(val_outputs, y_val))

        print(f"Epoch {epoch+1}/{epochs_pretrain} - Train Loss: {avg_loss:.4f} - Val Loss: {val_loss:.4f}")


In [13]:
#get pretraining

#parameter
batch_size=256
eval_size=2000
epochs_pretrain = 165
epochs_pretrain2 = 50
lr_pretrain = 0.001
loss_fn = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr = lr_pretrain)
optimizer2 = optim.AdamW(model.parameters(), lr = 0.0001)

#prepare data for pretraining
x_ptr, x_val, y_ptr, y_val = train_test_split(x_pretrain, y_pretrain, test_size=eval_size, random_state=0, shuffle=True)
x_ptr, x_val = torch.tensor(x_ptr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)
y_ptr, y_val = torch.tensor(y_ptr, dtype=torch.float), torch.tensor(y_val, dtype=torch.float)

#pretraining(epochs_pretrain,optimizer)
pretraining(epochs_pretrain2,optimizer2)

start pretraining
Epoch 1/50 - Train Loss: 0.0142 - Val Loss: 0.0444
Epoch 2/50 - Train Loss: 0.0126 - Val Loss: 0.0441
Epoch 3/50 - Train Loss: 0.0126 - Val Loss: 0.0445
Epoch 4/50 - Train Loss: 0.0134 - Val Loss: 0.0439
Epoch 5/50 - Train Loss: 0.0129 - Val Loss: 0.0438
Epoch 6/50 - Train Loss: 0.0124 - Val Loss: 0.0438
Epoch 7/50 - Train Loss: 0.0123 - Val Loss: 0.0440
Epoch 8/50 - Train Loss: 0.0116 - Val Loss: 0.0443
Epoch 9/50 - Train Loss: 0.0112 - Val Loss: 0.0441
Epoch 10/50 - Train Loss: 0.0118 - Val Loss: 0.0443
Epoch 11/50 - Train Loss: 0.0123 - Val Loss: 0.0438
Epoch 12/50 - Train Loss: 0.0119 - Val Loss: 0.0438
Epoch 13/50 - Train Loss: 0.0116 - Val Loss: 0.0439
Epoch 14/50 - Train Loss: 0.0110 - Val Loss: 0.0437
Epoch 15/50 - Train Loss: 0.0115 - Val Loss: 0.0438
Epoch 16/50 - Train Loss: 0.0117 - Val Loss: 0.0442
Epoch 17/50 - Train Loss: 0.0114 - Val Loss: 0.0440
Epoch 18/50 - Train Loss: 0.0116 - Val Loss: 0.0447
Epoch 19/50 - Train Loss: 0.0116 - Val Loss: 0.0439
Epo

In [9]:
#create copy model: model_test
model_trans = copy.deepcopy(model)
print(model_trans.fc)
#"""
model_trans.fc.pop(6)
model_trans.fc.pop(5)
model_trans.fc.pop(4)
print(model_trans.fc)
#"""

Sequential(
  (0): Linear(in_features=1000, out_features=800, bias=True)
  (1): BatchNorm1d(800, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Tanh()
  (3): Linear(in_features=800, out_features=666, bias=True)
  (4): BatchNorm1d(666, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): Tanh()
  (6): Linear(in_features=666, out_features=1, bias=True)
)
Sequential(
  (0): Linear(in_features=1000, out_features=800, bias=True)
  (1): BatchNorm1d(800, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Tanh()
  (3): Linear(in_features=800, out_features=666, bias=True)
)


In [10]:
#use model_trans to preprocess x_train,x_test
x_tr = torch.tensor(x_train, dtype=torch.float)
x_ts = torch.tensor(x_test.to_numpy(), dtype=torch.float)

#transform x_tr, x_ts into 
x_tr_trans = model_trans(x_tr).detach().numpy() #transform x_train
x_ts_trans = model_trans(x_ts).detach().numpy() #transform x_test
x_tr_trans = StandardScaler().fit_transform(x_tr_trans)
x_ts_trans = StandardScaler().fit_transform(x_ts_trans)

In [11]:
#find regression
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RBF, Matern, RationalQuadratic, PairwiseKernel
from sklearn.metrics import r2_score
from scipy import optimize
from xgboost import XGBRegressor

def myOptimizer(obj_func,initial_theta,bounds):
    x,f,d = optimize.fmin_l_bfgs_b(obj_func,initial_theta, bounds=bounds, maxfun=15000,maxiter=15000)
    return x,f 

alpha = 0.01
n_restarts_optimizer = 10
np.random.seed(0)

clf = LinearRegression()
gpr = GaussianProcessRegressor(kernel=None, alpha=alpha,optimizer = myOptimizer, n_restarts_optimizer=n_restarts_optimizer)
gpr_PK = GaussianProcessRegressor(kernel=PairwiseKernel(), alpha=alpha,optimizer = myOptimizer, n_restarts_optimizer=n_restarts_optimizer)
gpr_RQ = GaussianProcessRegressor(kernel=RationalQuadratic(), alpha=alpha,optimizer = myOptimizer, n_restarts_optimizer=n_restarts_optimizer)
gpr_DP = GaussianProcessRegressor(kernel=DotProduct(), alpha=alpha, optimizer = myOptimizer, n_restarts_optimizer=n_restarts_optimizer)

gpr_DW = GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel(), alpha=alpha, optimizer = myOptimizer, n_restarts_optimizer=n_restarts_optimizer)
xgbr = XGBRegressor(n_estimators = 200, max_depth = 3, learning_rate = 0.1, gamma =0.01, random_state = 1002)

from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_tr_trans, y_train, test_size=0.15, random_state=42)

print('kernel= none')
gpr.fit(x_train_split, y_train_split)
y_pred = gpr.predict(x_test_split)
print(r2_score(y_test_split, y_pred))

print('kernel= DotProduct() + WhiteKernel()')
gpr_DW.fit(x_train_split, y_train_split)
y_pred = gpr_DW.predict(x_test_split)
print(r2_score(y_test_split, y_pred))

print('kernel= Matern')
gpr_M = GaussianProcessRegressor(kernel=Matern(), alpha=alpha, optimizer = myOptimizer, n_restarts_optimizer=n_restarts_optimizer)
gpr_M.fit(x_train_split, y_train_split)
y_pred = gpr_M.predict(x_test_split)
print(r2_score(y_test_split, y_pred))

print('kernel= DotProduct()')
gpr_DP.fit(x_train_split, y_train_split)
y_pred = gpr_DP.predict(x_test_split)
print(r2_score(y_test_split, y_pred))

print('kernel= RBF')
gpr_RBF = GaussianProcessRegressor(kernel=RBF(), alpha=alpha, optimizer = myOptimizer, n_restarts_optimizer=n_restarts_optimizer)
gpr_RBF.fit(x_train_split, y_train_split)
y_pred = gpr_RBF.predict(x_test_split)
print(r2_score(y_test_split, y_pred))

print('RationalQuadratic')
gpr_RQ.fit(x_train_split, y_train_split)
y_pred = gpr_RQ.predict(x_test_split)
print(r2_score(y_test_split, y_pred))

print('LinearRegression')
clf.fit(x_train_split, y_train_split)
y_pred = clf.predict(x_test_split)
print(r2_score(y_test_split, y_pred))

print('kernel= PairwiseKernel')
gpr_PK.fit(x_train_split, y_train_split)
y_pred = gpr_PK.predict(x_test_split)
print(r2_score(y_test_split, y_pred)) 

print('XRGB')
xgbr.fit(x_train_split,y_train_split)
y_pred = xgbr.predict(x_test_split)
print(r2_score(y_test_split, y_pred))

kernel= none
-33.928342323777244
kernel= DotProduct() + WhiteKernel()




0.6414806403215638
kernel= Matern
0.7473394737355064
kernel= DotProduct()
0.6414067518084992
kernel= RBF
0.7472415804829806
RationalQuadratic
0.7455283781442483
LinearRegression
0.6475436822541492
kernel= PairwiseKernel
-1697.073582619434
XRGB
0.5358578302215052


In [57]:
#final choosen regression
alpha = 0.01
gpr_RBF = GaussianProcessRegressor(kernel=RBF(), alpha=alpha, optimizer = myOptimizer, n_restarts_optimizer=n_restarts_optimizer)
gpr_RBF.fit(x_tr_trans,y_train)
y_pred = gpr_RBF.predict(x_ts_trans)
print(y_pred[:5])

[1.79091019 2.02977916 1.69006001 2.34312941 1.76000909]


In [58]:
#final
x_test = pd.read_csv("test_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1)
assert y_pred.shape == (x_test.shape[0],)
y_pred = pd.DataFrame({"y": y_pred}, index=x_test.index)
y_pred.to_csv("results.csv", index_label="Id")
print("Predictions saved, all done!")

Predictions saved, all done!
