In [3]:
!pip install transformers==4.44.2 joblib==1.4.2 scikit-learn==1.6.0 numpy==1.26.4 pandas==2.2.3 scipy==1.13.1 seaborn==0.13.2 tqdm==4.66.5 

Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib==1.4.2
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn==1.6.0
  Downloading scikit_learn-1.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy==1.13.1
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seaborn==0.13.2
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting tqdm==4.66.5
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0

In [4]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, Dataset


# Load datasets
train_df = pd.read_csv('/kaggle/input/caco2-set/Train_Caco2.csv')
train_df = train_df[['ID', 'SMILES', 'Permeability']]
test_df = pd.read_csv('/kaggle/input/caco2-set/Test_Caco2.csv')
test_df = test_df[['ID', 'SMILES', 'Permeability']]

In [5]:
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
model = AutoModelForSequenceClassification.from_pretrained('seyonec/PubChem10M_SMILES_BPE_450k', num_labels=1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/515 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/336M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Custom dataset class
class SMILESDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=325):
        self.tokenizer = tokenizer
        self.dataframe = dataframe
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        smiles = self.dataframe.iloc[idx]['SMILES']
        permeability = self.dataframe.iloc[idx]['Permeability']
        inputs = self.tokenizer(smiles, return_tensors='pt', padding="max_length", truncation=True, max_length=self.max_length)
        
        input_ids = inputs['input_ids'].squeeze(0)  # Shape: (sequence_length,)
        attention_mask = inputs['attention_mask'].squeeze(0)  # Shape: (sequence_length,)
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(permeability, dtype=torch.float)
        }

In [7]:
# datasets
train_dataset = SMILESDataset(train_df, tokenizer)
test_dataset = SMILESDataset(test_df, tokenizer)
batch_size = 16
# data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [8]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 20

In [9]:
# Training loop
from tqdm import tqdm
for epoch in range(num_epochs):
    print(f"Entered Epoch {epoch + 1}")
    model.train()
    train_loss = 0

    for batch in tqdm(train_loader, desc=f'Training Epoch {epoch + 1}/{num_epochs}', unit='batch'):
        optimizer.zero_grad()

        # Move all batch tensors to device
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch["labels"].unsqueeze(1)  # still shape: (batch_size, 1)

        # Forward pass
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=labels
        )
        loss = outputs.loss
        train_loss += loss.item()

        # Backprop and optimizer step
        loss.backward()
        optimizer.step()

    avg_train_loss = train_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_epochs} - Train Loss: {avg_train_loss:.4f}')

Entered Epoch 1


Training Epoch 1/20: 100%|██████████| 63/63 [00:27<00:00,  2.29batch/s]


Epoch 1/20 - Train Loss: 2.1550
Entered Epoch 2


Training Epoch 2/20: 100%|██████████| 63/63 [00:27<00:00,  2.26batch/s]


Epoch 2/20 - Train Loss: 0.5417
Entered Epoch 3


Training Epoch 3/20: 100%|██████████| 63/63 [00:28<00:00,  2.18batch/s]


Epoch 3/20 - Train Loss: 0.4104
Entered Epoch 4


Training Epoch 4/20: 100%|██████████| 63/63 [00:30<00:00,  2.09batch/s]


Epoch 4/20 - Train Loss: 0.3314
Entered Epoch 5


Training Epoch 5/20: 100%|██████████| 63/63 [00:30<00:00,  2.07batch/s]


Epoch 5/20 - Train Loss: 0.3225
Entered Epoch 6


Training Epoch 6/20: 100%|██████████| 63/63 [00:29<00:00,  2.11batch/s]


Epoch 6/20 - Train Loss: 0.2700
Entered Epoch 7


Training Epoch 7/20: 100%|██████████| 63/63 [00:30<00:00,  2.10batch/s]


Epoch 7/20 - Train Loss: 0.2585
Entered Epoch 8


Training Epoch 8/20: 100%|██████████| 63/63 [00:30<00:00,  2.09batch/s]


Epoch 8/20 - Train Loss: 0.1930
Entered Epoch 9


Training Epoch 9/20: 100%|██████████| 63/63 [00:29<00:00,  2.10batch/s]


Epoch 9/20 - Train Loss: 0.1869
Entered Epoch 10


Training Epoch 10/20: 100%|██████████| 63/63 [00:30<00:00,  2.10batch/s]


Epoch 10/20 - Train Loss: 0.1891
Entered Epoch 11


Training Epoch 11/20: 100%|██████████| 63/63 [00:30<00:00,  2.09batch/s]


Epoch 11/20 - Train Loss: 0.1714
Entered Epoch 12


Training Epoch 12/20: 100%|██████████| 63/63 [00:30<00:00,  2.09batch/s]


Epoch 12/20 - Train Loss: 0.1350
Entered Epoch 13


Training Epoch 13/20: 100%|██████████| 63/63 [00:30<00:00,  2.09batch/s]


Epoch 13/20 - Train Loss: 0.1271
Entered Epoch 14


Training Epoch 14/20: 100%|██████████| 63/63 [00:30<00:00,  2.09batch/s]


Epoch 14/20 - Train Loss: 0.1141
Entered Epoch 15


Training Epoch 15/20: 100%|██████████| 63/63 [00:30<00:00,  2.09batch/s]


Epoch 15/20 - Train Loss: 0.1114
Entered Epoch 16


Training Epoch 16/20: 100%|██████████| 63/63 [00:30<00:00,  2.09batch/s]


Epoch 16/20 - Train Loss: 0.0917
Entered Epoch 17


Training Epoch 17/20: 100%|██████████| 63/63 [00:30<00:00,  2.10batch/s]


Epoch 17/20 - Train Loss: 0.0950
Entered Epoch 18


Training Epoch 18/20: 100%|██████████| 63/63 [00:30<00:00,  2.10batch/s]


Epoch 18/20 - Train Loss: 0.0904
Entered Epoch 19


Training Epoch 19/20: 100%|██████████| 63/63 [00:29<00:00,  2.10batch/s]


Epoch 19/20 - Train Loss: 0.0903
Entered Epoch 20


Training Epoch 20/20: 100%|██████████| 63/63 [00:29<00:00,  2.10batch/s]

Epoch 20/20 - Train Loss: 0.1015





In [10]:
# Saving the model after training
model_name = 'PubChem10M_SMILES_BPE_450k_model_1_caco2'
model_save_path = f'/kaggle/working/{model_name}'
os.makedirs(model_save_path, exist_ok=True)

tokenizer.save_pretrained(model_save_path)
model.save_pretrained(model_save_path)

print(f'Model and tokenizer saved to {model_save_path}')

Model and tokenizer saved to /kaggle/working/PubChem10M_SMILES_BPE_450k_model_1_caco2


In [11]:
from scipy.stats import pearsonr, spearmanr

model.eval()
test_loss = 0
test_true_labels = []
predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing', unit='batch'):
      
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].unsqueeze(1).to(device).float()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        test_loss += loss.item()

        test_true_labels.extend(labels.cpu().numpy())
        preds = outputs.logits.squeeze().cpu().numpy()  
        predictions.extend(preds)

# Final test loss
avg_test_loss = test_loss / len(test_loader)
print(f'Test Loss: {avg_test_loss:.4f}')

test_true_labels = np.array(test_true_labels).flatten()
predictions = np.array(predictions)
print(test_true_labels.shape)
print(predictions.shape)

# Performance metrics
mse = mean_squared_error(test_true_labels, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(test_true_labels, predictions)
r2 = r2_score(test_true_labels, predictions)
PCC,_ = pearsonr(test_true_labels, predictions)
SCC,_ = spearmanr(test_true_labels, predictions)

# Print performance metrics
print(f'Mean Squared Error: {mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')
print(f'Mean Absolute Error: {mae:.4f}')
print(f'R^2 Score: {r2:.4f}')
print(f'Pearson Correlation Coefficient: {PCC:.4f}')
print(f'Spearman Correlation Coefficient: {SCC:.4f}')

# Print hyperparameters
print("Hyperparameters:")
print(f"Learning Rate: {5e-5}")
print(f"Batch Size: 16")
print(f"Epochs: {num_epochs}")

Testing: 100%|██████████| 16/16 [00:02<00:00,  6.17batch/s]

Test Loss: 0.3848
(252,)
(252,)
Mean Squared Error: 0.3807
Root Mean Squared Error: 0.6170
Mean Absolute Error: 0.4834
R^2 Score: 0.3422
Pearson Correlation Coefficient: 0.7487
Spearman Correlation Coefficient: 0.7293
Hyperparameters:
Learning Rate: 5e-05
Batch Size: 16
Epochs: 20





In [12]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

model_name = 'PubChem10M_SMILES_BPE_450k_model_1_caco2'
model_save_path = f'/kaggle/working/{model_name}'

if not os.path.exists(model_save_path):
    raise FileNotFoundError(f"The model directory {model_save_path} does not exist.")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_save_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_save_path, trust_remote_code=True).to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at /kaggle/working/PubChem10M_SMILES_BPE_450k_model_1_caco2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Load your datasets
train_df = pd.read_csv('/kaggle/input/caco2-set/Train_Caco2.csv')
train_df = train_df[['ID', 'SMILES', 'Permeability']]
test_df = pd.read_csv('/kaggle/input/caco2-set/Test_Caco2.csv')
test_df = test_df[['ID', 'SMILES', 'Permeability']]

In [14]:
train_encodings = tokenizer(list(train_df['SMILES']), truncation=True, padding=True, max_length=325, return_tensors="pt")
test_encodings = tokenizer(list(test_df['SMILES']), truncation=True, padding=True, max_length=325, return_tensors="pt")

In [15]:
from tqdm import tqdm 
batch_size = 16 

def generate_embeddings(encodings, batch_size):
    embeddings = []
    model.eval() 
    with torch.no_grad():
        for i in tqdm(range(0, len(encodings['input_ids']), batch_size), desc="Processing batches"):
            batch = {key: val[i:i + batch_size].to(device) for key, val in encodings.items()}  
            outputs = model(**batch)
            embeddings.append(outputs.last_hidden_state)
    return torch.cat(embeddings, dim=0)


In [16]:
train_embeddings = generate_embeddings(train_encodings, batch_size)
print(train_embeddings.shape)
train_embeddings = torch.mean(train_embeddings, dim=1)
print(train_embeddings.shape)

Processing batches: 100%|██████████| 63/63 [00:06<00:00,  9.66it/s]


torch.Size([1008, 222, 768])
torch.Size([1008, 768])


In [17]:
column_names = [f'x_fine_emb_pubchem{i}' for i in range(train_embeddings.shape[1])]
embeddings_df = pd.DataFrame(data=train_embeddings.cpu().numpy(), columns=column_names)
train_data = pd.concat([train_df, embeddings_df], axis=1)

In [18]:
test_embeddings = generate_embeddings(test_encodings, batch_size)
print(test_embeddings.shape)
test_embeddings = torch.mean(test_embeddings, dim=1)
print(test_embeddings.shape)

Processing batches: 100%|██████████| 16/16 [00:01<00:00,  9.92it/s]

torch.Size([252, 216, 768])
torch.Size([252, 768])





In [19]:
column_names = [f'x_fine_emb_pubchem{i}' for i in range(test_embeddings.shape[1])]
embeddings_df = pd.DataFrame(data=test_embeddings.cpu().numpy(), columns=column_names)
test_data = pd.concat([test_df, embeddings_df], axis=1)

In [20]:
train_data.to_csv("/kaggle/working/Train_PubChem10M_SMILES_BPE_450k_model_1_fine_tuned_embeddings_caco2.csv",index=False)
test_data.to_csv("/kaggle/working/Test_PubChem10M_SMILES_BPE_450k_model_1_fine_tuned_embeddings_caco2.csv",index=False)

In [21]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [24]:
train_data = pd.read_csv("/kaggle/working/Train_PubChem10M_SMILES_BPE_450k_model_1_fine_tuned_embeddings_caco2.csv")
test_data = pd.read_csv("/kaggle/working/Test_PubChem10M_SMILES_BPE_450k_model_1_fine_tuned_embeddings_caco2.csv")

In [25]:
def train_and_test_predict(models, X_train, y_train, X_test, y_test):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    results = {}
    predictions = []  

    for model in models:
        model_name = model.__class__.__name__
        predictions_train = []
        actual_y_train = []

        test_predictions_folds = []

        

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)

            y_pred_fold = model.predict(X_val_fold)
            y_pred_fold = np.clip(y_pred_fold, -10, -3.9)
            predictions_train.extend(y_pred_fold)
            actual_y_train.extend(y_val_fold)

            predictions_test_fold = model.predict(X_test)
            predictions_test_fold = np.clip(predictions_test_fold, -10, -3.9)
            test_predictions_folds.append(predictions_test_fold)


        mse_train = mean_squared_error(actual_y_train, predictions_train)
        mae_train = mean_absolute_error(actual_y_train, predictions_train)
        rmse_train = np.sqrt(mse_train)
        r2_train = r2_score(actual_y_train, predictions_train)
        pearson_train, _ = pearsonr(actual_y_train, predictions_train)
        spearman_train, _ = spearmanr(actual_y_train, predictions_train)


        predictions_test_mean = np.mean(test_predictions_folds, axis=0)
        predictions_test_std = np.std(test_predictions_folds, axis=0)

        mse_test = mean_squared_error(y_test, predictions_test_mean)
        mae_test = mean_absolute_error(y_test, predictions_test_mean)
        rmse_test = np.sqrt(mse_test)
        r2_test = r2_score(y_test, predictions_test_mean)
        print(r2_test)
        pearson_test, _ = pearsonr(y_test, predictions_test_mean)
        spearman_test, _ = spearmanr(y_test, predictions_test_mean)
        
        

        predictions.append({
            'Model': model_name,
            'Y Train pred': predictions_train,
            'Y Test actual': y_test,
            'Test prediction folds': test_predictions_folds,
            'Test Predictions Mean': predictions_test_mean,
            'Test Predictions Std': predictions_test_std,

        })

        results[model_name] = {
            'Train MSE (5 fold cv)': f"{mse_train:.4f}",
            'Train MAE (5 fold cv)': f"{mae_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train RMSE (5 fold cv)': f"{rmse_train:.4f}",
            'Train R2 (5 fold cv)': f"{r2_train:.4f}",
            'Train PCC (5 fold cv)': f"{pearson_train:.4f}",
            'Train SCC (5 fold cv)': f"{spearman_train:.4f}",
            'Test MSE': f"{mse_test:.4f}",
            'Test MAE': f"{mae_test:.4f}",
            'Test RMSE': f"{rmse_test:.4f}",
            'Test R2': f"{r2_test:.4f}",
            'Test Pearson Correlation': f"{pearson_test:.4f}",
            'Test Spearman Correlation': f"{spearman_test:.4f}",
        }

    results_df = pd.DataFrame(results).T
    predictions_df = pd.DataFrame(predictions)

    return results_df, predictions_df



In [26]:
X_train = train_data.drop(['ID','SMILES','Permeability'],axis=1)
y_train = train_data['Permeability']
print("X_train shape: ",X_train.shape)
print("y_train shape: ",y_train.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

X_test = test_data.drop(['ID','SMILES','Permeability'],axis=1)
y_test = test_data['Permeability']
print("X_test shape: ",X_test.shape)
print("y_test shape: ",y_test.shape)
print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
models = [
    lgb.LGBMRegressor(objective='regression',metric='rmse',boosting_type='gbdt',num_leaves=31,learning_rate=0.05,random_state=42),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(n_jobs=-1, random_state=42),
    GradientBoostingRegressor(random_state=42),
    AdaBoostRegressor(random_state=42),
    xgb.XGBRegressor(random_state=42),
    ExtraTreesRegressor(n_jobs=-1, n_estimators=100, random_state=42),
    LinearRegression(), 
    KNeighborsRegressor(n_neighbors=3),
    SVR(),  
    MLPRegressor(random_state=42)
]
result_df, prediction_df = train_and_test_predict(models, X_train,y_train, X_test,  y_test)
result_df

X_train shape:  (1008, 768)
y_train shape:  (1008,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
X_test shape:  (252, 768)
y_test shape:  (252,)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 768
[LightGBM] [Info] Start training from score -6.268840
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 806, number of used features: 768
[LightGBM] [Info] Start training from score -6.267690
[LightGBM] [Info] Auto-choosing col-wise multi-threading, th

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.0589,0.1839,0.2427,0.9028,0.9502,0.9364,0.2554,0.3847,0.5053,0.5587,0.7587,0.7356
DecisionTreeRegressor,0.1342,0.2783,0.3664,0.7786,0.8868,0.8709,0.2724,0.3889,0.5219,0.5294,0.7425,0.714
RandomForestRegressor,0.0626,0.1905,0.2502,0.8967,0.9477,0.9347,0.2566,0.382,0.5065,0.5567,0.7556,0.7319
GradientBoostingRegressor,0.0601,0.1888,0.2451,0.9009,0.9492,0.9363,0.2566,0.381,0.5065,0.5567,0.7594,0.7379
AdaBoostRegressor,0.0675,0.2022,0.2599,0.8886,0.9447,0.9294,0.2573,0.3907,0.5072,0.5554,0.756,0.7279
XGBRegressor,0.0689,0.2003,0.2624,0.8864,0.9415,0.9235,0.2623,0.3873,0.5122,0.5467,0.7497,0.7263
ExtraTreesRegressor,0.059,0.1846,0.2428,0.9028,0.9508,0.9365,0.2543,0.3817,0.5043,0.5605,0.7588,0.7327
LinearRegression,1.7921,1.0047,1.3387,-1.9557,0.415,0.4452,0.6349,0.6016,0.7968,-0.0971,0.5356,0.5542
KNeighborsRegressor,0.084,0.2204,0.2898,0.8615,0.9285,0.9027,0.2418,0.3693,0.4917,0.5822,0.7756,0.7599
SVR,0.0519,0.1736,0.2278,0.9144,0.9569,0.9468,0.2567,0.3943,0.5067,0.5564,0.7528,0.7385


In [27]:
result_df

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.0589,0.1839,0.2427,0.9028,0.9502,0.9364,0.2554,0.3847,0.5053,0.5587,0.7587,0.7356
DecisionTreeRegressor,0.1342,0.2783,0.3664,0.7786,0.8868,0.8709,0.2724,0.3889,0.5219,0.5294,0.7425,0.714
RandomForestRegressor,0.0626,0.1905,0.2502,0.8967,0.9477,0.9347,0.2566,0.382,0.5065,0.5567,0.7556,0.7319
GradientBoostingRegressor,0.0601,0.1888,0.2451,0.9009,0.9492,0.9363,0.2566,0.381,0.5065,0.5567,0.7594,0.7379
AdaBoostRegressor,0.0675,0.2022,0.2599,0.8886,0.9447,0.9294,0.2573,0.3907,0.5072,0.5554,0.756,0.7279
XGBRegressor,0.0689,0.2003,0.2624,0.8864,0.9415,0.9235,0.2623,0.3873,0.5122,0.5467,0.7497,0.7263
ExtraTreesRegressor,0.059,0.1846,0.2428,0.9028,0.9508,0.9365,0.2543,0.3817,0.5043,0.5605,0.7588,0.7327
LinearRegression,1.7921,1.0047,1.3387,-1.9557,0.415,0.4452,0.6349,0.6016,0.7968,-0.0971,0.5356,0.5542
KNeighborsRegressor,0.084,0.2204,0.2898,0.8615,0.9285,0.9027,0.2418,0.3693,0.4917,0.5822,0.7756,0.7599
SVR,0.0519,0.1736,0.2278,0.9144,0.9569,0.9468,0.2567,0.3943,0.5067,0.5564,0.7528,0.7385


In [28]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.88946470664453, -6.741674346212738, -5.689...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.307198690392564, -6.009846887388639, -6.0...","[-6.224978960468546, -6.1186901143714625, -5.9...","[0.06641748983022505, 0.0800822517327156, 0.02..."
1,DecisionTreeRegressor,"[-6.24, -6.68, -5.77, -6.85, -5.85, -5.48, -6....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.96, -6.92, -5.92, -5.92, -6.19, -6.19, -6...","[-6.023999999999999, -6.601999999999999, -6.0,...","[0.14827002394280506, 0.26049184248263896, 0.1..."
2,RandomForestRegressor,"[-6.809485994589998, -6.747925752940001, -5.66...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.291792249100002, -6.03266539254, -5.96749...","[-6.185440052434002, -6.146793212551999, -5.96...","[0.08483427712278474, 0.07571663870713209, 0.0..."
3,GradientBoostingRegressor,"[-6.777830679840646, -6.6825990061861065, -5.7...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.531562089033785, -5.995794537608191, -5.9...","[-6.308012671108061, -6.0827360783652065, -5.9...","[0.12552035422590224, 0.09019827425044218, 0.0..."
4,AdaBoostRegressor,"[-6.771821566404143, -7.060086206896558, -5.74...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.255623082666671, -5.834925373134333, -5.9...","[-6.320234379734061, -6.025062524175468, -5.96...","[0.05344832745908739, 0.17169273598407583, 0.0..."
5,XGBRegressor,"[-7.027199, -6.671399, -5.679867, -6.8658967, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.36648, -6.040941, -5.9818664, -5.8952036,...","[-6.196779, -6.297438, -5.9653788, -5.844796, ...","[0.09715414, 0.22086369, 0.0397732, 0.03707328..."
6,ExtraTreesRegressor,"[-6.838438719640001, -6.827685776809997, -5.69...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.292249387370001, -6.048106902010002, -6.0...","[-6.211253827118, -6.070517575670001, -5.98204...","[0.04251864991970869, 0.0672292665176976, 0.02..."
7,LinearRegression,"[-8.319526332072641, -6.219751157709182, -5.26...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.945484607495458, -7.6043390504951125, -4....","[-6.1052107793218955, -6.6167132177718475, -5....","[0.2669877268459953, 1.0795011052288122, 0.993..."
8,KNeighborsRegressor,"[-7.033333333333334, -7.3999999999999995, -6.2...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.44, -5.98, -5.883333333333333, -5.6766666...","[-6.382666666666666, -5.946666666666666, -5.86...","[0.13156156142446987, 0.05129435749779008, 0.0..."
9,SVR,"[-7.198340504171654, -6.971496010089172, -5.70...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.339095339244995, -5.924362530556767, -6.0...","[-6.2884180171699215, -5.842075319134414, -6.0...","[0.036699224547097, 0.04508241972313802, 0.021..."


In [29]:
result_df.to_csv('/kaggle/working/Results_PubChem10M_SMILES_BPE_450k_model_1_fine_tuned_embeddings_caco2.csv')
prediction_df.to_csv('/kaggle/working/Prediction_data_PubChem10M_SMILES_BPE_450k_model_1_fine_tuned_embeddings_caco2.csv')

In [30]:
result_df

Unnamed: 0,Train MSE (5 fold cv),Train MAE (5 fold cv),Train RMSE (5 fold cv),Train R2 (5 fold cv),Train PCC (5 fold cv),Train SCC (5 fold cv),Test MSE,Test MAE,Test RMSE,Test R2,Test Pearson Correlation,Test Spearman Correlation
LGBMRegressor,0.0589,0.1839,0.2427,0.9028,0.9502,0.9364,0.2554,0.3847,0.5053,0.5587,0.7587,0.7356
DecisionTreeRegressor,0.1342,0.2783,0.3664,0.7786,0.8868,0.8709,0.2724,0.3889,0.5219,0.5294,0.7425,0.714
RandomForestRegressor,0.0626,0.1905,0.2502,0.8967,0.9477,0.9347,0.2566,0.382,0.5065,0.5567,0.7556,0.7319
GradientBoostingRegressor,0.0601,0.1888,0.2451,0.9009,0.9492,0.9363,0.2566,0.381,0.5065,0.5567,0.7594,0.7379
AdaBoostRegressor,0.0675,0.2022,0.2599,0.8886,0.9447,0.9294,0.2573,0.3907,0.5072,0.5554,0.756,0.7279
XGBRegressor,0.0689,0.2003,0.2624,0.8864,0.9415,0.9235,0.2623,0.3873,0.5122,0.5467,0.7497,0.7263
ExtraTreesRegressor,0.059,0.1846,0.2428,0.9028,0.9508,0.9365,0.2543,0.3817,0.5043,0.5605,0.7588,0.7327
LinearRegression,1.7921,1.0047,1.3387,-1.9557,0.415,0.4452,0.6349,0.6016,0.7968,-0.0971,0.5356,0.5542
KNeighborsRegressor,0.084,0.2204,0.2898,0.8615,0.9285,0.9027,0.2418,0.3693,0.4917,0.5822,0.7756,0.7599
SVR,0.0519,0.1736,0.2278,0.9144,0.9569,0.9468,0.2567,0.3943,0.5067,0.5564,0.7528,0.7385


In [31]:
prediction_df

Unnamed: 0,Model,Y Train pred,Y Test actual,Test prediction folds,Test Predictions Mean,Test Predictions Std
0,LGBMRegressor,"[-6.88946470664453, -6.741674346212738, -5.689...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.307198690392564, -6.009846887388639, -6.0...","[-6.224978960468546, -6.1186901143714625, -5.9...","[0.06641748983022505, 0.0800822517327156, 0.02..."
1,DecisionTreeRegressor,"[-6.24, -6.68, -5.77, -6.85, -5.85, -5.48, -6....",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.96, -6.92, -5.92, -5.92, -6.19, -6.19, -6...","[-6.023999999999999, -6.601999999999999, -6.0,...","[0.14827002394280506, 0.26049184248263896, 0.1..."
2,RandomForestRegressor,"[-6.809485994589998, -6.747925752940001, -5.66...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.291792249100002, -6.03266539254, -5.96749...","[-6.185440052434002, -6.146793212551999, -5.96...","[0.08483427712278474, 0.07571663870713209, 0.0..."
3,GradientBoostingRegressor,"[-6.777830679840646, -6.6825990061861065, -5.7...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.531562089033785, -5.995794537608191, -5.9...","[-6.308012671108061, -6.0827360783652065, -5.9...","[0.12552035422590224, 0.09019827425044218, 0.0..."
4,AdaBoostRegressor,"[-6.771821566404143, -7.060086206896558, -5.74...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.255623082666671, -5.834925373134333, -5.9...","[-6.320234379734061, -6.025062524175468, -5.96...","[0.05344832745908739, 0.17169273598407583, 0.0..."
5,XGBRegressor,"[-7.027199, -6.671399, -5.679867, -6.8658967, ...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.36648, -6.040941, -5.9818664, -5.8952036,...","[-6.196779, -6.297438, -5.9653788, -5.844796, ...","[0.09715414, 0.22086369, 0.0397732, 0.03707328..."
6,ExtraTreesRegressor,"[-6.838438719640001, -6.827685776809997, -5.69...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.292249387370001, -6.048106902010002, -6.0...","[-6.211253827118, -6.070517575670001, -5.98204...","[0.04251864991970869, 0.0672292665176976, 0.02..."
7,LinearRegression,"[-8.319526332072641, -6.219751157709182, -5.26...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-5.945484607495458, -7.6043390504951125, -4....","[-6.1052107793218955, -6.6167132177718475, -5....","[0.2669877268459953, 1.0795011052288122, 0.993..."
8,KNeighborsRegressor,"[-7.033333333333334, -7.3999999999999995, -6.2...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.44, -5.98, -5.883333333333333, -5.6766666...","[-6.382666666666666, -5.946666666666666, -5.86...","[0.13156156142446987, 0.05129435749779008, 0.0..."
9,SVR,"[-7.198340504171654, -6.971496010089172, -5.70...",0 -7.19 1 -6.21 2 -7.24 3 -5.8...,"[[-6.339095339244995, -5.924362530556767, -6.0...","[-6.2884180171699215, -5.842075319134414, -6.0...","[0.036699224547097, 0.04508241972313802, 0.021..."
