**Homework 9:**



# Setup

First, let's import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated, so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20.

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) numerically. A plain string comparison is lexicographic,
# so e.g. "0.9" >= "0.20" would wrongly pass. The regex reads only the leading
# "<major>.<minor>" digits, so suffixes such as ".dev0" or "rc1" are ignored.
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version is not None
assert tuple(int(part) for part in _sklearn_version.groups()) >= (0, 20)

# Common imports
import numpy as np
import pandas as pd
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Figures are written beneath <project root>/images/<chapter id>
PROJECT_ROOT_DIR, CHAPTER_ID = ".", "HW9a"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
if not os.path.isdir(IMAGES_PATH):
    # Create the directory tree on first use
    os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: Base file name (without extension); also echoed in the log line.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as ``format``.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(
        os.path.join(IMAGES_PATH, file_name),
        format=fig_extension,
        dpi=resolution,
    )

In [2]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import DataStructs
from rdkit.DataStructs import ConvertToNumpyArray

from rdkit.Chem import PandasTools
import openchem

# Load the data

In [3]:
# Read esol.csv from ./data/raw into a DataFrame and preview its first rows
datapath = os.path.join(".", "data/raw/esol.csv")
esol_data = pd.read_csv(datapath)
esol_data.head()

Unnamed: 0,cano_smiles,activity,group
0,Cc1occc1C(=O)Nc1ccccc1,-3.3,train
1,CC(C)=CCCC(C)=CC=O,-2.06,train
2,c1ccc2c(c1)ccc1c2ccc2c3ccccc3ccc21,-7.87,train
3,c1ccsc1,-1.33,train
4,c1ccc2scnc2c1,-1.5,train


## Data preprocessing

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
esol_data.describe()

Unnamed: 0,activity
count,1127.0
mean,-3.05201
std,2.096392
min,-11.6
25%,-4.321
50%,-2.86
75%,-1.6
max,1.58


In [5]:
# Print the column names, non-null counts, dtypes and memory usage of the frame
esol_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1127 entries, 0 to 1126
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   cano_smiles  1127 non-null   object 
 1   activity     1127 non-null   float64
 2   group        1127 non-null   object 
dtypes: float64(1), object(2)
memory usage: 26.5+ KB


In [12]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Hugging Face checkpoint id shared by the tokenizer and the masked-LM model
CHECKPOINT = "seyonec/PubChem10M_SMILES_BPE_240k"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

# Load the model and freeze all of its parameters (requires_grad=False on each)
model = AutoModelForMaskedLM.from_pretrained(CHECKPOINT).requires_grad_(False)

In [13]:
def _first_position_logits(smiles):
    """Run one SMILES string through the tokenizer and model.

    Returns the "logits" output for the first batch item at sequence position 0
    (a 1-D tensor over the last axis).
    """
    encoded = tokenizer(smiles, return_tensors="pt", padding=True)
    outputs = model(**encoded)
    return outputs["logits"][0, 0, :]


# One tensor per molecule, stored in a new "embedding" column
esol_data["embedding"] = esol_data["cano_smiles"].map(_first_position_logits)

In [14]:
# Partition the rows using the labels stored in the "group" column
data_train, data_valid, data_test = (
    esol_data[esol_data["group"] == split_name]
    for split_name in ("train", "valid", "test")
)

In [15]:

import torch


def _features_and_targets(frame):
    """Return (X, y) for one split.

    X stacks the per-row tensors of the "embedding" column into a NumPy array;
    y is the "activity" column as a NumPy array.
    """
    features = torch.stack(frame["embedding"].tolist()).numpy()
    targets = frame["activity"].values
    return features, targets


X_train, y_train = _features_and_targets(data_train)
X_valid, y_valid = _features_and_targets(data_valid)
X_test, y_test = _features_and_targets(data_test)

### XGBoost

In [16]:
from xgboost import XGBRegressor

# Fit a gradient-boosted regressor with a fixed seed on the training split.
# `model` is the name the evaluation cells read; fit() returns the estimator
# itself, so `model` and `xgb` refer to the same fitted object.
xgb = XGBRegressor(random_state=42)
xgb.fit(X_train, y_train)
model = xgb

In [17]:
from sklearn.metrics import mean_squared_error

# RMSE of `model` on the training split: square root of the mean squared error
# between the true activities and the model's predictions.
y_train_pred = model.predict(X_train)
xgb_mse = mean_squared_error(y_train, y_train_pred)
xgb_rmse = np.sqrt(xgb_mse)
print("train RMSE ", xgb_rmse)

train RMSE  0.02479203800699789


In [18]:
# RMSE of `model` on the validation split; xgb_mse / xgb_rmse are overwritten
# here, so they no longer hold the training-split values afterwards.
y_valid_pred = model.predict(X_valid)
xgb_mse = mean_squared_error(y_valid, y_valid_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_rmse

1.2873312363328144

In [19]:
# RMSE of `model` on the test split; xgb_mse / xgb_rmse are overwritten again
# and end up holding the test-split values.
y_test_pred = model.predict(X_test)
xgb_mse = mean_squared_error(y_test, y_test_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_rmse

1.252281500058468

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest explicitly, matching the XGBoost cell's random_state=42.
# Without it the estimator draws from NumPy's global generator, so the fit
# depends on how much randomness earlier cells consumed and changes whenever
# this cell is re-run.
# NOTE(review): `lin` is a misleading name for a random forest; it is kept in
# case other cells reference it.
lin = RandomForestRegressor(random_state=42)
lin.fit(X_train, y_train)
model = lin

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate the random forest held in `model` on every split.
# The scores are stored under rf_* names: the copy-pasted xgb_mse / xgb_rmse
# names mislabelled them and overwrote the XGBoost results computed earlier.

y_train_pred = model.predict(X_train)
rf_train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("train RMSE ", rf_train_rmse)

y_valid_pred = model.predict(X_valid)
rf_valid_rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
print("valid RMSE ", rf_valid_rmse)

y_test_pred = model.predict(X_test)
rf_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("test RMSE ", rf_test_rmse)

### Dense NN

In [None]:
import torch
import torch.nn as nn

# Seed PyTorch so the random weight initialisation below is repeatable;
# np.random.seed() does not affect torch's generator.
torch.manual_seed(42)

# Take the input width from the feature matrix instead of hard-coding it
# (previously the literal 767), so the network always matches the features.
n_features = X_train.shape[1]

# Three-layer fully connected regressor: n_features -> 384 -> 190 -> 1
nn_dense = nn.Sequential(
    # NOTE(review): this ReLU is applied directly to the input features and
    # zeroes every negative value before the first linear layer — confirm intended.
    nn.ReLU(),
    nn.Linear(n_features, 384),
    nn.ReLU(),
    nn.Linear(384, 190),
    nn.ReLU(),
    nn.Linear(190, 1)
)

In [None]:
import torch
from tqdm import tqdm  # NOTE(review): unused in this cell; kept in case other cells rely on it

reg_model = nn_dense
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(reg_model.parameters(), lr=1e-3)
n_epoches = 10

# float32 tensors aligned row-for-row with X_train / y_train and X_valid / y_valid.
# Labels come from the positional y_* arrays: looking them up with
# data_train["activity"][idx] selects by the DataFrame's index labels, which do
# not match the positional rows of X_train once the frame has been filtered.
train_features = torch.as_tensor(X_train, dtype=torch.float32)
train_labels = torch.as_tensor(y_train, dtype=torch.float32)
valid_features = torch.as_tensor(X_valid, dtype=torch.float32)
valid_labels = torch.as_tensor(y_valid, dtype=torch.float32)

# Shuffled mini-batches of row positions over the WHOLE training split
# (previously limited to the first 100 rows).
index_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(torch.arange(len(train_features))),
    batch_size=8,
    shuffle=True,
)

for epoch in range(n_epoches):
    reg_model.train()
    summed_loss = 0.0
    for (train_idx,) in index_loader:
        optimizer.zero_grad()
        pred = reg_model(train_features[train_idx])
        label = train_labels[train_idx]
        loss = loss_fn(pred.reshape(-1), label.reshape(-1))
        loss.backward()
        optimizer.step()
        # Weight each batch mean by its size so the epoch figure is a true
        # per-sample average even when the last batch is smaller.
        summed_loss += loss.item() * len(train_idx)
    train_loss = summed_loss / len(train_features)

    # Validation pass: no gradients, labels cast to float32 like the predictions
    reg_model.eval()
    with torch.no_grad():
        pred = reg_model(valid_features)
        val_loss = loss_fn(pred.reshape(-1), valid_labels.reshape(-1)).item()
    print("train loss:", train_loss, "valid loss:", val_loss)