**Homework 9:**



# Setup

First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20.

In [None]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5), "Python 3.5 or later is required"

# Scikit-Learn ≥0.20 is required.
# The check compares the numeric (major, minor) pair: comparing the raw version
# strings is lexicographic, so e.g. "0.9" >= "0.20" would wrongly evaluate to True.
import sklearn
_sklearn_version_match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version_match is not None, (
    "Unrecognised scikit-learn version string: %r" % (sklearn.__version__,)
)
assert tuple(int(part) for part in _sklearn_version_match.groups()) >= (0, 20), (
    "Scikit-Learn 0.20 or later is required, found %s" % (sklearn.__version__,)
)

# Common imports
import numpy as np
import pandas as pd
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current Matplotlib figure to a file inside IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        File name without extension; also echoed in the progress message.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str
        Used both as the file extension and as the ``format`` given to savefig.
    resolution : int
        Passed to savefig as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

In [None]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import DataStructs
from rdkit.DataStructs import ConvertToNumpyArray

from rdkit.Chem import PandasTools

# Load the data

In [None]:
# Read esol.csv from the notebook's working directory and preview the first rows
datapath = os.path.join(".", "esol.csv")
esol_data = pd.read_csv(filepath_or_buffer=datapath)
esol_data.head(n=5)

## Data preprocessing

In [None]:
esol_data.describe()

In [None]:
#Generate data exploration
esol_data.info()

## Molecule generation

In [None]:
# Build an RDKit Mol object from each SMILES string in 'cano_smiles'
# and store it in a new 'Molecule' column of the same frame
PandasTools.AddMoleculeColumnToFrame(esol_data, smilesCol='cano_smiles', molCol='Molecule')

In [None]:
# Create Mol rdkit objects for each smile (option)
#esol_data["Molecule"] = [Chem.MolFromSmiles(x) for x in esol_data["cano_smiles"]]

There may be a few SMILES strings that could not be parsed by RDKit. For an invalid SMILES string, RDKit returns `None` instead of a molecule.

In [None]:
#sum(esol_data['Molecule'].map(lambda x: x is None))

In [None]:
esol_data.head(1)

In [None]:
#Generate data exploration
esol_data.info()

### Morgan Fingerprint generation
A fingerprint is generated for each compound in the "Molecule" column with a radius of 2 and a bit length of 2048.

In [None]:
# Create a column of Morgan fingerprints: one bit vector per molecule,
# radius FP_RADIUS, length n_Bits. useFeatures=True is forwarded to RDKit as in
# the original call (feature-based atom invariants per the RDKit docs - confirm
# this is the intended fingerprint flavour).
FP_RADIUS = 2
n_Bits = 2048

# Rows whose SMILES failed to parse hold None in 'Molecule'; RDKit would reject
# them with an opaque argument error, so stop here with a readable message.
_n_unparsed = int(esol_data["Molecule"].isnull().sum())
if _n_unparsed:
    raise ValueError(
        "{} molecule(s) could not be parsed from 'cano_smiles'; "
        "drop or fix those rows before generating fingerprints".format(_n_unparsed)
    )

esol_data["MorgFP"] = [
    AllChem.GetMorganFingerprintAsBitVect(m, FP_RADIUS, nBits=n_Bits, useFeatures=True)
    for m in esol_data["Molecule"]
]

In [None]:
esol_data["MorgFP"].head()

In [None]:
type(esol_data["MorgFP"])

In [None]:
esol_data.info()

### Splitting and Visualizing the data

In [None]:
# Select every row containing at least one null value, then display all of them
_has_null = esol_data.isnull().any(axis="columns")
sample_incomplete_rows = esol_data.loc[_has_null]
sample_incomplete_rows.head(len(sample_incomplete_rows))

In [None]:
# Number of unique SMILES in the column; compare with the total row count below to detect duplicates
esol_data['cano_smiles'].nunique()

In [None]:
#Returns number of total rows
esol_data['cano_smiles'].count()

In [None]:
# Keep only the first occurrence of each canonical SMILES string
esol_data = esol_data.drop_duplicates(subset=['cano_smiles'], keep='first')

In [None]:
esol_data.info()

In [None]:
# Training split: rows whose "group" column equals "train"; show its size
train = esol_data.loc[esol_data["group"].eq("train")]
train.shape[0]

In [None]:
# Test split: rows whose "group" column equals "test"; show its size
test = esol_data.loc[esol_data["group"].eq("test")]
test.shape[0]

In [None]:
# Validation split: rows whose "group" column equals "valid"; show its size
valid = esol_data.loc[esol_data["group"].eq("valid")]
valid.shape[0]

In [None]:
# Training data: fingerprints are the features, "activity" is the target
X_train, y_train = train["MorgFP"].values, train["activity"].values

In [None]:
# Test data: fingerprints are the features, "activity" is the target
X_test, y_test = test["MorgFP"].values, test["activity"].values

In [None]:
# Validation data: fingerprints are the features, "activity" is the target
X_valid, y_valid = valid["MorgFP"].values, valid["activity"].values

**Convert the RDKit explicit vectors into numpy arrays**

In [None]:
def convert_to_Numpy(column):
    """Convert an iterable of RDKit fingerprints into a list of NumPy arrays.

    Every fingerprint is passed to ``DataStructs.ConvertToNumpyArray`` together
    with a freshly allocated one-element float array as the destination; the
    filled destination arrays are returned in input order.

    NOTE(review): this relies on RDKit resizing the destination array to the
    fingerprint length - confirm against the installed RDKit version.
    """
    def _to_array(fingerprint):
        destination = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fingerprint, destination)
        return destination

    return [_to_array(fingerprint) for fingerprint in column]

In [None]:
# Convert the fingerprint objects of each split into lists of NumPy arrays.
# Not idempotent: the inputs are overwritten, so run this cell once per kernel.
X_train, X_test, X_valid = (
    convert_to_Numpy(fingerprints) for fingerprints in (X_train, X_test, X_valid)
)

In [None]:
# Stack each list of per-molecule arrays into a single ndarray
X_train, X_test, X_valid = (
    np.array(rows) for rows in (X_train, X_test, X_valid)
)

In [None]:
# Sanity check: report the container type of every feature / target array
for _split_array in (X_train, y_train, X_test, y_test, X_valid, y_valid):
    print(type(_split_array))

In [None]:
X_train

In [None]:
y_train

In [None]:
X_test

In [None]:
y_test

In [None]:
X_valid

In [None]:
y_valid

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same transform to
# all three splits. Not idempotent: the inputs are overwritten in place.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_valid, X_test = (
    sc.transform(features) for features in (X_train, X_valid, X_test)
)

## XGBoost Regressor

In [None]:
from xgboost import XGBRegressor

# Baseline: XGBoost regressor with library defaults and a fixed random seed.
# `model` is bound to the value returned by fit().
xgb = XGBRegressor(random_state=42)
model = xgb.fit(X=X_train, y=y_train)


In [None]:
model.get_params()

### Training dataset

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of the baseline model on the training split
y_train_pred = model.predict(X_train)
xgb_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_rmse

### Validation dataset

In [None]:
# RMSE of the baseline model on the validation split
y_valid_pred = model.predict(X_valid)
xgb_mse = mean_squared_error(y_true=y_valid, y_pred=y_valid_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_rmse

### Test dataset

In [None]:
# RMSE of the baseline model on the test split
y_test_pred = model.predict(X_test)
xgb_mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_rmse

# Grid search:

In [None]:
# Hyper-parameter search space, one entry per XGBRegressor keyword argument.
# Candidate counts: 10 * 10 * 9 * 9 * 9 * 7 * 24 = 12,247,200 combinations,
# so an exhaustive sweep means that many model fits.
param_grid = dict(
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
    gamma=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
    min_child_weight=range(1, 10),           # 1 .. 9
    subsample=np.arange(0.1, 1.0, 0.1),      # 0.1 .. 0.9
    colsample_bytree=np.arange(0.1, 1.0, 0.1),
    max_depth=range(3, 10),                  # 3 .. 9
    n_estimators=range(200, 5000, 200),      # 200 .. 4800
)

In [None]:
from sklearn.model_selection import ParameterGrid

# Enumerate every combination of the candidate values in param_grid
grid = ParameterGrid(param_grid=param_grid)

In [None]:
len(grid)

In [None]:
from tqdm import tqdm
from sklearn.metrics import mean_squared_error

# Manual grid search: fit one XGBRegressor per parameter combination on the
# training split and keep the one with the lowest RMSE on the validation split.
best_model, best_rmse, best_param = None, np.inf, {}
for this_param in tqdm(grid):
    this_model = XGBRegressor(random_state=42, **this_param)
    this_model.fit(X_train, y_train)
    xgb_mse = mean_squared_error(y_valid, this_model.predict(X_valid))
    xgb_rmse = np.sqrt(xgb_mse)
    # Only a strictly better score replaces the current best
    if not best_rmse > xgb_rmse:
        continue
    best_model, best_rmse, best_param = this_model, xgb_rmse, this_param
        

In [None]:
print(best_param)

### Training dataset

In [None]:
y_train_pred = best_model.predict(X_train)

In [None]:
# RMSE of the selected model on the training split
xgb_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_rmse

### Validation dataset

In [None]:
y_valid_pred =best_model.predict(X_valid)

In [None]:
# RMSE of the selected model on the validation split
xgb_mse = mean_squared_error(y_true=y_valid, y_pred=y_valid_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_rmse

### Test dataset

In [None]:
y_test_pred = best_model.predict(X_test)

In [None]:
# RMSE of the selected model on the test split
xgb_mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_rmse

In [None]:
print("Done")