In [None]:
import sklearn
import sklearn.linear_model
import time
import seaborn as sb
import matplotlib.pyplot as plt 
%matplotlib inline
import numpy as np
import pandas as pd
import shap
import xgboost

from numpy import arange
from pandas import read_csv
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn_genetic import GASearchCV

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
#from scikeras.wrappers import KerasRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense, Dropout
from keras.regularizers import l2
from mapie.regression import MapieRegressor

import pickle


## Deep Neural Network 

In [None]:
# Load the training set from an Excel workbook; the path is relative, so the
# file must sit in the notebook's working directory.
rocky = pd.read_excel('FINAL-trainingset.xlsx')

In [None]:
# Peek at the first three column labels of the raw frame.
rocky.columns[0:3]

In [None]:
# Strip the ID / name / SMILES columns together with the two property columns
# that are not modelled in this run; 'Density' stays in the frame as the target.
rocky = rocky.drop(columns=[
    'ID', 'Molecule_Name', 'SMILES',
    'Surface Tension',
    'Viscosity',
])
#rocky = rocky.drop(columns=['Density'])
print(rocky)

In [None]:
# (rows, columns) of the frame at this point in the notebook.
rocky.shape

In [None]:
# Keep only the rows that have a Density value to learn from.
rocky1 = rocky[rocky['Density'].notna()]
#rocky1 = rocky.dropna(axis=0, subset=['Viscosity'])
#rocky1 = rocky.dropna(axis=0, subset=['Surface Tension'])

In [None]:
# Show the first rows of the frame after removing rows without a Density value.
rocky1.head()

In [None]:
# Separate the regression target from the feature matrix.
d_y = rocky1['Density']
#d_y = rocky1['Viscosity']
#d_y = rocky1['Surface Tension']
d_X = rocky1.drop('Density', axis=1)
#d_X = rocky1.drop(columns=['Viscosity'])
#d_X = rocky1.drop(columns=['Surface Tension'])


In [None]:
# Hold out a test set first, then carve a validation set out of what remains.
# Both splits use the default 75/25 proportion. A fixed random_state makes the
# partitions (and every metric computed from them) identical across
# Restart-and-Run-All executions.
X_train_full, X_test, y_train_full, y_test = train_test_split(d_X, d_y, random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, random_state=42)

In [None]:
# Learn the standardisation statistics from the training split only, then
# apply that same transformation to all three splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

## Keras-Tuner

In [None]:
import tensorflow as tf
from tensorflow import keras
from kerastuner.tuners import RandomSearch

In [None]:
import numpy as np
import keras_tuner as kt
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


# Number of input features, taken from the scaled training matrix so the
# network always matches the data actually loaded (no hard-coded width).
input_dim = X_train.shape[1]

# Define the model architecture using a function
def build_model(hp):
    """Build and compile a dense regression network for one KerasTuner trial.

    The search space covers the width and activation of two hidden layers,
    an optional third hidden layer, the dropout rate applied before the
    output layer, the optimizer class and its learning rate.

    Args:
        hp: KerasTuner hyperparameter container used to sample the values.

    Returns:
        A compiled Sequential model with a single linear output unit,
        trained on mean squared error.
    """
    model = Sequential()
    model.add(Dense(units=hp.Int('units1', min_value=8, max_value=256, step=8),
                    activation=hp.Choice('activation1', values=['relu', 'tanh']),
                    input_dim=input_dim))
    model.add(Dense(units=hp.Int('units2', min_value=8, max_value=256, step=8),
                    activation=hp.Choice('activation2', values=['relu', 'tanh'])))
    # The third hidden layer only exists when the tuner switches it on.
    if hp.Boolean('add_layer3'):
        model.add(Dense(units=hp.Int('units3', min_value=8, max_value=256, step=8),
                        activation=hp.Choice('activation3', values=['relu', 'tanh'])))
    model.add(Dropout(rate=hp.Float('dropout', 0.0, 0.5, step=0.1)))
    model.add(Dense(1, activation='linear'))  # Output layer for regression

    # Compile model: the optimizer is resolved from its class name so that the
    # optimizer type itself can be a tunable choice.
    model.compile(
        optimizer=keras.optimizers.get({
            "class_name": hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd', 'adagrad', 'adamax', 'Nadam']),
            "config": {"learning_rate": hp.Choice('learning_rate', [1e-1, 1e-2, 1e-3, 1e-4, 1e-5])}
        }),
        loss='mean_squared_error',
        metrics=[keras.metrics.MeanSquaredError()]
    )

    return model

# Custom callback to calculate R-squared
class R2Callback(keras.callbacks.Callback):
    """Report R-squared on the training and validation data after each epoch.

    Args:
        train_data: ``(X, y)`` pair used for the training-set score.
        val_data: ``(X, y)`` pair used for the validation-set score.
        **kwargs: Forwarded to ``keras.callbacks.Callback``.
    """

    def __init__(self, train_data, val_data, **kwargs):
        super().__init__(**kwargs)
        self.train_data = train_data
        self.val_data = val_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute both R-squared scores, store them in ``logs`` and print them."""
        # `logs` may legitimately be None (it is the declared default); fall
        # back to a throwaway dict instead of failing on item assignment.
        if logs is None:
            logs = {}

        X_train, y_train = self.train_data
        X_val, y_val = self.val_data

        # verbose=0 keeps the two extra predict passes from printing their own
        # progress bars on every epoch.
        y_pred_train = self.model.predict(X_train, verbose=0)
        r2_train = r2_score(y_train, y_pred_train)
        logs['r2_train'] = r2_train

        y_pred_val = self.model.predict(X_val, verbose=0)
        r2_val = r2_score(y_val, y_pred_val)
        logs['r2_val'] = r2_val

        print(f" - r2_train: {r2_train:.4f} - r2_val: {r2_val:.4f}")

# Instantiate the tuner.
# - seed: makes the sampled configurations repeatable between runs.
# - project_name: KerasTuner stores trials on disk and resumes from an existing
#   project folder; a target-specific name prevents a later run for another
#   property from silently reusing the trials of this one. Rename it when the
#   target column is switched.
tuner = kt.Hyperband(
    build_model,
    objective='val_loss',
    max_epochs=100,
    factor=3,
    hyperband_iterations=10,
    seed=42,
    project_name='density_dnn',
)

# Early stopping callback to avoid overfitting
stop_early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

# Start the tuner. No `epochs` argument is passed: Hyperband assigns the epoch
# budget of every trial itself (bounded by the tuner's `max_epochs`), so a
# value given here would be overridden and only mislead the reader.
tuner.search(X_train, y_train, validation_data=(X_valid, y_valid), callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Human-readable report of the winning configuration.
summary = f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units1')}, the optimal activation is {best_hps.get('activation1')},
the optimal number of units in the second densely-connected layer is {best_hps.get('units2')},
the optimal activation for the second layer is {best_hps.get('activation2')},
"""

# Report the third layer only when the winning model actually contains it.
# The presence of 'units3' among the values is not a reliable signal: once the
# parameter has been registered by any trial it can carry a value even for
# configurations where 'add_layer3' is False.
if best_hps.get('add_layer3'):
    summary += f"the optimal number of units in the third densely-connected layer is {best_hps.get('units3')}, the optimal activation for the third layer is {best_hps.get('activation3')},\n"

if 'dropout' in best_hps.values:
    summary += f"the optimal dropout rate is {best_hps.get('dropout')},\n"

summary += f"the optimal optimizer is {best_hps.get('optimizer')}, and the optimal learning rate for the optimizer is {best_hps.get('learning_rate')}.\n"

print(summary)

# Re-create the network from the winning hyperparameters (fresh weights).
model = tuner.hypermodel.build(best_hps)

# Fit it on the training split, logging R-squared after every epoch and
# stopping early when the validation loss stalls.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    validation_data=(X_valid, y_valid),
    callbacks=[
        stop_early,
        R2Callback(train_data=(X_train, y_train), val_data=(X_valid, y_valid)),
    ],
)

In [None]:
dy_pred_dnn = model.predict(X_test)

# evaluate the model on test set
r2_density_dnn = sklearn.metrics.r2_score(y_test, dy_pred_dnn)
print('R-squared on Test Set: %0.2f' %r2_density_dnn)

# RMSE is taken as the square root of the MSE explicitly; the `squared=False`
# shortcut of mean_squared_error is deprecated and absent from recent
# scikit-learn releases.
RMSE_test_density_dnn = np.sqrt(sklearn.metrics.mean_squared_error(y_test, dy_pred_dnn))
print('RMSE on Test Set: %0.2f' %RMSE_test_density_dnn)

## Save Models

In [None]:
# Persist the trained Keras model (`model`) in HDF5 format.
model_path = 'density_best_model.h5'
model.save(model_path)

# Also pickle the path string itself, so the prediction section can look up
# where the model file lives.
with open('density_model_path.pkl', 'wb') as path_file:
    path_file.write(pickle.dumps(model_path))

In [None]:
# # Assume `model` is your trained Keras model
# model_path = 'viscosity_best_model.h5'

# # Save the Keras model
# model.save(model_path)

# # Save the model path using pickle
# with open('viscosity_model_path.pkl', 'wb') as f:
#     pickle.dump(model_path, f)

In [None]:
# # Assume `model` is your trained Keras model
# model_path = 'surface_best_model.h5'

# # Save the Keras model
# model.save(model_path)

# # Save the model path using pickle
# with open('surface_model_path.pkl', 'wb') as f:
#     pickle.dump(model_path, f)

## Basic

In [None]:
# Baseline network: two hidden ReLU layers of 64 units and one linear output.
# NOTE: this rebinds `model`, replacing the tuned network in the namespace.
model = keras.models.Sequential()
model.add(keras.layers.Dense(64, activation='relu', input_shape=X_train.shape[1:]))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dense(1))


In [None]:
#opt = keras.optimizers.adam(learning_rate=0.001)
# Adam with its default settings, mean squared error as the objective.
model.compile(optimizer='adam', loss='mean_squared_error')
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=100,
)


In [None]:
# The model was compiled with only a mean-squared-error loss and no extra
# metrics, so evaluate() yields that loss on the test split.
mse_test = model.evaluate(X_test, y_test)
print(f"Test MSE: {mse_test}")



In [None]:
dropout_rate = 0.5
def build_model(input_shape=None):
    """Build and compile the fixed five-hidden-layer regression network.

    Args:
        input_shape: Shape of one input sample. When omitted it is taken from
            the scaled training matrix (``X_train.shape[1:]``), so the network
            always matches the data instead of a hard-coded feature count.

    Returns:
        A compiled Sequential model (Adam optimizer, MSE loss, MAE metric).
        Dropout layers use the module-level ``dropout_rate``.
    """
    if input_shape is None:
        input_shape = X_train.shape[1:]
    model = keras.models.Sequential([
        keras.layers.Dense(512, activation='relu', input_shape=input_shape),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dropout(dropout_rate),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dropout(dropout_rate),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(dropout_rate),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(1)
        ])
    model.compile(optimizer="adam", loss='mse', metrics=['mae'])
    return model

In [None]:
# Manual k-fold cross-validation over the training split.
# Folds are contiguous, equally sized slices; because the fold size uses
# integer division, the trailing `len(X_train) % k` rows are never validated on.
k = 4
num_val_samples = len(X_train) // k
num_epochs = 100
all_scores = []
all_r2_scores = []

for i in range(k):
    print(f"Processing Fold #{i}")
    fold_start = i * num_val_samples
    fold_stop = (i + 1) * num_val_samples

    # Slice i is held out for validation ...
    val_data = X_train[fold_start:fold_stop]
    val_targets = y_train[fold_start:fold_stop]

    # ... and everything before and after it is used for fitting.
    partial_train_data = np.concatenate(
        [X_train[:fold_start], X_train[fold_stop:]], axis=0)
    partial_train_targets = np.concatenate(
        [y_train[:fold_start], y_train[fold_stop:]], axis=0)

    # A freshly initialised network per fold.
    model = build_model()
    model.fit(partial_train_data, partial_train_targets,
              epochs=num_epochs, batch_size=16, verbose=0)

    val_mse, val_mae = model.evaluate(val_data, val_targets, verbose=0)
    all_scores.append(val_mae)

    predictions = model.predict(val_data)
    r2 = r2_score(val_targets, predictions)
    all_r2_scores.append(r2)



In [None]:
# Per-fold scores followed by their mean, first for MAE and then for R^2.
for label, fold_scores in (("All MAE scores:", all_scores),
                           ("All R^2 scores:", all_r2_scores)):
    print(label, fold_scores)
    print(np.mean(fold_scores))

In [None]:
def build_model(n_hidden=1, n_neurons=30, learning_rate=3e-3, input_shape=X_train.shape[1:]):
    """Build a configurable dense regressor compiled with SGD and MSE loss.

    Args:
        n_hidden: Number of hidden ReLU layers.
        n_neurons: Units in each hidden layer.
        learning_rate: Learning rate handed to the SGD optimizer.
        input_shape: Shape of one input sample; the default is captured from
            ``X_train`` at the moment this function is defined.

    Returns:
        The compiled Sequential model.
    """
    stack = [keras.layers.InputLayer(input_shape=input_shape)]
    stack.extend(
        keras.layers.Dense(n_neurons, activation="relu") for _ in range(n_hidden)
    )
    stack.append(keras.layers.Dense(1))

    model = keras.models.Sequential(stack)
    model.compile(
        loss="mse",
        optimizer=keras.optimizers.SGD(learning_rate=learning_rate),
    )
    return model


In [None]:
# The legacy scikit-learn wrapper lives in `tf.keras.wrappers.scikit_learn`;
# there is no `scikit` submodule, so the previous spelling raised AttributeError.
# NOTE(review): this wrapper is only shipped with older TensorFlow releases; on
# newer ones the scikeras KerasRegressor (see the commented-out import in the
# first cell) is the replacement -- confirm which version this notebook targets.
keras_reg = tf.keras.wrappers.scikit_learn.KerasRegressor(build_model)

In [None]:
# Fit the wrapped model; EarlyStopping uses its default monitored quantity.
keras_reg.fit(X_train, y_train, epochs=100, validation_data = (X_valid,y_valid), 
              callbacks = [keras.callbacks.EarlyStopping(patience=10)])
# NOTE(review): the legacy wrapper's score() is believed to return the negated
# loss -- confirm the sign before reporting this value as an MSE.
mse_test_reg = keras_reg.score(X_test,y_test)
# `X_new` was never defined anywhere in the notebook. A few rows of the
# (already scaled) test set stand in for unseen samples here; substitute real
# new inputs, transformed with the same `scaler`, when they are available.
X_new = X_test[:3]
y_pred_reg = keras_reg.predict(X_new)

# MAPIE Module

In [None]:
# Define mapie regressor
# NOTE(review): `model` is whatever Keras network was built last in the cells
# above. MAPIE is designed around scikit-learn style estimators; confirm that
# a raw Keras model is accepted here, or pass a scikit-learn compatible
# wrapper instead.
mapie = MapieRegressor(estimator = model, # Prediction Model to use
                       n_jobs = -1,
                       agg_function = "median",
                       random_state = 42)

# Fit mapie regressor on training data.
# The notebook's split variables are X_train / y_train / X_test; the former
# train_X / train_y / test_X names were never defined.
mapie.fit(X_train, y_train)

alpha = 0.1 # for 90% target coverage

# Use mapie.predict() to get predicted values and intervals.
# The intervals are bound to `y_test_pis`, the name every later cell reads.
y_test_pred, y_test_pis = mapie.predict(X_test, alpha = alpha)

In [None]:
# Predicted values (first element of the pair returned by mapie.predict)
y_test_pred

In [None]:
# Prediction intervals.
# NOTE(review): this name must match the variable that receives the intervals
# from mapie.predict in the MAPIE cell.
y_test_pis

In [None]:
# Storing results in a dataframe.
# The held-out targets are called `y_test` in this notebook (`test_y` was
# never defined).
predictions = y_test.to_frame()
predictions.columns = ['Actual Value']
# NOTE(review): .round() rounds to whole numbers -- confirm that this precision
# is appropriate for the units of the target before interpreting coverage.
predictions["Predicted Value"] = y_test_pred.round()
# Each row of the reshaped array holds one (lower, upper) pair.
predictions["Lower Value"] = y_test_pis.reshape(-1,2)[:,0].round()
predictions["Upper Value"] = y_test_pis.reshape(-1,2)[:,1].round()

# Take a quick look
predictions

In [None]:
# Short handles for the columns used repeatedly below.
actual = predictions["Actual Value"]
predicted = predictions["Predicted Value"]
lower = predictions["Lower Value"]
upper = predictions["Upper Value"]

# Signed prediction error.
predictions["Error"] = predicted - actual

# Interval bounds expressed relative to the prediction (upper >= 0 >= lower
# whenever the prediction lies inside its own interval).
predictions["Error_upper"] = upper - predicted
predictions["Error_lower"] = -(predicted - lower)

# Order the rows from the narrowest to the widest interval.
predictions["Interval_width"] = upper - lower
sorted_predictions = predictions.sort_values(by=['Interval_width']).reset_index(drop=True)

sorted_predictions

In [None]:
# Errors against the interval band, rows ordered by interval width.
fig, ax = plt.subplots(figsize=(18, 8))

ax.plot(sorted_predictions["Error"], 'o', markersize=3, label="Error (y_pred - y_true)")

row_positions = np.arange(len(sorted_predictions))
ax.fill_between(row_positions,
                sorted_predictions["Error_lower"],
                sorted_predictions["Error_upper"],
                alpha=0.5, color="grey", label="Prediction Interval")

# Horizontal reference at zero error.
ax.axline([0, 0], [1, 0], color="red", linestyle='--', lw=2, zorder=3, label="y_true")
ax.set_xticks([])
ax.set_xlim([0, len(sorted_predictions)])
ax.set_ylabel("Errors")
ax.legend(loc="upper left", fontsize=14)
plt.show()

In [None]:
# Flag rows whose error is NOT strictly between the relative interval bounds
# (1 = outside, 0 = inside), then print the percentage of rows that are inside.
error_inside_band = (
    sorted_predictions["Error"].lt(sorted_predictions["Error_upper"])
    & sorted_predictions["Error"].gt(sorted_predictions["Error_lower"])
)
sorted_predictions["is_outside_range"] = (~error_inside_band).astype("int64")

n_outside = sorted_predictions["is_outside_range"].sum()
print(round(100-(100/len(sorted_predictions))*n_outside,1))

In [None]:
# Check whether each ground-truth value lies strictly inside its prediction
# interval. Despite the column name, a stored 1 marks a row whose true value is
# outside the interval; the printed figure is the percentage that is inside.
truth_inside_interval = (
    sorted_predictions["Actual Value"].lt(sorted_predictions["Upper Value"])
    & sorted_predictions["Actual Value"].gt(sorted_predictions["Lower Value"])
)
sorted_predictions["gt_within_PI"] = (~truth_inside_interval).astype("int64")

n_missed = sorted_predictions["gt_within_PI"].sum()
print(round(100-(100/len(sorted_predictions))*n_missed,1))

In [None]:
# Re-sort by the true value so the interval band can be read left to right.
sorted_predictions = predictions.sort_values(by=['Actual Value']).reset_index(drop=True)

fig, ax = plt.subplots(figsize=(30, 9))

ax.plot(sorted_predictions["Actual Value"], 'o', markersize=3, label="Actual Value")

row_positions = np.arange(len(sorted_predictions))
ax.fill_between(row_positions,
                sorted_predictions["Lower Value"],
                sorted_predictions["Upper Value"],
                alpha=0.5, color="grey", label="prediction interval")

ax.set_xticks([])
ax.set_xlim([0, len(sorted_predictions)])
ax.set_ylabel("True value")
ax.legend(loc="upper left", fontsize=14)
plt.show()

# PREDICTION

In [None]:
# Recover the stored model location. The pickle is one this notebook wrote
# itself; never unpickle files from an untrusted source.
with open('density_model_path.pkl', 'rb') as path_file:
    loaded_model_path = pickle.loads(path_file.read())

# Restore the Keras model from that path.
loaded_model = load_model(loaded_model_path)

In [None]:
# # Later, to load the model
# with open('viscosity_model_path.pkl', 'rb') as f:
#     loaded_model_path = pickle.load(f)

# # Load the Keras model
# loaded_model = load_model(loaded_model_path)

In [None]:
# # Later, to load the model
# with open('surface_model_path.pkl', 'rb') as f:
#     loaded_model_path = pickle.load(f)

# # Load the Keras model
# loaded_model = load_model(loaded_model_path)