In [2]:
#import python/ml packages
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.layers import Input, Dense, Flatten, Concatenate, LSTM
from tensorflow.keras.models import Model
from keras_tuner import HyperModel
from keras_tuner.tuners import BayesianOptimization
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [7]:
# Read the prepared dataset from disk and preview its first five rows.
final_data_df = pd.read_csv(filepath_or_buffer='final_data.csv')
final_data_df.head(n=5)

Unnamed: 0,Structure_Combination,Atom_Removed_Location,Atom_Name_Vector_List,Atom_Location_Vector_List,Coordinate_Vector_List,Energy_Value_Change
0,1_1,7,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[[-0.0, 0.00408, 0.49999], [0.125, 0.25408, 0....",-6.8879
1,1_1,3,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[[-0.0, 0.00408, 0.49999], [0.125, 0.25408, 0....",-7.0395
2,1_1,1,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[[-0.0, 0.00408, 0.49999], [0.125, 0.25408, 0....",-6.3746
3,1_11,2,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[[-0.0, 0.00408, 0.49999], [0.125, 0.25408, 0....",-5.4809
4,1_11,13,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[[-0.0, 0.00408, 0.49999], [0.125, 0.25408, 0....",-4.8951


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
final_data_df.describe()

Unnamed: 0,Atom_Removed_Location,Energy_Value_Change
count,3726.0,3726.0
mean,22.039721,-4.256561
std,14.238092,1.432308
min,1.0,-7.6434
25%,10.0,-5.540275
50%,20.0,-3.58985
75%,34.0,-3.2812
max,48.0,-2.1122


In [ ]:
#TODO: prepare the data so it is suitable for the deep learning neural network
#TODO: implement more sophisticated preprocessing or feature engineering



In [ ]:
#Define the hypermodel class that builds the model
class EnergyPredictionModel(HyperModel):
    """Tunable four-input regression model for use with Keras Tuner.

    The network takes a structured input, an atom-name input, an
    atom-location input and a coordinate input. The atom-name and coordinate
    inputs each pass through an LSTM layer and the atom-location input through
    a ReLU Dense layer; the three results are concatenated with the raw
    structured input, fed through two ReLU Dense layers and reduced to a
    single output unit.
    """

    def __init__(self, input_shapes, name=None, tunable=True):
        """Store the per-input shapes used to create the Keras Input layers.

        Args:
            input_shapes: Mapping with the keys 'structured', 'atom_name',
                'atom_location' and 'coordinate'; each value is passed as
                ``shape`` to the matching ``Input`` layer.
            name: Optional hypermodel name, forwarded to ``HyperModel``.
            tunable: Forwarded to ``HyperModel``.

        Raises:
            KeyError: If one of the four keys is missing from ``input_shapes``.
        """
        # Run the base-class constructor so the attributes HyperModel itself
        # manages are initialised; overriding __init__ otherwise skips that.
        super().__init__(name=name, tunable=tunable)
        self.input_structured_shape = input_shapes['structured']
        self.input_atom_name_shape = input_shapes['atom_name']
        self.input_atom_location_shape = input_shapes['atom_location']
        self.input_coordinate_shape = input_shapes['coordinate']

    def build(self, hp):
        """Build and compile the model for one set of hyperparameters.

        Hyperparameters read from ``hp``:
            units_atom_name, units_atom_location, units_coordinate,
            units_dense2: integers from 16 to 64 in steps of 16.
            units_dense1: integer from 32 to 128 in steps of 32.
            learning_rate: one of 1e-2, 1e-3, 1e-4.

        Args:
            hp: Hyperparameter container supplied by the tuner.

        Returns:
            A compiled Keras ``Model`` with four inputs (structured, atom
            name, atom location, coordinate, in that order) and one output,
            using the Adam optimizer, mean-squared-error loss and an MAE
            metric.
        """
        input_structured = Input(shape=self.input_structured_shape, name='Structured_Input')
        input_atom_name = Input(shape=self.input_atom_name_shape, name='Atom_Name_Input')
        input_atom_location = Input(shape=self.input_atom_location_shape, name='Atom_Location_Input')
        input_coordinate = Input(shape=self.input_coordinate_shape, name='Coordinate_Input')

        # Per-input branches; the layer widths are tunable.
        # NOTE(review): the LSTM branches presumably require sequence-shaped
        # inputs (timesteps, features) — confirm the padded arrays have that layout.
        atom_name_processed = LSTM(hp.Int('units_atom_name', min_value=16, max_value=64, step=16))(input_atom_name)
        atom_location_processed = Dense(hp.Int('units_atom_location', min_value=16, max_value=64, step=16), activation='relu')(input_atom_location)
        coordinate_processed = LSTM(hp.Int('units_coordinate', min_value=16, max_value=64, step=16))(input_coordinate)

        # The structured input is merged as-is, without a branch of its own.
        # assumes it is a flat feature vector compatible with the branch outputs — TODO confirm
        combined = Concatenate()([input_structured, atom_name_processed, atom_location_processed, coordinate_processed])

        # Shared dense head, also with tunable widths.
        x = Dense(hp.Int('units_dense1', min_value=32, max_value=128, step=32), activation='relu')(combined)
        x = Dense(hp.Int('units_dense2', min_value=16, max_value=64, step=16), activation='relu')(x)

        # Single linear unit: the model predicts one scalar per sample.
        output = Dense(1, name='Output')(x)
        model = Model(inputs=[input_structured, input_atom_name, input_atom_location, input_coordinate], outputs=[output])

        # Compile model
        model.compile(optimizer=Adam(hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])), loss='mean_squared_error', metrics=['mae'])

        return model

In [ ]:
#Hyperparameter tuning
# NOTE(review): X_structured, atom_name_vector_list_padded,
# atom_location_vector_list_padded, coordinate_vector_list_padded and
# energy_value_change are not defined in the visible part of this notebook;
# the preprocessing step must create them before this cell can run.

# Per-sample shapes (batch axis dropped) for the four model inputs.
input_shapes = {
    'structured': X_structured.shape[1:],
    'atom_name': atom_name_vector_list_padded.shape[1:],
    'atom_location': atom_location_vector_list_padded.shape[1:],
    'coordinate': coordinate_vector_list_padded.shape[1:],
}

hypermodel = EnergyPredictionModel(input_shapes)

tuner = BayesianOptimization(
    hypermodel,
    objective='val_mae',
    max_trials=10,
    executions_per_trial=2,
    directory='hyperparam_tuning',
    project_name='energy_prediction'
)

# Split data
# A single train_test_split call applies the same shuffled indices to every
# array, so the four inputs and the target stay row-aligned. This replaces the
# previously referenced split_data helper, which is not defined in this notebook.
# The 80/20 split and the fixed seed are chosen here for reproducibility.
(X_train_structured, X_test_structured,
 X_train_atom_name, X_test_atom_name,
 X_train_atom_location, X_test_atom_location,
 X_train_coordinate, X_test_coordinate,
 y_train, y_test) = train_test_split(
    X_structured,
    atom_name_vector_list_padded,
    atom_location_vector_list_padded,
    coordinate_vector_list_padded,
    energy_value_change,
    test_size=0.2,
    random_state=42,
)

# Prepare the data in a format that the model expects:
# lists ordered like the model's inputs (structured, atom name, atom location, coordinate).
X_train = [X_train_structured, X_train_atom_name, X_train_atom_location, X_train_coordinate]
X_test = [X_test_structured, X_test_atom_name, X_test_atom_location, X_test_coordinate]

# Stop a trial early once the validation loss has not improved for 3 epochs.
earlystop_callback = EarlyStopping(monitor='val_loss', patience=3)

# Start the hyperparameter search
tuner.search(X_train, y_train,
             epochs=10, validation_split=0.2, callbacks=[earlystop_callback])

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

In [ ]:
#Rebuild and retrain the model using the best hyperparameters, then evaluate it on the test set (possibly using cross-validation)