In [2]:
import this

# Standard libraries
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

# Scikit-learn for data handling and preprocessing
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Keras & TensorFlow for model building and training
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Flatten, Concatenate, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hyperparameter optimization
from hyperopt import hp, tpe, Trials, fmin
from hyperopt.pyll.base import scope
from keras_tuner import HyperModel
from keras_tuner.tuners import BayesianOptimization


In [7]:
# Load the dataset from 'final_data.csv' (path is relative to the notebook's
# working directory) and preview the first rows.
final_data_df = pd.read_csv('final_data.csv')
final_data_df.head()

Unnamed: 0,Structure_Combination,Atom_Removed_Location,Atom_Name_Vector_List,Atom_Location_Vector_List,Coordinate_Vector_List,Energy_Value_Change
0,1_1,7,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[[-0.0, 0.00408, 0.49999], [0.125, 0.25408, 0....",-6.8879
1,1_1,3,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[[-0.0, 0.00408, 0.49999], [0.125, 0.25408, 0....",-7.0395
2,1_1,1,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[[-0.0, 0.00408, 0.49999], [0.125, 0.25408, 0....",-6.3746
3,1_11,2,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[[-0.0, 0.00408, 0.49999], [0.125, 0.25408, 0....",-5.4809
4,1_11,13,"[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[[-0.0, 0.00408, 0.49999], [0.125, 0.25408, 0....",-4.8951


In [8]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns
final_data_df.describe()

Unnamed: 0,Atom_Removed_Location,Energy_Value_Change
count,3726.0,3726.0
mean,22.039721,-4.256561
std,14.238092,1.432308
min,1.0,-7.6434
25%,10.0,-5.540275
50%,20.0,-3.58985
75%,34.0,-3.2812
max,48.0,-2.1122


In [ ]:
class DataPreparation:
    """Turn the raw energy DataFrame into numeric train/test arrays.

    Typical use::

        prep = DataPreparation(final_data_df)
        prep.fine_tune_data()
        X_train, X_test, y_train, y_test = prep.split_data()

    After ``fine_tune_data`` the fitted scalers are available as
    ``atom_removed_loc_scaler`` and ``energy_value_change_scaler`` so that
    scaled predictions can be mapped back with ``inverse_transform``.
    """

    def __init__(self, final_data_df):
        """Pick out the columns of ``final_data_df`` used by the pipeline.

        Args:
            final_data_df: DataFrame with the columns 'Structure_Combination',
                'Atom_Removed_Location', 'Atom_Name_Vector_List',
                'Atom_Location_Vector_List', 'Coordinate_Vector_List' and
                'Energy_Value_Change'.

        Raises:
            KeyError: if one of those columns is missing.
        """
        self.struct_comb_col = final_data_df['Structure_Combination']
        self.atom_removed_loc_col = final_data_df['Atom_Removed_Location']
        self.atom_name_vector_list_col = final_data_df['Atom_Name_Vector_List']
        self.atom_loc_vector_list_col = final_data_df['Atom_Location_Vector_List']
        self.coordinate_vector_list_col = final_data_df['Coordinate_Vector_List']
        self.energy_value_change_col = final_data_df['Energy_Value_Change']

        # One scaler per column; populated by fine_tune_data()
        self.atom_removed_loc_scaler = None
        self.energy_value_change_scaler = None
        # Guards against transforming the already-transformed columns twice
        self._is_fine_tuned = False

    @staticmethod
    def _encode_structure_combination(label):
        """Map an 'A_B' label to the integer ``A * 1000 + B``."""
        parts = label.split('_')
        return int(parts[0]) * 1000 + int(parts[1])

    @staticmethod
    def _parse_vector_column(column):
        """Return ``column`` as a list of nested Python lists.

        Cells that are strings (which is how list-valued columns come back
        from ``pd.read_csv``) are parsed with ``ast.literal_eval``; cells that
        are already sequences are passed through unchanged.
        """
        import ast

        return [ast.literal_eval(cell) if isinstance(cell, str) else cell
                for cell in column]

    def fine_tune_data(self):
        """Encode, standardize and pad the columns in place.

        Calling this more than once is a no-op after the first call.

        NOTE: the scalers are fitted on the full dataset, i.e. before the
        train/test split, so test-set statistics influence the scaling.
        """
        if self._is_fine_tuned:
            return

        # Transform struct_comb_col into a single integer feature column
        self.struct_comb_col = self.struct_comb_col.apply(
            self._encode_structure_combination).values.reshape(-1, 1)

        # Standardize atom_removed_loc_col and energy_value_change_col, each
        # with its own scaler so both remain usable for inverse_transform.
        self.atom_removed_loc_scaler = StandardScaler()
        self.atom_removed_loc_col = self.atom_removed_loc_scaler.fit_transform(
            self.atom_removed_loc_col.values.reshape(-1, 1))
        self.energy_value_change_scaler = StandardScaler()
        self.energy_value_change_col = self.energy_value_change_scaler.fit_transform(
            self.energy_value_change_col.values.reshape(-1, 1))

        # Parse the list-valued columns, then zero-pad every sample at the end
        # up to the length of the longest sample.
        self.atom_name_vector_list_col = pad_sequences(
            self._parse_vector_column(self.atom_name_vector_list_col),
            dtype='float32', padding='post')
        self.atom_loc_vector_list_col = pad_sequences(
            self._parse_vector_column(self.atom_loc_vector_list_col),
            dtype='float32', padding='post')
        self.coordinate_vector_list_col = pad_sequences(
            self._parse_vector_column(self.coordinate_vector_list_col),
            dtype='float32', padding='post')

        self._is_fine_tuned = True

    def split_data(self, test_size=0.2, random_state=42):
        """Build one flat feature matrix and split it into train and test sets.

        Runs ``fine_tune_data`` first if it has not been called yet.

        Args:
            test_size: forwarded to ``train_test_split``.
            random_state: forwarded to ``train_test_split``.

        Returns:
            ``(X_train, X_test, y_train, y_test)``; ``y`` is the standardized
            energy change as a column vector.
        """
        if not self._is_fine_tuned:
            self.fine_tune_data()

        # Each padded vector feature is flattened to one row per sample so all
        # features can be stacked side by side.
        X = np.hstack([self.struct_comb_col, self.atom_removed_loc_col,
                       self.atom_name_vector_list_col.reshape(len(self.atom_name_vector_list_col), -1),
                       self.atom_loc_vector_list_col.reshape(len(self.atom_loc_vector_list_col), -1),
                       self.coordinate_vector_list_col.reshape(len(self.coordinate_vector_list_col), -1)])
        y = self.energy_value_change_col

        # Split data into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)
        return X_train, X_test, y_train, y_test

In [ ]:
class EnergyPredictionModel(HyperModel):
    """KerasTuner hypermodel for the energy-change regression.

    Each input is passed through its own Dense layer, the branches are
    concatenated (when there is more than one), and two further Dense layers
    feed a linear output layer. Layer widths and the Adam learning rate are
    tunable hyperparameters.
    """

    def __init__(self, X_train, y_train):
        """Record the input and output shapes the built model must have.

        Args:
            X_train: a single array, or a list/tuple of arrays for a
                multi-input model. Only ``.shape[1:]`` of each array is read.
            y_train: training targets. Only ``.shape[1:]`` is read; a 1-D
                target yields a single output unit.
        """
        # Let HyperModel set up the attributes the tuner relies on
        super().__init__()

        # Determine input shapes based on X_train
        if isinstance(X_train, (list, tuple)):
            # Multiple inputs
            self.input_shapes = [tuple(x.shape[1:]) for x in X_train]
        else:
            # Single input
            self.input_shapes = [tuple(X_train.shape[1:])]

        self.output_shape = y_train.shape[1:]

    def build(self, hp):
        """Build and compile the model for one set of hyperparameters.

        Args:
            hp: the KerasTuner ``HyperParameters`` object for this trial.

        Returns:
            A compiled Keras ``Model`` (MSE loss, MAE metric).
        """
        inputs = []
        processed_inputs = []

        for shape in self.input_shapes:
            input_layer = Input(shape=shape)
            inputs.append(input_layer)

            # Inputs with more than one feature axis are flattened so that
            # every branch ends up rank-2 and the branches can be concatenated.
            features = Flatten()(input_layer) if len(shape) > 1 else input_layer

            # The same hyperparameter name is used for every branch, so all
            # branches share one tuned width.
            processed = Dense(hp.Int('units_input', min_value=16, max_value=64, step=16),
                              activation='relu')(features)
            processed_inputs.append(processed)

        # Combine all processed inputs
        if len(processed_inputs) > 1:
            combined = Concatenate()(processed_inputs)
        else:
            combined = processed_inputs[0]

        # Additional layers
        x = Dense(hp.Int('units_dense1', min_value=32, max_value=128, step=32), activation='relu')(combined)
        x = Dense(hp.Int('units_dense2', min_value=16, max_value=64, step=16), activation='relu')(x)

        # Output layer; a 1-D target has an empty trailing shape, so fall back
        # to a single unit instead of indexing into an empty tuple.
        output_units = self.output_shape[0] if len(self.output_shape) > 0 else 1
        output = Dense(output_units, activation='linear')(x)

        model = Model(inputs=inputs, outputs=output)
        model.compile(optimizer=Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
                      loss='mean_squared_error', metrics=['mae'])

        return model

In [ ]:
class HyperParameterTuning:
    """Bayesian hyperparameter search for ``EnergyPredictionModel``.

    Typical use::

        tuning = HyperParameterTuning(prep.split_data())
        best_hps = tuning.search()
        model = tuning.build_best_model()
    """

    def __init__(self, feature_list, max_trials=10, executions_per_trial=2,
                 directory='hyperparam_tuning', project_name='energy_prediction'):
        """Store the data split and the tuner settings; no search is run here.

        Args:
            feature_list: sequence ``(X_train, X_test, y_train, y_test)``.
                ``X_train`` may be one array or a list of arrays for a
                multi-input model.
            max_trials: number of hyperparameter combinations to try.
            executions_per_trial: models trained per combination.
            directory: folder where the tuner stores its results.
            project_name: sub-folder name inside ``directory``.

        Raises:
            ValueError: if ``feature_list`` does not hold exactly four items.
        """
        if len(feature_list) != 4:
            raise ValueError(
                'feature_list must be (X_train, X_test, y_train, y_test), '
                f'got {len(feature_list)} item(s)')
        self.X_train, self.X_test, self.y_train, self.y_test = feature_list

        self.max_trials = max_trials
        self.executions_per_trial = executions_per_trial
        self.directory = directory
        self.project_name = project_name

        # Populated by search()
        self.tuner = None
        self.best_hps = None

    def search(self, epochs=10, validation_split=0.2, patience=3):
        """Run the hyperparameter search on the training data.

        Args:
            epochs: maximum epochs per trained model.
            validation_split: fraction of the training data held out to
                compute the ``val_mae`` objective.
            patience: epochs without ``val_loss`` improvement before a
                training run is stopped early.

        Returns:
            The best ``HyperParameters`` found (also stored in ``best_hps``).
        """
        hypermodel = EnergyPredictionModel(self.X_train, self.y_train)

        self.tuner = BayesianOptimization(
            hypermodel,
            objective='val_mae',
            max_trials=self.max_trials,
            executions_per_trial=self.executions_per_trial,
            directory=self.directory,
            project_name=self.project_name
        )

        earlystop_callback = EarlyStopping(monitor='val_loss', patience=patience)

        # Start the hyperparameter search
        self.tuner.search(self.X_train, self.y_train,
                          epochs=epochs, validation_split=validation_split,
                          callbacks=[earlystop_callback])

        # Get the best hyperparameters
        self.best_hps = self.tuner.get_best_hyperparameters(num_trials=1)[0]
        return self.best_hps

    def build_best_model(self):
        """Return a fresh, untrained model built from the best hyperparameters.

        Raises:
            RuntimeError: if ``search`` has not been run yet.
        """
        if self.tuner is None or self.best_hps is None:
            raise RuntimeError('call search() before build_best_model()')
        return self.tuner.hypermodel.build(self.best_hps)

In [ ]:
    # Retrain the model built from the best hyperparameters and evaluate it on
    # the test set (cross-validation would be a possible extension).
    class TestBestModel:
        """Retrain a compiled model and score it on held-out data."""

        def __init__(self, model, X_train, X_test, y_train, y_test):
            """Store the model and the data split.

            Args:
                model: compiled model exposing ``fit`` and ``predict``.
                X_train, y_train: data the model is retrained on.
                X_test, y_test: held-out data used only by ``evaluate``.
            """
            self.model = model
            self.X_train = X_train
            self.X_test = X_test
            self.y_train = y_train
            self.y_test = y_test

        def train(self, epochs=50, validation_split=0.2, callbacks=None):
            """Fit the model on the training data.

            Args:
                epochs: maximum number of training epochs.
                validation_split: fraction of the training data used for
                    validation during fitting.
                callbacks: Keras callbacks; when ``None``, early stopping on
                    ``val_loss`` with best-weight restoration is used.

            Returns:
                Whatever ``model.fit`` returns.
            """
            if callbacks is None:
                callbacks = [EarlyStopping(monitor='val_loss', patience=5,
                                           restore_best_weights=True)]
            return self.model.fit(self.X_train, self.y_train, epochs=epochs,
                                  validation_split=validation_split,
                                  callbacks=callbacks)

        def evaluate(self):
            """Predict on the test set and return ``{'mae', 'mse', 'r2'}``."""
            predictions = self.model.predict(self.X_test)
            return {
                'mae': mean_absolute_error(self.y_test, predictions),
                'mse': mean_squared_error(self.y_test, predictions),
                'r2': r2_score(self.y_test, predictions),
            }

In [ ]:
def main(data_path='final_data.csv', test_size=0.2, random_state=42):
    """Load the dataset, prepare the features and split them.

    Hyperparameter tuning and final evaluation are not run from here yet.

    Args:
        data_path: CSV file to load.
        test_size: fraction of samples held out for testing.
        random_state: seed for the train/test split.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    data_frame = pd.read_csv(data_path)

    data_preparation = DataPreparation(data_frame)
    data_preparation.fine_tune_data()
    X_train, X_test, y_train, y_test = data_preparation.split_data(
        test_size=test_size, random_state=random_state)

    print(f'Train features: {X_train.shape}, test features: {X_test.shape}')
    return X_train, X_test, y_train, y_test

In [ ]:
# Run the pipeline when this code is executed as the main program
if __name__ == '__main__':
    main()