In [2]:
"""
#Task: Predict the change in energy of the structure given the host structure (xyz data (which contains the x,y,z coordinates of each atom in the structure) + element list data (which contains the elements within the structure and match up in order with their corresponding coordinates in space as the xyz data coordinates)) and the index of the molecule that is removed (use hot one encoding to represent the molecule that was removed: Mo: Molybdenum, S: Sulfur, Se: Selenium, Te: Tellurium, W: Tungsten)

#Machine Learning Task Type: Deep Learning Neural Network that learns a relationship between the associated change in the structure of the host and the corresponding change in energy of the structure
"""

'\n#Task: Predict the change in energy of the structure given the host structure (xyz data (which contains the x,y,z coordinates of each atom in the structure) + element list data (which contains the elements within the structure and match up in order with their corresponding coordinates in space as the xyz data coordinates)) and the index of the molecule that is removed (use hot one encoding to represent the molecule that was removed: Mo: Molybdenum, S: Sulfur, Se: Selenium, Te: Tellurium, W: Tungsten)\n\n#Machine Learning Task Type: Deep Learning Neural Network that learns a relationship between the associated change in the structure of the host and the corresponding change in energy of the structure\n'

In [ ]:
#import python/ml packages

import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz as sv
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
#Step 1: 
folder_path = '/Users/robertspataru/Desktop/project_newData'
output_file = 'vac_origHost_compiled.csv'
csv_files = ['vac_Mo_origHost.csv', 'vac_S_origHost.csv', 'vac_Se_origHost.csv', 'vac_Te_origHost.csv', 'vac_W_origHost.csv']
column_names = ['Structure_Combination', 'Atom_Removed_Location', 'Energy_Value_Change']  

possible_elements_removed = ['Mo', 'S', 'Se', 'Te', 'W']
dfs_vac_origHost = []
for file_name in csv_files:
    full_file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(full_file_path, header = None)
    df.columns = column_names
    dfs_vac_origHost.append(df)
df_vac_origHost = pd.concat(dfs_vac_origHost, ignore_index = True)
#print(df_vac_origHost.head())

In [4]:
#Step 2: 

input_file = '/Users/robertspataru/Desktop/project_newData/CIFhost_generic_compiled.txt'
output_file = 'CIFhost_generic_compiled.csv'

initial_column_names = ['Atom_Name', 'X-Coordinate', 'Y-Coordinate', 'Z-Coordinate']
orig_df_CIFs_sorted = []

orig_df_CIFs_sorted = pd.read_csv(input_file, sep = '\s+', header = None)
orig_df_CIFs_sorted.columns = initial_column_names

#(orig_df_CIFs_sorted.head())

In [5]:
#Step #3:

middle_column_names = ['Atom_Name','Structure_Combination', 'Atom_Location', 'X-Coordinate', 'Y-Coordinate', 'Z-Coordinate']
mid_df_CIFs_sorted = orig_df_CIFs_sorted

Atom_Location_List = []
Structure_Combination_List = []

x = 1
y = 1
num_rows = len(orig_df_CIFs_sorted.index)

for i in range(num_rows):
    # Append the current atom location, resetting to 1 after every 48 atoms
    Atom_Location_List.append((i % 48) + 1)
    
    # Construct the structure combination string and append it to the list
    Structure_Combination_List.append(f"{x}_{y}")
    
    # Check if we've reached the end of a structure combination (every 48 atoms)
    if (i + 1) % 48 == 0:
        y += 1  # Increment y after every 48 atoms
        if y > 72:
            y = 1  # Reset y back to 1 after reaching 72
            x += 1  # Increment x when y resets


mid_df_CIFs_sorted.insert(loc = 1, column = middle_column_names[1], value = Structure_Combination_List)
mid_df_CIFs_sorted.insert(loc = 2, column = middle_column_names[2], value = Atom_Location_List)

#print(mid_df_CIFs_sorted.head())

In [6]:
#Step #4: 

mid_2_column_names = ['Atom_Name_Vector', 'Structure_Combination', 'Atom_Location', 'Coordinate_Vector']
mid_2_df_CIFs_sorted = mid_df_CIFs_sorted

Atom_Name_Column = mid_2_df_CIFs_sorted['Atom_Name']
Atom_Name_Vector_List = []
for atom in Atom_Name_Column:
    atom_name_vector = [0] * len(possible_elements_removed)
    index_to_encode = possible_elements_removed.index(atom)
    atom_name_vector[index_to_encode] = 1
    Atom_Name_Vector_List.append(atom_name_vector)

Coordinate_Vector_List = []
for i in range(num_rows):
    coordinate_vector = [mid_2_df_CIFs_sorted.loc[i, 'X-Coordinate'], mid_2_df_CIFs_sorted.loc[i, 'Y-Coordinate'], mid_2_df_CIFs_sorted.loc[i, 'Z-Coordinate']]
    Coordinate_Vector_List.append(coordinate_vector)

mid_2_df_CIFs_sorted['Atom_Name'] = Atom_Name_Vector_List
mid_2_df_CIFs_sorted.rename(columns = {'Atom_Name': 'Atom_Name_Vector'}, inplace = True)

mid_2_df_CIFs_sorted.drop(['X-Coordinate', 'Y-Coordinate', 'Z-Coordinate'], axis = 1, inplace = True)
mid_2_df_CIFs_sorted['Coordinate_Vector'] = Coordinate_Vector_List

#print(mid_2_df_CIFs_sorted.head())

In [7]:
final_column_names = ['Atom_Name_Vector_List','Structure_Combination', 'Atom_Location_Vector_List','Coordinate_Vector_List']
final_df_CIFs_sorted = pd.DataFrame()

atom_name_vector_list = []
temp_atom_name_vector_list = []

structure_combination_list = []

atom_location_vector_list = []
temp_atom_location_vector_list = []

coordinate_vector_list = []
temp_coordinate_vector_list = []


for row_index in range(1, num_rows + 1):
        temp_atom_name_vector_list.append(mid_2_df_CIFs_sorted.iloc[row_index - 1]['Atom_Name_Vector'])
        
        temp_atom_location_vector_list.append(mid_2_df_CIFs_sorted.iloc[row_index - 1]['Atom_Location'])
        
        temp_coordinate_vector_list.append(mid_2_df_CIFs_sorted.iloc[row_index - 1]['Coordinate_Vector'])
        
        if row_index % 48 == 0:
            atom_name_vector_list.append(temp_atom_name_vector_list)
            temp_atom_name_vector_list = []
            
            structure_combination_list.append(mid_2_df_CIFs_sorted.iloc[row_index - 1]['Structure_Combination'])
        
            atom_location_vector_list.append(temp_atom_location_vector_list)
            temp_atom_location_vector_list = []
        
            coordinate_vector_list.append(temp_coordinate_vector_list)
            temp_coordinate_vector_list = []

final_df_CIFs_sorted['Atom_Name_Vector_List'] = atom_name_vector_list

final_df_CIFs_sorted['Structure_Combination'] = structure_combination_list

final_df_CIFs_sorted['Atom_Location_Vector_List'] = atom_location_vector_list


final_df_CIFs_sorted['Coordinate_Vector_List'] = coordinate_vector_list

#print(final_df_CIFs_sorted.head())

#print(atom_name_vector_list[0])
#print(structure_combination_list[0])
#print(atom_location_vector_list[0])
#print(coordinate_vector_list[0])

In [8]:
#Combine both of the pandas dataframes into one large .csv file by going through each sentence of the combined .csv file and then matching the specific structure combination with the corresponding 48 atoms and their coordinate space. Then, delete the duplicate column that contains the structure combination in the form x_y.
final_data_df = pd.merge(df_vac_origHost, final_df_CIFs_sorted, on = 'Structure_Combination', how = 'left')
#make the column Energy_Value_Change the last column. Also sort the columns in an order that is more logical



print(final_data_df.head())



  Structure_Combination  Atom_Removed_Location  Energy_Value_Change  \
0                   1_1                      7              -6.8879   
1                   1_1                      3              -7.0395   
2                   1_1                      1              -6.3746   
3                  1_11                      2              -5.4809   
4                  1_11                     13              -4.8951   

                               Atom_Name_Vector_List  \
0  [[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...   
1  [[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...   
2  [[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...   
3  [[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...   
4  [[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...   

                           Atom_Location_Vector_List  \
0  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...   
1  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...   
2  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...   
3  [1, 2, 3,

In [ ]:
# Disable those annoying warnings
tf.get_logger().setLevel('ERROR')

# Turn off GPU usage for tf
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'

In [ ]:
#define helper functions 
#make it so the paramater to this function are two seperate raw dataframes; one is the training set and the other is the test set
def prep_data(raw_df):
    """
    Prepare data that can be readily consumed by ML/DL algorithms.
    - separate features from class variables
    - split into training and testing dataset
    - scale numerical data
    
    param: a dataframe of input data
    output: X_train_normalized, X_test_normalized, y_train, y_test
    """
    raw_data = raw_df.values
    X, y = raw_data[:, :-1], raw_data[:, -1]

    # Split into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

    # normalize data
    scaler = StandardScaler()
    X_train_normalized = scaler.fit_transform(X_train)
    X_test_normalized = scaler.transform(X_test)
    
    #posssibly make a function to take the nearest x (must create a hyperparameter and test different values of x) atoms and their corresponding x, y, z coordinates from the .CIF files for each of the atoms that were removed from their structure
    
    return X_train_normalized, X_test_normalized, y_train, y_test

In [ ]:
def build_model(num_features):
    """
    Build the model architecture (and compile it).
    input: number of features
    output: Keras model object.
    """
    classifier = keras.Sequential()
    classifier.add(layers.Dense(64, input_dim=num_features, activation='relu'))
    classifier.add(layers.Dropout(0.3))
    classifier.add(layers.Dense(32, activation='relu'))
    classifier.add(layers.Dropout(0.3))
    classifier.add(layers.Dense(8, activation='softmax'))
    classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return classifier
    
    #Step : #hyperparameter tuning (random/grid search)

In [ ]:
def plot_history(history, param):
    """
    Shows how the model performs (in terms of accuracy and loss) over several epochs.
    """
    if param == 'acc':
        plt.plot(history.history['accuracy'])
        plt.plot(history.history['val_accuracy'])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper left')
        plt.show()
    elif param == 'loss':
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper right')
        plt.show()

In [ ]:
#Step : #hyperparameter tuning (random/grid search)
#Step : load in test data to test model and print out performance mertrics

#possibly standardize the energy loss values, and then turn them back to normal values when wanting to predict the associated energy value

#posssibly make a function to take the nearest x (must create a hyperparameter and test different values of x) atoms and their corresponding x, y, z coordinates from the .CIF files for each of the atoms that were removed from their structure
#: test model again and print out new performance metrics 

In [ ]:
### The main function below drives the entire code. It prepares the dataset, builds a model with appropriate parameters, evaluates the model and predicts on the test data. Finally, plots some performance metrics.
def main():
    raw_df = final_data_df
    cols = raw_df.columns.tolist()
    features, label = cols[:-1], cols[-1]
    X_train, X_test, y_train, y_test = prep_data(raw_df)

    # Build a DL model
    num_features = len(features)
    model = build_model(num_features)

    print("Summary report of Keras classifier:")
    model.summary()

    num_epochs = 100
    batch_size = 1024
    earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=3)
    history = model.fit(X_train,
                        y_train,
                        epochs=num_epochs,
                        batch_size=batch_size,
                        callbacks=[earlystop_callback],
                        validation_split=0.1,
                        verbose=1)

    plot_history(history, 'acc')
    plot_history(history, 'loss')

    score = model.evaluate(X_test, y_test, verbose=0)
    print(f'Test loss: {score[0]}')
    print(f'Test accuracy: {score[1]}')

    y_pred = model.predict(X_test)
   # Assuming y_true contains the actual values and y_pred contains the model's predictions

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (R²): {r2}")
   

In [ ]:
if __name__ == '__main__':
    main()