In [162]:
"""
#Task: Predict the change in energy of the structure given the host structure (xyz data (which contains the x, y, z coordinates of each atom in the structure) + element list data (which contains the elements within the structure, listed in the same order as their corresponding coordinates in the xyz data)) and the index of the atom that is removed (use one-hot encoding to represent the element that was removed: Mo: Molybdenum, S: Sulfur, Se: Selenium, Te: Tellurium, W: Tungsten)

#Machine Learning Task Type: Deep Learning Neural Network that learns a relationship between the associated change in the structure of the host and the corresponding change in energy of the structure
"""

'\n#Task: Predict the change in energy of the structure given the host structure (xyz data (which contains the x, y, z coordinates of each atom in the structure) + element list data (which contains the elements within the structure, listed in the same order as their corresponding coordinates in the xyz data)) and the index of the atom that is removed (use one-hot encoding to represent the element that was removed: Mo: Molybdenum, S: Sulfur, Se: Selenium, Te: Tellurium, W: Tungsten)\n\n#Machine Learning Task Type: Deep Learning Neural Network that learns a relationship between the associated change in the structure of the host and the corresponding change in energy of the structure\n'

In [163]:
#Step 1: load the per-element vacancy tables and stack them into one frame
import pandas as pd
import os

folder_path = '/Users/robertspataru/Desktop/project_newData'
output_file = 'vac_origHost_compiled.csv'

# Element symbols that can be removed; the list order also fixes the slot
# order of the one-hot vectors built from it in a later cell.
possible_elements_removed = ['Mo', 'S', 'Se', 'Te', 'W']

# One headerless CSV per removable element: vac_<element>_origHost.csv
csv_files = [f'vac_{element}_origHost.csv' for element in possible_elements_removed]
column_names = ['Structure_Combination', 'Atom_Removed_Location', 'Energy_Value_Change']

dfs_vac_origHost = []
for csv_name in csv_files:
    # The files carry no header row, so the column names are attached by hand.
    vacancy_frame = pd.read_csv(os.path.join(folder_path, csv_name), header=None)
    vacancy_frame.columns = column_names
    dfs_vac_origHost.append(vacancy_frame)

# A fresh 0..n-1 index replaces the per-file indices.
df_vac_origHost = pd.concat(dfs_vac_origHost, ignore_index=True)


In [164]:
#Step 2: read the compiled host-structure listing (one atom per line)

input_file = '/Users/robertspataru/Desktop/project_newData/CIFhost_generic_compiled.txt'
output_file = 'CIFhost_generic_compiled.csv'

initial_column_names = ['Atom_Name', 'X-Coordinate', 'Y-Coordinate', 'Z-Coordinate']

# The separator is a regular expression ("one or more whitespace characters").
# It is written as a raw string because '\s' inside a normal string literal is
# an invalid escape sequence, which Python reports with a warning.
orig_df_CIFs_sorted = pd.read_csv(input_file, sep=r'\s+', header=None)
orig_df_CIFs_sorted.columns = initial_column_names

#(orig_df_CIFs_sorted.head())

In [165]:
#Step #3: tag every atom row with its structure label ("x_y") and its
#         1-based position inside that structure

middle_column_names = ['Atom_Name','Structure_Combination', 'Atom_Location', 'X-Coordinate', 'Y-Coordinate', 'Z-Coordinate']

ATOMS_PER_STRUCTURE = 48   # consecutive rows that make up one structure
STRUCTURES_PER_GROUP = 72  # y runs 1..72, then x advances and y restarts at 1

num_rows = len(orig_df_CIFs_sorted.index)

Atom_Location_List = []
Structure_Combination_List = []
for i in range(num_rows):
    # Position of the atom inside its structure, restarting at 1 every 48 rows.
    Atom_Location_List.append((i % ATOMS_PER_STRUCTURE) + 1)

    # 0-based running structure number -> (x, y) label, both 1-based.
    structure_number = i // ATOMS_PER_STRUCTURE
    group_offset, structure_offset = divmod(structure_number, STRUCTURES_PER_GROUP)
    Structure_Combination_List.append(f"{group_offset + 1}_{structure_offset + 1}")

# Work on a copy: inserting into an alias of the loaded frame would modify it
# in place, and re-running this cell would then fail because the columns
# already exist.
mid_df_CIFs_sorted = orig_df_CIFs_sorted.copy()
mid_df_CIFs_sorted.insert(loc = 1, column = middle_column_names[1], value = Structure_Combination_List)
mid_df_CIFs_sorted.insert(loc = 2, column = middle_column_names[2], value = Atom_Location_List)

#print(mid_df_CIFs_sorted.head())


In [166]:
#Step #4: replace the element symbol by a one-hot vector and fold the three
#         coordinate columns into a single [x, y, z] list per atom

mid_2_column_names = ['Atom_Name_Vector', 'Structure_Combination', 'Atom_Location', 'Coordinate_Vector']
coordinate_columns = ['X-Coordinate', 'Y-Coordinate', 'Z-Coordinate']

# One-hot encode each element symbol; the slot order follows
# possible_elements_removed. An unknown symbol raises ValueError from .index().
Atom_Name_Column = mid_df_CIFs_sorted['Atom_Name']
Atom_Name_Vector_List = []
for atom in Atom_Name_Column:
    atom_name_vector = [0] * len(possible_elements_removed)
    atom_name_vector[possible_elements_removed.index(atom)] = 1
    Atom_Name_Vector_List.append(atom_name_vector)

# One [x, y, z] list per row, converted in a single pass. tolist() yields plain
# Python floats, so the lists keep a clean textual form when written to CSV.
Coordinate_Vector_List = mid_df_CIFs_sorted[coordinate_columns].to_numpy().tolist()

# drop() and rename() return new frames, so mid_df_CIFs_sorted is left intact
# and this cell can be re-run safely.
mid_2_df_CIFs_sorted = (
    mid_df_CIFs_sorted
    .drop(columns = coordinate_columns)
    .rename(columns = {'Atom_Name': 'Atom_Name_Vector'})
)
mid_2_df_CIFs_sorted['Atom_Name_Vector'] = Atom_Name_Vector_List
mid_2_df_CIFs_sorted['Coordinate_Vector'] = Coordinate_Vector_List

#print(mid_2_df_CIFs_sorted.head())




In [167]:
# Collapse the per-atom table into one row per structure: every block of 48
# consecutive atom rows becomes a single row holding lists of the one-hot
# vectors, the atom positions and the coordinate vectors.
final_column_names = ['Atom_Name_Vector_List','Structure_Combination', 'Atom_Location_Vector_List','Coordinate_Vector_List']

atoms_per_structure = 48
total_atom_rows = len(mid_2_df_CIFs_sorted)

# A trailing partial block would otherwise be dropped without any notice.
if total_atom_rows % atoms_per_structure != 0:
    raise ValueError(
        f"Expected a multiple of {atoms_per_structure} atom rows, got {total_atom_rows}"
    )

# Pull each column out once as a Python list instead of doing several
# positional row look-ups per atom. tolist() also turns the integer positions
# into plain Python ints.
atom_name_vectors = mid_2_df_CIFs_sorted['Atom_Name_Vector'].tolist()
atom_locations = mid_2_df_CIFs_sorted['Atom_Location'].tolist()
coordinate_vectors = mid_2_df_CIFs_sorted['Coordinate_Vector'].tolist()
structure_labels = mid_2_df_CIFs_sorted['Structure_Combination'].tolist()

block_starts = range(0, total_atom_rows, atoms_per_structure)

atom_name_vector_list = [atom_name_vectors[start:start + atoms_per_structure] for start in block_starts]
atom_location_vector_list = [atom_locations[start:start + atoms_per_structure] for start in block_starts]
coordinate_vector_list = [coordinate_vectors[start:start + atoms_per_structure] for start in block_starts]
# The label is read from the last row of each block.
structure_combination_list = [structure_labels[start + atoms_per_structure - 1] for start in block_starts]

final_df_CIFs_sorted = pd.DataFrame({
    'Atom_Name_Vector_List': atom_name_vector_list,
    'Structure_Combination': structure_combination_list,
    'Atom_Location_Vector_List': atom_location_vector_list,
    'Coordinate_Vector_List': coordinate_vector_list,
})

#print(final_df_CIFs_sorted.head())

#print(atom_name_vector_list[0])
#print(structure_combination_list[0])
#print(atom_location_vector_list[0])
#print(coordinate_vector_list[0])


                               Atom_Name_Vector_List Structure_Combination  \
0  [[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...                   1_1   
1  [[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...                   1_2   
2  [[0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [1, 0, 0, 0...                   1_3   
3  [[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0...                   1_4   
4  [[1, 0, 0, 0, 0], [0, 0, 0, 0, 1], [1, 0, 0, 0...                   1_5   

                           Atom_Location_Vector_List  \
0  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...   
1  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...   
2  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...   
3  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...   
4  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...   

                              Coordinate_Vector_List  
0  [[-0.0, 0.00408, 0.49999], [0.125, 0.25408, 0....  
1  [[-0.0, 0.00408, 0.49999], [0.125, 0.25408, 0....  
2  [[-0.0, 0.00408, 0.49999],

In [177]:
# Attach the per-structure host description (the lists built from its 48 atoms)
# to every vacancy record by matching on the "x_y" structure label. Merging
# with on= keeps a single 'Structure_Combination' column, so there is no
# duplicate key column to delete afterwards.
# how='left' keeps every vacancy record, even when no host structure matches.
# validate='many_to_one' makes pandas raise if a structure label occurs more
# than once on the host side, which would otherwise duplicate vacancy rows.
final_compiled_dataframe = pd.merge(
    df_vac_origHost,
    final_df_CIFs_sorted,
    on = 'Structure_Combination',
    how = 'left',
    validate = 'many_to_one',
)
final_compiled_dataframe.to_csv('NanoMaterialAnalysis.csv', header = True, index = False)
#current_working_directory = os.getcwd()
#print(current_working_directory)
#print(final_compiled_dataframe.head())



In [169]:
from sklearn.preprocessing import StandardScaler
#Step : 
#create a python function to generate a machine learning model 
#


In [170]:
#Step 
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers

In [171]:
#Step : learn the correct hyperparameters

In [172]:
#Step : load in test data to test model and print out performance metrics

In [173]:
#hyperparameter tuning (random/grid search)

#standardize the energy loss values, and then turn them back to normal values when wanting to predict the associated energy value

#possibly make a function to take the nearest x (must create a hyperparameter and test different values of x) atoms and their corresponding x, y, z coordinates from the .CIF files for each of the atoms that were removed from their structure

In [174]:
#: test model again and print out new performance metrics 