In [2]:
#imports
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tools import *
import pickle

In [None]:
pickle_files = ["energy_column.pkl", "full_summary.pkl", "lat.pkl", "main_data.pkl"]
for file_name in pickle_files:
    with open(file_name, "rb") as file:
        data = pickle.load(file)
    dataframe = pd.DataFrame(data)
    file_name = file_name.removesuffix(".pkl")
    dataframe.to_csv(f"{file_name}.csv")


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
h_n_df = pd.read_csv("data_csv/main_data.csv")
print(h_n_df.head())
print(h_n_df.columns)
print(h_n_df.shape)
print(h_n_df.info())
print(h_n_df.describe())
print(h_n_df.tail())
print(h_n_df.isnull().sum())


  Unnamed: 0                                           Hydrogen Nitrogen
0      H_523  [[0.625 0.125 0.125]\n [0.25  0.25  0.25 ]\n [...      NaN
1      H_393  [[0.625 0.125 0.125]\n [0.25  0.25  0.25 ]\n [...      NaN
2      H_136  [[0.25  0.25  0.25 ]\n [0.75  0.25  0.25 ]\n [...      NaN
3      H_397  [[0.125 0.125 0.125]\n [0.625 0.125 0.125]\n [...      NaN
4      H_490  [[0.125 0.125 0.125]\n [0.625 0.125 0.125]\n [...      NaN
Index(['Unnamed: 0', 'Hydrogen', 'Nitrogen'], dtype='object')
(2358, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2358 entries, 0 to 2357
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2358 non-null   object
 1   Hydrogen    1179 non-null   object
 2   Nitrogen    1179 non-null   object
dtypes: object(3)
memory usage: 55.4+ KB
None
       Unnamed: 0 Hydrogen Nitrogen
count        2358     1179     1179
unique       2358     1051      863
top         H_523       []    

In [None]:
#print(h_n_df["Unnamed: 0"])
indices = h_n_df["Unnamed: 0"]
print(indices)
indices = indices[indices.notna()]
print(indices)
indices_values = indices.values
print(indices_values)

#print(h_n_df["Hydrogen"])
hydrogen_col = h_n_df["Hydrogen"]
print(hydrogen_col)
hydrogen_col = hydrogen_col[hydrogen_col.notna()]
print(hydrogen_col)
hydrogen_col_values = hydrogen_col.values
print(hydrogen_col_values)


#print(h_n_df["Nitrogen"])
nitrogen_col = h_n_df["Nitrogen"]
print(nitrogen_col)
nitrogen_col = nitrogen_col[nitrogen_col.notna()]
print(nitrogen_col)
nitrogen_col_values = nitrogen_col.values
print(nitrogen_col_values)


0        H_523
1        H_393
2        H_136
3        H_397
4        H_490
         ...  
2353    N_1151
2354    N_1141
2355     N_382
2356    N_1111
2357     N_104
Name: Unnamed: 0, Length: 2358, dtype: object
0        H_523
1        H_393
2        H_136
3        H_397
4        H_490
         ...  
2353    N_1151
2354    N_1141
2355     N_382
2356    N_1111
2357     N_104
Name: Unnamed: 0, Length: 2358, dtype: object
['H_523' 'H_393' 'H_136' ... 'N_382' 'N_1111' 'N_104']
0       [[0.625 0.125 0.125]\n [0.25  0.25  0.25 ]\n [...
1       [[0.625 0.125 0.125]\n [0.25  0.25  0.25 ]\n [...
2       [[0.25  0.25  0.25 ]\n [0.75  0.25  0.25 ]\n [...
3       [[0.125 0.125 0.125]\n [0.625 0.125 0.125]\n [...
4       [[0.125 0.125 0.125]\n [0.625 0.125 0.125]\n [...
                              ...                        
2353                                                  NaN
2354                                                  NaN
2355                                                  NaN
2

In [22]:
hydrogen_nitrogen_stacked = pd.DataFrame(pd.concat([hydrogen_col, nitrogen_col], axis=0))
hydrogen_nitrogen_stacked["Index"] = indices_values
hydrogen_nitrogen_stacked.columns = ["Element_Matrix", "Index"]
hydrogen_nitrogen_stacked.to_csv("data_csv/hydrogen_nitrogen_stacked.csv", index=False)
print(hydrogen_nitrogen_stacked)

                                         Element_Matrix   Index
0     [[0.625 0.125 0.125]\n [0.25  0.25  0.25 ]\n [...   H_523
1     [[0.625 0.125 0.125]\n [0.25  0.25  0.25 ]\n [...   H_393
2     [[0.25  0.25  0.25 ]\n [0.75  0.25  0.25 ]\n [...   H_136
3     [[0.125 0.125 0.125]\n [0.625 0.125 0.125]\n [...   H_397
4     [[0.125 0.125 0.125]\n [0.625 0.125 0.125]\n [...   H_490
...                                                 ...     ...
2353  [[0.875 0.875 0.875]\n [0.875 0.375 0.375]\n [...  N_1151
2354  [[0.125 0.125 0.625]\n [0.625 0.625 0.125]\n [...  N_1141
2355  [[0.125 0.125 0.625]\n [0.625 0.125 0.625]\n [...   N_382
2356                                                 []  N_1111
2357  [[0.125 0.125 0.125]\n [0.25  0.75  0.25 ]\n [...   N_104

[2358 rows x 2 columns]


In [None]:

hydrogen_nitrogen_stacked.sort_values(by="Index", inplace=True)
print(hydrogen_nitrogen_stacked)

                                         Element_Matrix   Index
227   [[0.125 0.125 0.125]\n [0.75  0.25  0.25 ]\n [...     H_1
427   [[0.125 0.125 0.125]\n [0.125 0.125 0.625]\n [...    H_10
51    [[0.625 0.125 0.625]\n [0.25  0.75  0.25 ]\n [...   H_100
458   [[0.125 0.125 0.125]\n [0.125 0.625 0.125]\n [...  H_1000
1006  [[0.375 0.875 0.875]\n [0.625 0.625 0.125]\n [...  H_1001
...                                                 ...     ...
1942  [[0.125 0.625 0.125]\n [0.125 0.125 0.125]\n [...   N_995
1934  [[0.125 0.625 0.125]\n [0.875 0.875 0.875]\n [...   N_996
1881                                                 []   N_997
1976                                                 []   N_998
1812  [[0.125 0.125 0.625]\n [0.25  0.75  0.25 ]\n [...   N_999

[2358 rows x 2 columns]


In [32]:
hydrogen_values = hydrogen_nitrogen_stacked[hydrogen_nitrogen_stacked["Index"].str.startswith("H_")]
print(hydrogen_values)
print(len(hydrogen_values))
hydrogen_values.to_csv("data_csv/hydrogen_values.csv", index=False)
nitrogen_values = hydrogen_nitrogen_stacked[hydrogen_nitrogen_stacked["Index"].str.startswith("N_")]
print(nitrogen_values)
print(len(nitrogen_values))
nitrogen_values.to_csv("data_csv/nitrogen_values.csv", index=False)


                                         Element_Matrix   Index
227   [[0.125 0.125 0.125]\n [0.75  0.25  0.25 ]\n [...     H_1
427   [[0.125 0.125 0.125]\n [0.125 0.125 0.625]\n [...    H_10
51    [[0.625 0.125 0.625]\n [0.25  0.75  0.25 ]\n [...   H_100
458   [[0.125 0.125 0.125]\n [0.125 0.625 0.125]\n [...  H_1000
1006  [[0.375 0.875 0.875]\n [0.625 0.625 0.125]\n [...  H_1001
...                                                 ...     ...
879               [[0.75 0.25 0.25]\n [0.25 0.75 0.75]]   H_995
737   [[0.25  0.75  0.75 ]\n [0.75  0.25  0.75 ]\n [...   H_996
751   [[0.875 0.875 0.375]\n [0.625 0.125 0.125]\n [...   H_997
722   [[0.625 0.625 0.125]\n [0.875 0.875 0.875]\n [...   H_998
685   [[0.625 0.625 0.125]\n [0.75  0.75  0.25 ]\n [...   H_999

[1179 rows x 2 columns]
1179
                                         Element_Matrix   Index
1547  [[0.625 0.125 0.125]\n [0.25  0.25  0.25 ]\n [...     N_1
1203  [[0.125 0.625 0.125]\n [0.625 0.125 0.625]\n [...    N_10
1354  [[0.

In [None]:
def remove_prefix(text):
    if text.startswith("H_"):
        prefix = "H_"
    elif text.startswith("N_"):
        prefix = "N_"
    return text[len(prefix):]
hydrogen_values["Index"] = hydrogen_values["Index"].apply(remove_prefix)
print(hydrogen_values)
hydrogen_values["Index"] = hydrogen_values["Index"].astype(int)
print(hydrogen_values)
hydrogen_values.sort_values(by="Index", inplace=True)
print(hydrogen_values)



AttributeError: 'int' object has no attribute 'startswith'

In [35]:
hydrogen_values.to_csv("data_csv/hydrogen_values.csv", index=False)

In [36]:
nitrogen_values["Index"] = nitrogen_values["Index"].apply(remove_prefix)
print(nitrogen_values)
nitrogen_values["Index"] = nitrogen_values["Index"].astype(int)
print(nitrogen_values)
nitrogen_values.sort_values(by="Index", inplace=True)
print(nitrogen_values)
nitrogen_values.to_csv("data_csv/nitrogen_values.csv", index=False)

                                         Element_Matrix Index
1547  [[0.625 0.125 0.125]\n [0.25  0.25  0.25 ]\n [...     1
1203  [[0.125 0.625 0.125]\n [0.625 0.125 0.625]\n [...    10
1354  [[0.125 0.125 0.625]\n [0.25  0.25  0.75 ]\n [...   100
1310                                                 []  1000
1554  [[0.75  0.75  0.75 ]\n [0.25  0.25  0.75 ]\n [...  1001
...                                                 ...   ...
1942  [[0.125 0.625 0.125]\n [0.125 0.125 0.125]\n [...   995
1934  [[0.125 0.625 0.125]\n [0.875 0.875 0.875]\n [...   996
1881                                                 []   997
1976                                                 []   998
1812  [[0.125 0.125 0.625]\n [0.25  0.75  0.25 ]\n [...   999

[1179 rows x 2 columns]
                                         Element_Matrix  Index
1547  [[0.625 0.125 0.125]\n [0.25  0.25  0.25 ]\n [...      1
1203  [[0.125 0.625 0.125]\n [0.625 0.125 0.625]\n [...     10
1354  [[0.125 0.125 0.625]\n [0.25  0.25  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nitrogen_values["Index"] = nitrogen_values["Index"].apply(remove_prefix)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nitrogen_values["Index"] = nitrogen_values["Index"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nitrogen_values.sort_values(by="Index", inplace=True)


In [55]:
import pandas as pd
import numpy as np

hydrogen_matrices = pd.read_csv("data_csv/hydrogen_values.csv")["Element_Matrix"]
hydrogen_matrices_numpy = []

for line in hydrogen_matrices:
    # Clean the string thoroughly
    line = line.strip().replace('"', '').replace('[', '').replace(']', '')
    # Split into rows (assuming rows are separated by newlines)
    rows = [row.strip() for row in line.split('\n') if row.strip()]
    print("Rows after cleaning:", rows)  # Debug output
    try:
        # Convert each row to a list of floats
        array = np.array([list(map(float, row.split())) for row in rows])
        print("First 3 rows:", array[0:3])
        hydrogen_matrices_numpy.append(array)
    except ValueError as e:
        print(f"Error processing line: {line}")
        print(f"Rows: {rows}")
        print(f"Error: {e}")
        raise  # Re-raise to stop and inspect

# Convert to NumPy array (use dtype=object if shapes vary)
hydrogen_matrices_numpy = np.array(hydrogen_matrices_numpy, dtype=object)
print("First 10 matrices:", hydrogen_matrices_numpy[0:10])

Rows after cleaning: ['0.125 0.125 0.125', '0.75  0.25  0.25', '0.125 0.625 0.125', '0.125 0.125 0.625', '0.75  0.75  0.25', '0.75  0.25  0.75', '0.125 0.625 0.625', '0.75  0.75  0.75']
First 3 rows: [[0.125 0.125 0.125]
 [0.75  0.25  0.25 ]
 [0.125 0.625 0.125]]
Rows after cleaning: ['0.625 0.125 0.625', '0.25  0.25  0.75', '0.75  0.25  0.75', '0.375 0.875 0.375', '0.875 0.875 0.375', '0.375 0.875 0.875', '0.875 0.875 0.875']
First 3 rows: [[0.625 0.125 0.625]
 [0.25  0.25  0.75 ]
 [0.75  0.25  0.75 ]]
Rows after cleaning: ['0.25 0.25 0.25', '0.25 0.75 0.25', '0.25 0.25 0.75', '0.75 0.75 0.25']
First 3 rows: [[0.25 0.25 0.25]
 [0.25 0.75 0.25]
 [0.25 0.25 0.75]]
Rows after cleaning: ['0.375 0.375 0.375', '0.75  0.25  0.75', '0.375 0.875 0.375', '0.875 0.875 0.375', '0.375 0.875 0.875']
First 3 rows: [[0.375 0.375 0.375]
 [0.75  0.25  0.75 ]
 [0.375 0.875 0.375]]
Rows after cleaning: ['0.125 0.125 0.125', '0.625 0.125 0.125', '0.75  0.25  0.25', '0.375 0.375 0.375', '0.625 0.625 0.125'

In [58]:
print(hydrogen_matrices_numpy.shape)
print(hydrogen_matrices_numpy[0].shape)
print(hydrogen_matrices_numpy[0])
print(hydrogen_matrices_numpy[0][0])
print(hydrogen_matrices_numpy[0][0].shape)

(1179,)
(8, 3)
[[0.125 0.125 0.125]
 [0.75  0.25  0.25 ]
 [0.125 0.625 0.125]
 [0.125 0.125 0.625]
 [0.75  0.75  0.25 ]
 [0.75  0.25  0.75 ]
 [0.125 0.625 0.625]
 [0.75  0.75  0.75 ]]
[0.125 0.125 0.125]
(3,)


In [59]:
import pandas as pd
import numpy as np

nitrogen_matrices = pd.read_csv("data_csv/nitrogen_values.csv")["Element_Matrix"]
nitrogen_matrices_numpy = []

for line in nitrogen_matrices:
    # Clean the string thoroughly
    line = line.strip().replace('"', '').replace('[', '').replace(']', '')
    # Split into rows (assuming rows are separated by newlines)
    rows = [row.strip() for row in line.split('\n') if row.strip()]
    print("Rows after cleaning:", rows)  # Debug output
    try:
        # Convert each row to a list of floats
        array = np.array([list(map(float, row.split())) for row in rows])
        print("First 3 rows:", array[0:3])
        nitrogen_matrices_numpy.append(array)
    except ValueError as e:
        print(f"Error processing line: {line}")
        print(f"Rows: {rows}")
        print(f"Error: {e}")
        raise  # Re-raise to stop and inspect

# Convert to NumPy array (use dtype=object if shapes vary)
nitrogen_matrices_numpy = np.array(nitrogen_matrices_numpy, dtype=object)
print("First 10 matrices:", nitrogen_matrices_numpy[0:10])

Rows after cleaning: ['0.625 0.125 0.125', '0.25  0.25  0.25', '0.625 0.625 0.125', '0.625 0.125 0.625', '0.25  0.75  0.25', '0.25  0.25  0.75', '0.625 0.625 0.625', '0.25  0.75  0.75']
First 3 rows: [[0.625 0.125 0.125]
 [0.25  0.25  0.25 ]
 [0.625 0.625 0.125]]
Rows after cleaning: ['0.625 0.125 0.125', '0.25  0.75  0.25', '0.75  0.75  0.25', '0.375 0.375 0.875', '0.625 0.625 0.625', '0.875 0.375 0.875', '0.25  0.75  0.75', '0.75  0.75  0.75']
First 3 rows: [[0.625 0.125 0.125]
 [0.25  0.75  0.25 ]
 [0.75  0.75  0.25 ]]
Rows after cleaning: ['0.125 0.125 0.125', '0.625 0.125 0.125']
First 3 rows: [[0.125 0.125 0.125]
 [0.625 0.125 0.125]]
Rows after cleaning: ['0.625 0.125 0.125', '0.125 0.125 0.625', '0.625 0.125 0.625', '0.25  0.75  0.25', '0.625 0.625 0.625']
First 3 rows: [[0.625 0.125 0.125]
 [0.125 0.125 0.625]
 [0.625 0.125 0.625]]
Rows after cleaning: ['0.125 0.125 0.625', '0.25  0.75  0.25', '0.875 0.875 0.375', '0.875 0.375 0.875', '0.75  0.75  0.75', '0.375 0.875 0.875']
F

In [60]:
print(nitrogen_matrices_numpy.shape)
print(nitrogen_matrices_numpy[0].shape)
print(nitrogen_matrices_numpy[0])
print(nitrogen_matrices_numpy[0][0])
print(nitrogen_matrices_numpy[0][0].shape)


(1179,)
(8, 3)
[[0.625 0.125 0.125]
 [0.25  0.25  0.25 ]
 [0.625 0.625 0.125]
 [0.625 0.125 0.625]
 [0.25  0.75  0.25 ]
 [0.25  0.25  0.75 ]
 [0.625 0.625 0.625]
 [0.25  0.75  0.75 ]]
[0.625 0.125 0.125]
(3,)


In [61]:
with open("data_used_pkl/hydrogen_matrices_numpy.pkl", "wb") as f:
    pickle.dump(hydrogen_matrices_numpy, f)
with open("data_used_pkl/nitrogen_matrices_numpy.pkl", "wb") as f:
    pickle.dump(nitrogen_matrices_numpy, f)

In [62]:
print(nitrogen_matrices_numpy[0].shape)
print(len(nitrogen_matrices_numpy[0]))

(8, 3)
8


In [63]:
def get_max_points(element_array):
    element_lengths = []
    for i in range(len(element_array)):
        element_matrix = element_array[i]
        element_lengths.append(len(element_matrix))
    max_points = max(element_lengths)
    return max_points

max_points_hydrogen = get_max_points(hydrogen_matrices_numpy)
print("Max points for hydrogen:", max_points_hydrogen)

Max points for hydrogen: 24
