In [1]:
from pymatgen import core
import numpy as np
import sys
from pymatgen.transformations.standard_transformations import OrderDisorderedStructureTransformation
from pymatgen.transformations.standard_transformations import PartialRemoveSpecieTransformation
from pymatgen.io.cif import CifParser
from scipy.spatial.distance import cdist
import pandas as pd
import matplotlib.pyplot as plt
import os
import scipy
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [2]:
import pandas as pd
# Load the table used by the exploratory cells below; those cells read its
# 'sg_number', 'codid' and 'atom_counts' columns.
df = pd.read_csv(filepath_or_buffer='output_core_test.csv')

In [None]:
# Number of non-null 'codid' entries recorded under each space group number.
grouped = df.groupby(by='sg_number')['codid'].count()

# Bar chart of those counts, drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
grouped.plot.bar(ax=ax)
ax.set_title('Distribution of codid between different space group numbers')
ax.set_xlabel('Space Group Number')
ax.set_ylabel('Count of codid')
ax.set_ylim(0, 25)  # fixed y-range: any bar taller than 25 is cut off at the top
plt.show()

In [None]:
# Preview the first five rows of the loaded table.
df.head(n=5)

In [None]:
# Show the 'atom_counts' column on its own.
df.loc[:, 'atom_counts']

In [None]:
# Space-group number ranges (upper bound exclusive) paired with the name
# assigned to every number inside that range. The dictionary is keyed by the
# range objects themselves, so lookups iterate over the keys.
bravais_lattice_mapping = dict([
    (range(1, 3), 'Triclinic'),
    (range(3, 16), 'Monoclinic'),
    (range(16, 75), 'Orthorhombic'),
    (range(75, 143), 'Tetragonal'),
    (range(143, 168), 'Trigonal'),
    (range(168, 195), 'Hexagonal'),
    (range(195, 231), 'Cubic'),
])

# Translate one space group number into the name stored in the mapping
def map_to_bravais(sg_number):
    """Return the name whose range in ``bravais_lattice_mapping`` contains ``sg_number``.

    The ranges are tested in the dictionary's insertion order and the first
    match wins. ``'Unknown'`` is returned when no range contains the value.
    """
    return next(
        (
            lattice_name
            for number_range, lattice_name in bravais_lattice_mapping.items()
            if sg_number in number_range
        ),
        'Unknown',
    )

# Add a new 'sg_bravais' column holding the mapped name for each row's space
# group number. The 'sg_number' column itself is left unchanged.
df['sg_bravais'] = df['sg_number'].map(map_to_bravais)

In [None]:
# Number of non-null 'codid' entries per 'sg_bravais' category.
grouped_bravais = df.groupby('sg_bravais')['codid'].count()

# Plot the distribution
plt.figure(figsize=(10, 6))
grouped_bravais.plot(kind='bar')
# The labels name the actual grouping key: this chart has one bar per
# 'sg_bravais' category, not one per space group number.
plt.title('Distribution of codid between different crystal systems')
plt.xlabel('Crystal system (sg_bravais)')
plt.ylabel('Count of codid')
# The y-axis is left on automatic limits; the former argument-less plt.ylim()
# call only read the limits back and changed nothing, so it was removed.
plt.show()

In [None]:
# Preview the table again, now that the 'sg_bravais' column has been added.
df.head(n=5)

In [None]:
import pandas as pd
import h5py
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Mapping from a range of space group numbers to the name used for that range.
# Written here as (first, last, name) with both bounds inclusive.
# NOTE(review): an earlier cell already defines this same mapping; this cell
# rebinds the name to an equal dictionary.
bravais_lattice_mapping = {
    range(first_number, last_number + 1): lattice_name
    for first_number, last_number, lattice_name in (
        (1, 2, 'Triclinic'),
        (3, 15, 'Monoclinic'),
        (16, 74, 'Orthorhombic'),
        (75, 142, 'Tetragonal'),
        (143, 167, 'Trigonal'),
        (168, 194, 'Hexagonal'),
        (195, 230, 'Cubic'),
    )
}

# Look up the name for a single space group number.
# NOTE(review): an earlier cell defines a function with this same name; this
# definition replaces it when the cell runs.
def map_to_bravais(sg_number):
    """Return the mapped name for ``sg_number``, or ``'Unknown'``.

    Walks ``bravais_lattice_mapping`` in insertion order and returns the value
    of the first key range that contains ``sg_number``.
    """
    for number_range, lattice_name in bravais_lattice_mapping.items():
        if sg_number in number_range:
            return lattice_name

    # No range matched the given number
    return 'Unknown'

def preprocess_dataframe(csv_file):
    """Load the CSV and derive the columns used by the later stages.

    Steps, in order:
      1. read ``csv_file`` with pandas;
      2. add 'sg_bravais' by applying ``map_to_bravais`` to 'sg_number';
      3. drop the 'reduced_formula' and 'sg_symbol' columns;
      4. for each of the three lattice-angle columns, append '<name>_sin' and
         '<name>_cos' (the angle is converted from degrees to radians first)
         and remove the original angle column.

    Returns the resulting DataFrame.
    """
    angle_columns = (
        'lattice_angle_alpha',
        'lattice_angle_beta',
        'lattice_angle_gamma',
    )

    table = pd.read_csv(csv_file)
    table['sg_bravais'] = table['sg_number'].apply(map_to_bravais)
    table = table.drop(columns=['reduced_formula', 'sg_symbol'])

    for angle_name in angle_columns:
        angle_in_radians = np.deg2rad(table[angle_name])
        table[f'{angle_name}_sin'] = np.sin(angle_in_radians)
        table[f'{angle_name}_cos'] = np.cos(angle_in_radians)

    # Only the sine/cosine encodings are kept; the raw angles are removed
    return table.drop(columns=list(angle_columns))

def one_hot_encode_bravais(dataframe):
    """Replace the 'sg_bravais' column with one indicator column per value.

    The indicator columns come from ``pd.get_dummies`` and are named after the
    values found in 'sg_bravais', so only values present in the data get a
    column. The input frame is not modified; a new frame is returned.
    """
    indicator_columns = pd.get_dummies(dataframe['sg_bravais'])
    without_category = dataframe.drop(columns='sg_bravais')
    return without_category.join(indicator_columns)

def normalize_dataframe(dataframe):
    """Rescale every column except 'codid' with a freshly fitted ``MinMaxScaler``.

    The scaled values are written back into ``dataframe`` itself, so the
    caller's frame is modified in place; that same object is returned.
    """
    value_columns = dataframe.columns.difference(['codid'])
    scaled_values = MinMaxScaler().fit_transform(dataframe[value_columns])
    dataframe[value_columns] = scaled_values
    return dataframe

def normalize_hdf5(hdf5_file):
    """Write a rescaled copy of every dataset in ``hdf5_file`` to a new file.

    The output path is ``hdf5_file`` with '.hdf5' replaced by
    '_normalized.hdf5'. Each dataset is read fully, passed through
    ``MinMaxScaler.fit_transform`` and, if the result is a square 2-D matrix,
    reduced to its diagonal and upper triangle (everything below the diagonal
    becomes 0). The dataset is then stored under the same key.

    Returns the path of the file that was written.

    NOTE(review): MinMaxScaler presumably scales each column independently, so
    a symmetric input matrix would not stay symmetric before the lower
    triangle is cleared. Confirm this is the intended normalization.
    """
    scaler = MinMaxScaler()
    new_hdf5_file = hdf5_file.replace('.hdf5', '_normalized.hdf5')

    with h5py.File(hdf5_file, 'r') as source, h5py.File(new_hdf5_file, 'w') as target:
        for dataset_name in tqdm(source.keys()):
            # Rescale the adjacency matrix to the 0..1 range; the scaler is
            # refitted on every dataset
            scaled = scaler.fit_transform(source[dataset_name][:])

            # For a square matrix, keep the diagonal and everything above it
            # and zero out everything below the diagonal
            if scaled.ndim == 2 and scaled.shape[0] == scaled.shape[1]:
                scaled = np.triu(scaled)

            # Store the result in the new HDF5 file under the same key
            target.create_dataset(dataset_name, data=scaled)

    return new_hdf5_file

In [None]:
# Run the three tabular stages in order on 'output.csv':
# load/derive columns -> one-hot encode 'sg_bravais' -> min-max scale.
df_temp = (
    preprocess_dataframe('output.csv')
    .pipe(one_hot_encode_bravais)
    .pipe(normalize_dataframe)
)

In [None]:
# Preview the first five rows of the fully preprocessed table.
df_temp.head(n=5)

In [None]:
class Preprocessor:
    """Prepare the tabular data and the normalized HDF5 file for one dataset.

    Attributes set by the constructor:
        csv_file: path passed to ``preprocess_dataframe``.
        hdf5_file_path: path of the source (un-normalized) HDF5 file.

    Attributes set by ``preprocess``:
        dataframe: the preprocessed, one-hot encoded and normalized table.
        hdf5_file: an open, read-only ``h5py.File`` on the normalized file.
    """

    def __init__(self, csv_file, hdf5_file):
        self.csv_file = csv_file
        self.hdf5_file_path = hdf5_file

    def preprocess(self):
        """Run the tabular pipeline and open the normalized HDF5 file.

        The normalized file is expected next to the source file, with '.hdf5'
        replaced by '_normalized.hdf5'. It is created with ``normalize_hdf5``
        only when it does not exist yet, so the expensive normalization is
        not repeated on later runs.

        Returns:
            tuple: ``(dataframe, hdf5_file)``. ``hdf5_file`` is always an open
            read-only ``h5py.File``, whether the normalized file was just
            created or already existed. The handle is not closed here;
            closing it is left to the caller.
        """
        self.dataframe = preprocess_dataframe(self.csv_file)
        self.dataframe = one_hot_encode_bravais(self.dataframe)
        self.dataframe = normalize_dataframe(self.dataframe)

        normalized_hdf5_file_path = self.hdf5_file_path.replace('.hdf5', '_normalized.hdf5')
        if not os.path.exists(normalized_hdf5_file_path):
            # normalize_hdf5 returns the PATH of the file it wrote, not a file
            # object, so the result still has to be opened below.
            normalized_hdf5_file_path = normalize_hdf5(self.hdf5_file_path)

        # Open the file in both cases so callers always get the same type back
        self.hdf5_file = h5py.File(normalized_hdf5_file_path, 'r')

        return self.dataframe, self.hdf5_file

In [None]:
import h5py
import pandas as pd
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Dataset yielding ``(adjacency_matrix, features, labels, codid)`` per row.

    The table and the HDF5 handle both come from ``Preprocessor.preprocess``.
    Each row of the table is matched to an HDF5 dataset through its 'codid'.
    """

    # Label columns, in the fixed order used for every label vector.
    BRAVAIS_LATTICES = [
        'Triclinic', 'Monoclinic', 'Orthorhombic',
        'Tetragonal', 'Trigonal', 'Hexagonal', 'Cubic',
    ]

    def __init__(self, csv_file, hdf5_file):
        preprocessor = Preprocessor(csv_file, hdf5_file)
        self.dataframe, self.hdf5_file = preprocessor.preprocess()
        self._split_feature_and_label_frames()

    def _split_feature_and_label_frames(self):
        """Split ``self.dataframe`` once into a feature frame and a label frame.

        Doing this once avoids copying the whole table on every
        ``__getitem__`` call. One-hot columns only exist for the categories
        present in the data, so any missing label column is added as all
        zeros and the label vector always has the same length and order.
        """
        self._feature_frame = self.dataframe.drop(
            columns=self.BRAVAIS_LATTICES, errors='ignore'
        )
        self._label_frame = self.dataframe.reindex(
            columns=self.BRAVAIS_LATTICES, fill_value=0.0
        )

    def __len__(self):
        """Return the number of rows in the table."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Returns:
            tuple: ``(adjacency_matrix, features, labels, codid)`` where
            ``adjacency_matrix`` is the HDF5 dataset stored under
            ``'/' + codid``, ``features`` holds every non-label column of the
            row, ``labels`` holds the columns listed in ``BRAVAIS_LATTICES``
            and ``codid`` is the row's identifier as a string.

        NOTE(review): 'codid' itself stays in ``features`` because only the
        label columns are removed. Confirm an identifier is wanted there.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # The HDF5 keys are the integer codid written as a string
        codid = str(int(self.dataframe['codid'].iloc[idx]))

        # Adjacency matrix stored for this codid
        adjacency_matrix = torch.from_numpy(self.hdf5_file['/' + codid][:])

        # Everything except the label columns
        features = torch.from_numpy(self._feature_frame.iloc[idx].values)

        # One-hot label vector in BRAVAIS_LATTICES order
        labels = torch.from_numpy(self._label_frame.iloc[idx].values)

        return adjacency_matrix, features, labels, codid

In [None]:
# Build the dataset from 'output_core.csv' and 'output_aux.hdf5', then print
# the sample stored at index 1 as a quick smoke test.
dataset = CustomDataset(csv_file='output_core.csv', hdf5_file='output_aux.hdf5')
second_sample = dataset[1]
print(second_sample)