In [None]:
!pip install pymatgen
!pip install megnet

In [None]:
# Import packages

import yaml
import json

import pandas as pd
import numpy as np
import tensorflow as tf

from pathlib import Path
from pymatgen.core import Structure
from sklearn.model_selection import train_test_split
from megnet.models import MEGNetModel
from megnet.data.crystal import CrystalGraph
from megnet.data.graph import GaussianDistance

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Helper Functions

In [None]:
def read_pymatgen_dict(file):
    """Build a pymatgen ``Structure`` from a JSON file.

    Args:
        file: Path of a JSON file; its parsed content is handed to
            ``Structure.from_dict``.

    Returns:
        The object returned by ``Structure.from_dict`` for the parsed JSON.
    """
    with open(file, "r") as handle:
        payload = json.load(handle)

    structure = Structure.from_dict(payload)
    return structure

def energy_within_threshold(prediction, target):
    """Fraction of elements whose absolute error is below 0.02.

    The absolute difference between ``target`` and ``prediction`` is taken
    elementwise, the entries strictly below the threshold are counted, and
    that count is divided by the total number of elements in ``target``.

    Args:
        prediction: Tensor of predicted values.
        target: Tensor of reference values.

    Returns:
        The ratio ``count(|target - prediction| < 0.02) / size(target)``.
    """
    threshold = 0.02

    # Boolean mask of the entries that fall inside the tolerance.
    within_tolerance = tf.math.abs(target - prediction) < threshold
    n_within = tf.math.count_nonzero(within_tolerance)

    # tf.size is cast to int64 before the division, as count_nonzero's
    # result is divided by it directly.
    n_total = tf.cast(tf.size(target), tf.int64)
    return n_within / n_total


def prepare_dataset(dataset_path):
    """Load structures and targets from ``dataset_path`` and split them.

    Reads ``targets.csv`` (its first column becomes the index) and every
    ``*.json`` file found directly inside the ``structures`` sub-directory.
    Each structure is keyed by its file name without the extension, and the
    targets are attached by index alignment on that key.

    Args:
        dataset_path: Directory (str or path-like) that contains
            ``targets.csv`` and a ``structures/`` folder.

    Returns:
        The ``(train, test)`` pair returned by ``train_test_split`` with
        ``test_size=0.33`` and ``random_state=666``. The frame that is split
        has the columns ``structures`` and ``targets``.
    """
    dataset_path = Path(dataset_path)
    targets = pd.read_csv(dataset_path / "targets.csv", index_col=0)

    # Use ``Path.stem`` to drop the extension. ``str.strip(".json")`` removes
    # any of the characters '.', 'j', 's', 'o', 'n' from BOTH ends of the
    # name, which corrupts ids that start or end with those letters and makes
    # them miss their row in ``targets`` (silently yielding NaN targets).
    # Entries that are not ``.json`` files (e.g. ``.ipynb_checkpoints``) are
    # skipped instead of being fed to the JSON parser.
    struct = {
        item.stem: read_pymatgen_dict(item)
        for item in (dataset_path / "structures").iterdir()
        if item.suffix == ".json"
    }

    data = pd.DataFrame(columns=["structures"], index=struct.keys())
    # ``targets`` is aligned on the index, i.e. on the structure ids.
    data = data.assign(structures=struct.values(), targets=targets)

    return train_test_split(data, test_size=0.33, random_state=666)

 
def prepare_model(cutoff, lr):
    """Create the ``MEGNetModel`` used for training and inference.

    Args:
        cutoff: Radius passed to ``CrystalGraph``. The Gaussian centers span
            ``[0, cutoff + 1]`` in 200 evenly spaced steps.
        lr: Learning rate forwarded to ``MEGNetModel``.

    Returns:
        A ``MEGNetModel`` configured with MAE loss and the
        ``energy_within_threshold`` metric.
    """
    n_centers = 200
    bond_centers = np.linspace(0, cutoff + 1, n_centers)

    model_options = dict(
        graph_converter=CrystalGraph(cutoff=cutoff),
        centers=bond_centers,
        width=0.5,
        embedding_dim=32,
        nblocks=5,
        loss=["MAE"],
        npass=2,
        lr=lr,
        metrics=energy_within_threshold,
    )
    return MEGNetModel(**model_options)


# Google Drive folder used by main(): passed to model.train() as `dirname`
# and used as the prefix of the `prev_model` checkpoint path.
callbackFolder = "/content/drive/MyDrive/Project/International-Data-Analysis-Olympiad-2022/Student Division/callback"
def main(config):
    """Train the model described by ``config``.

    Args:
        config: Mapping with a ``datapath`` entry (handed to
            ``prepare_dataset``) and a ``model`` sub-mapping providing
            ``cutoff``, ``lr``, ``epochs`` and ``batch_size``.

    Training resumes from the ``val_mae_01431_0.035633.hdf5`` checkpoint in
    ``callbackFolder`` and uses that same folder as ``dirname``.
    """
    train_set, valid_set = prepare_dataset(config["datapath"])

    hyper = config["model"]
    model = prepare_model(float(hyper["cutoff"]), float(hyper["lr"]))

    fit_options = dict(
        validation_structures=valid_set.structures,
        validation_targets=valid_set.targets,
        epochs=int(hyper["epochs"]),
        batch_size=int(hyper["batch_size"]),
        prev_model=callbackFolder + "/val_mae_01431_0.035633.hdf5",
        save_checkpoint=True,
        dirname=callbackFolder,
    )
    model.train(train_set.structures, train_set.targets, **fit_options)

In [None]:
# Parse the YAML configuration file stored under the Drive mount into `config`.
with open("/content/drive/MyDrive/Project/International-Data-Analysis-Olympiad-2022/Student Division/config.yaml") as file:
    config = yaml.safe_load(file)

In [None]:
# Run training with the configuration loaded from config.yaml.
main(config)

In [None]:
def sub(config):
    """Predict on the test structures and write ``./submission.csv``.

    Builds the model from ``config["model"]`` (``cutoff`` and ``lr``), loads
    the weights at ``config["checkpoint_path"]``, then predicts for every
    ``*.json`` structure found in ``config["test_datapath"]/structures``. The
    CSV has an ``id`` index column (file name without extension) and a
    ``predictions`` column.

    Args:
        config: Mapping providing ``model``, ``checkpoint_path`` and
            ``test_datapath``.
    """
    model = prepare_model(
        float(config["model"]["cutoff"]), float(config["model"]["lr"])
    )
    model.load_weights(config['checkpoint_path'])

    dataset_path = Path(config['test_datapath'])
    # Use ``Path.stem`` to drop the extension. ``str.strip('.json')`` removes
    # any of the characters '.', 'j', 's', 'o', 'n' from BOTH ends of the
    # name, which would write corrupted ids into the submission file.
    # Entries that are not ``.json`` files (e.g. ``.ipynb_checkpoints``) are
    # skipped instead of being fed to the JSON parser.
    struct = {
        item.stem: read_pymatgen_dict(item)
        for item in (dataset_path / 'structures').iterdir()
        if item.suffix == '.json'
    }

    # The ids live in the index; only ``predictions`` is written out below,
    # so no separate (empty) ``id`` column is created.
    private_test = pd.DataFrame(columns=['structures'], index=struct.keys())
    private_test = private_test.assign(structures=struct.values())
    private_test = private_test.assign(
        predictions=model.predict_structures(private_test.structures)
    )
    private_test[['predictions']].to_csv('./submission.csv', index_label='id')
# Reload the YAML configuration, then generate the submission file with sub().
with open("/content/drive/MyDrive/Project/International-Data-Analysis-Olympiad-2022/Student Division/config.yaml") as file:
    config = yaml.safe_load(file)
sub(config)