In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import yaml
import json

import tensorflow as tf

from pathlib import Path
from pymatgen.core import Structure
from sklearn.model_selection import train_test_split
from megnet.models import MEGNetModel
from megnet.data.crystal import CrystalGraph

# Посмотрим на данные

In [5]:
def read_pymatgen_dict(file):
    with open(file, "r") as f:
        d = json.load(f)
    return Structure.from_dict(d)


def prepare_dataset(dataset_path):
    dataset_path = Path(dataset_path)
#     targets = pd.read_csv(dataset_path / "targets.csv", index_col=0)
    struct = {
        item.name.strip(".json"): read_pymatgen_dict(item)
        for item in (dataset_path / "structures").iterdir()
    }

    data = pd.DataFrame(columns=["structures"], index=struct.keys())
#     data = data.assign(structures=struct.values(), targets=targets)
    data = data.assign(structures=struct.values())

    return data

In [6]:
data = prepare_dataset('data/dichalcogenides_private/')
data

Unnamed: 0,structures
6141cf244e27a1844a5f0016,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...
614230b231cf3ef3d4a9f3a8,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...
6142201aee0a3fd43fb47eb5,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...
6141f69f4e27a1844a5f0490,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...
6142d21f4e27a1844a5f0940,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...
...,...
6141fd803ac25c70a5c6ca47,"[[-1.59515772 4.604824 3.719751 ] Mo, [-3..."
6142180cee0a3fd43fb47e89,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...
6141e5a63ac25c70a5c6c9ab,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...
6142463d4e27a1844a5f0722,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...


# Получение хим состава

In [7]:
from collections import defaultdict

def decompose(structure):
    result = defaultdict(int)
    for site in structure.sites:
        result[site.species.formula] += 1
    return result

In [8]:
data['decomposition'] = data.structures.apply(decompose)
data['len_of_decomposition'] = data['decomposition'].apply(len)

data.head()

Unnamed: 0,structures,decomposition,len_of_decomposition
6141cf244e27a1844a5f0016,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...,"{'Mo1': 63, 'W1': 1, 'Se1': 1, 'S1': 126}",4
614230b231cf3ef3d4a9f3a8,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...,"{'Mo1': 63, 'Se1': 1, 'S1': 126}",3
6142201aee0a3fd43fb47eb5,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...,"{'Mo1': 63, 'Se1': 2, 'S1': 126}",3
6141f69f4e27a1844a5f0490,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...,"{'Mo1': 63, 'W1': 1, 'Se1': 1, 'S1': 126}",4
6142d21f4e27a1844a5f0940,[[1.27612629e-07 1.84192955e+00 3.71975100e+00...,"{'Mo1': 63, 'W1': 1, 'Se1': 1, 'S1': 126}",4


In [9]:
data['decomposition'].value_counts()

TypeError: unhashable type: 'collections.defaultdict'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'collections.defaultdict'


{'Mo1': 63, 'Se1': 1, 'S1': 126}             713
{'Mo1': 63, 'W1': 1, 'Se1': 1, 'S1': 126}    699
{'Mo1': 63, 'W1': 1, 'Se1': 2, 'S1': 126}    383
{'Mo1': 63, 'S1': 126}                       380
{'Mo1': 63, 'Se1': 2, 'S1': 126}             366
{'Mo1': 63, 'W1': 1, 'S1': 126}              363
{'Mo1': 64, 'Se1': 1, 'S1': 126}              11
{'Mo1': 64, 'Se1': 2, 'S1': 126}              11
{'Mo1': 63, 'W1': 1, 'Se1': 1, 'S1': 127}     10
{'Mo1': 63, 'Se1': 1, 'S1': 127}               8
{'Mo1': 63, 'W1': 1, 'S1': 127}                7
{'Mo1': 63, 'S1': 127}                         7
{'Mo1': 64, 'S1': 126}                         7
{'Mo1': 64, 'Se1': 1, 'S1': 127}               1
{'Mo1': 64, 'S1': 127}                         1
Name: decomposition, dtype: int64

In [10]:
data['len_of_decomposition'].value_counts()

3    1480
4    1092
2     395
Name: len_of_decomposition, dtype: int64