In [1]:
import json


In [2]:
# Load the raw fine-tuning records into `data`.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the read
# does not depend on the platform's locale default.
with open('../data/finetune_data.json', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Pick a single record by its key and list the fields it carries.
sample_key = 'mp-644256'
mat = data[sample_key]
mat.keys()

dict_keys(['incar', 'kpoints', 'poscar', 'robocrys', 'structure'])

In [4]:
# Show the raw text stored under the 'poscar' field of the sample record.
mat['poscar']

'Li4 H8 C4 O12\n1.0\n   4.9024999999999999    0.0000000000000000    0.0000000000000000\n   0.0000000000000000    6.5645600000000002    0.0000000000000000\n   0.0000000000000000    0.0000000000000000    9.9938570000000002\nLi H C O\n4 8 4 12\ndirect\n   0.2413980000000000    0.3816140000000000    0.4506720000000000 Li\n   0.7413980000000000    0.6183860000000000    0.5493280000000000 Li\n   0.7413980000000000    0.8816140000000000    0.0493280000000000 Li\n   0.2413980000000000    0.1183860000000000    0.9506720000000000 Li\n   0.6417090000000000    0.3318130000000000    0.2149830000000000 H\n   0.1417090000000000    0.6681870000000000    0.7850170000000000 H\n   0.1417090000000000    0.8318130000000000    0.2850170000000000 H\n   0.6417090000000000    0.1681870000000000    0.7149830000000000 H\n   0.1258510000000000    0.4014190000000000    0.1291760000000000 H\n   0.6258510000000000    0.5985810000000000    0.8708240000000000 H\n   0.6258510000000000    0.9014190000000000    0.3708240

In [5]:
from tqdm import tqdm

In [6]:
# Turn every record into an instruction-tuning sample: the 'robocrys' text is
# the model input and the 'poscar' text is the expected output. The record
# keys are not needed here, so only the values are iterated.
res = [
    {
        'instruction': 'Generate the POSCAR file for the given crystal structure.',
        'input': record['robocrys'],
        'output': record['poscar'],
    }
    for record in tqdm(data.values())
]


100%|██████████| 1523/1523 [00:00<00:00, 206702.21it/s]


In [10]:
# Write the POSCAR text of the first sample to disk.
first_poscar = res[0]['output']
with open('test_POSCAR', 'w') as poscar_file:
    poscar_file.write(first_poscar)


In [26]:
import random

In [28]:
instruction = "Generate the poscar file based on the description of a material. Here are some examples you can learn. In each example, <Example> denote this is an example, [Description] denotes the following texts are the description of a material, [POSCAR] denotes the following texts are the corresponding poscar file. <Question> denotes this is a question, and the subsequent texts are the description of a material. You need to generate the corresponding poscar file."

# Number of few-shot demonstrations placed in the prompt.
N_EXAMPLES = 14

# Draw the demonstrations and the question in one call: random.sample picks
# without replacement, so the question can never repeat a demonstration.
# No seed is set in this cell, so each run produces a different prompt.
# Raises ValueError if `res` holds fewer than N_EXAMPLES + 1 records.
index = random.sample(range(len(res)), N_EXAMPLES + 1)

# Each demonstration is rendered as:
# <Example>
# [Description]
# the description from res[i]['input']
# [POSCAR]
# the poscar from res[i]['output']
examples = [
    "<Example>\n[Description]\n" + res[i]['input'] + "\n[POSCAR]\n" + res[i]['output']
    for i in index[:-1]
]

# The last sampled record is the question; only its description is shown,
# the matching POSCAR is what the model is expected to produce.
question = "<Question>\n[Description]\n"
question += res[index[-1]]['input']

prompt = instruction + '\n' + '\n'.join(examples) + '\n' + question

prompt

'Generate the poscar file based on the description of a material. Here are some examples you can learn. In each example, <Example> denote this is an example, [Description] denotes the following texts are the description of a material, [POSCAR] denotes the following texts are the corresponding poscar file. <Question> denotes this is a question, and the subsequent texts are the description of a material. You need to generate the corresponding poscar file.\n<Example>\n[Description]\nLi₂Al₂Si₄HO₁₄ crystallizes in the triclinic P1 space group. There are two inequivalent Li sites. In the first Li site, Li is bonded to four O atoms to form LiO₄ tetrahedra that share corners with three AlO₄ tetrahedra and corners with three SiO₄ tetrahedra. There are a spread of Li-O bond distances ranging from 1.94-2.08 Å. In the second Li site, Li is bonded to four O atoms to form LiO₄ tetrahedra that share corners with three AlO₄ tetrahedra and corners with three SiO₄ tetrahedra. There are a spread of Li-O 

In [29]:
# Persist the assembled prompt. The descriptions contain non-ASCII characters
# (subscript digits, superscript charges, "Å"), so the encoding is set
# explicitly; a locale-default codec may be unable to encode them.
with open('prompt', 'w', encoding='utf-8') as f:
    f.write(prompt)


In [30]:
# Save the POSCAR belonging to the question record (the last sampled index),
# replacing whatever 'test_POSCAR' held before.
question_record = res[index[-1]]
with open('test_POSCAR', 'w') as poscar_file:
    poscar_file.write(question_record['output'])

In [31]:
# Display the description of the record used as the prompt's question.
res[index[-1]]['input']

'LiHSeO₃ crystallizes in the orthorhombic P2_12_12₁ space group. Li¹⁺ is bonded to four O²⁻ atoms to form corner-sharing LiO₄ tetrahedra. There are a spread of Li-O bond distances ranging from 1.96-2.05 Å. H¹⁺ is bonded in a distorted linear geometry to two O²⁻ atoms. There is one shorter (1.02 Å) and one longer (1.60 Å) H-O bond length. Se⁴⁺ is bonded in a trigonal non-coplanar geometry to three O²⁻ atoms. There are a spread of Se-O bond distances ranging from 1.69-1.84 Å. There are three inequivalent O²⁻ sites. In the first O²⁻ site, O²⁻ is bonded in a trigonal planar geometry to one Li¹⁺, one H¹⁺, and one Se⁴⁺ atom. In the second O²⁻ site, O²⁻ is bonded in a trigonal planar geometry to two equivalent Li¹⁺ and one Se⁴⁺ atom. In the third O²⁻ site, O²⁻ is bonded in a 3-coordinate geometry to one Li¹⁺, one H¹⁺, and one Se⁴⁺ atom.'

In [17]:
# train test split
from sklearn.model_selection import train_test_split

In [25]:
# Hold out 10% of the samples for validation (a 9:1 train/val split; no test
# split is produced here). The fixed random_state keeps the partition
# reproducible across runs.
train, val = train_test_split(res, test_size=0.1, random_state=42)

# Sanity check: every sample must land in exactly one of the two splits.
assert len(train) + len(val) == len(res)

len(train), len(val)

(1370, 153)

In [26]:
# Write each split to its own JSON file, training split first.
for split_path, split_records in (
    ('../data/poscar_train.json', train),
    ('../data/poscar_val.json', val),
):
    with open(split_path, 'w') as split_file:
        json.dump(split_records, split_file)

In [19]:
# Gather the dataset splits into one mapping for export.
annotation = {
    'train': train,
    'validation': val,
}
# The split cell only creates `train` and `val`, so referencing `test`
# unconditionally raises NameError on a fresh kernel. Include a test split
# only when one has actually been defined in the notebook namespace.
if 'test' in globals():
    annotation['test'] = test

In [20]:
# Display the records stored under the 'train' key of the annotation mapping.
# NOTE(review): this renders the entire list; consider slicing before sharing.
annotation['train']

[{'instruction': 'Generate the POSCAR file for the given crystal structure.',
  'input': 'LiGa(SeO₃)₂ crystallizes in the tetragonal I̅42d space group. Li¹⁺ is bonded in a 4-coordinate geometry to four O²⁻ atoms. There are two shorter (2.01 Å) and two longer (2.03 Å) Li-O bond lengths. Ga³⁺ is bonded in an octahedral geometry to six O²⁻ atoms. There are a spread of Ga-O bond distances ranging from 1.99-2.03 Å. Se⁴⁺ is bonded in a distorted trigonal non-coplanar geometry to three O²⁻ atoms. There are a spread of Se-O bond distances ranging from 1.72-1.75 Å. There are three inequivalent O²⁻ sites. In the first O²⁻ site, O²⁻ is bonded in a 3-coordinate geometry to one Li¹⁺, one Ga³⁺, and one Se⁴⁺ atom. In the second O²⁻ site, O²⁻ is bonded in a trigonal non-coplanar geometry to one Li¹⁺, one Ga³⁺, and one Se⁴⁺ atom. In the third O²⁻ site, O²⁻ is bonded in a bent 120 degrees geometry to one Ga³⁺ and one Se⁴⁺ atom.',
  'output': 'Li4 Ga4 Se8 O24\n1.0\n  -5.3669739999999999    5.366973999999

In [21]:
# Serialize the combined split mapping to a single JSON file.
annotation_path = '../data/finetune_annotation.json'
with open(annotation_path, 'w') as annotation_file:
    json.dump(annotation, annotation_file)