## Introduction
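Where did the data come from? The list of half-Heusler compounds is read from `assets/HalfHeusler.csv`; formation energies and lattice constants are fetched from the Materials Project through `pymatgen`.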

Each material ID must be prefixed with "mp-" (for example, `2894` becomes `mp-2894`). The cell below performs this conversion.

In [1]:
# Disabled alternative: build the formatted CSV through the project helper
# `utils.editdf.EditFile` instead of inline pandas code. It targets the same
# two files as the next cell; presumably superseded by that cell (TODO confirm
# and delete if no longer needed).
# import os
# import sys
# module_path = os.path.abspath(os.path.join(''))
# if module_path not in sys.path:
#     sys.path.append(module_path)

# print (module_path)

# from utils.editdf import EditFile 

# ed = EditFile()
# ed.generateFile('assets/HalfHeusler.csv', 'assets/Heusler compound.csv')


In [2]:
import os.path
import pandas as pd

path_to_file = 'assets/HalfHeusler.csv'
file_name = 'assets/Heusler compound.csv'

# The formatted file acts as a cache: it is produced once and reused afterwards.
if not os.path.exists(file_name):
    print("A formatted csv file is produced\n")
    df = pd.read_csv(path_to_file)
    print(df.head())

    raw_ids = df['Materials-ID'].to_list()

    # Remove the first column, then add it back under the same label holding
    # the "mp-"-prefixed ids; the re-added column ends up last in the frame.
    first_column = df.columns[0]
    df = df.drop(columns=first_column)
    df[first_column] = [f"mp-{raw_id}" for raw_id in raw_ids]
    print(df.head())

    df.to_csv(file_name, index=False)

A formatted csv file is produced

   Materials-ID Chemical-Fomula 4c-site 4a-site 4b-site
0          2894          ScSnAu      Au      Sc      Sn
1          3161          LiAlSi      Si      Li      Al
2          3432          ScNiSb      Ni      Sc      Sb
3          3462          TmSnAu      Au      Tm      Sn
4          3522          MgCuSb      Cu      Mg      Sb
  Chemical-Fomula 4c-site 4a-site 4b-site Materials-ID
0          ScSnAu      Au      Sc      Sn      mp-2894
1          LiAlSi      Si      Li      Al      mp-3161
2          ScNiSb      Ni      Sc      Sb      mp-3432
3          TmSnAu      Au      Tm      Sn      mp-3462
4          MgCuSb      Cu      Mg      Sb      mp-3522


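- Lattice parameters, atomic radii and atomic masses. Note: the code reads `Element.atomic_radius`, which pymatgen documents as the empirical radius; the calculated radius is available separately as `Element.atomic_radius_calculated`.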
- Python library 'pymatgen'

In [3]:
'''
The Element class is located in the core subpackage inside the periodic_table module. 
The link to the API documentation is below.

    https://pymatgen.org/pymatgen.core.periodic_table.html#pymatgen.core.periodic_table.Element

Similarly, the Materials Project APIs are hosted in the following module.

    https://pymatgen.org/pymatgen.ext.matproj.html?highlight=mprester#module-pymatgen.ext.matproj
'''

import pymatgen.core as pg
from pymatgen.ext.matproj import MPRester
from math import sqrt
import sys

file_name_train = 'assets/Training data.csv'


def _site_features(x1, x2, x3, m1, m2, m3):
    """Return the 54 engineered descriptors (columns x1..x54) of one compound.

    Parameters
    ----------
    x1, x2, x3
        Atomic radii of the 4a-, 4b- and 4c-site elements.
    m1, m2, m3
        Atomic masses of the same elements, in the same order.

    Returns
    -------
    tuple
        54 values, ordered as the columns x1..x54 of the training table.
    """
    # Intermediate names carry the number of the column they end up in.
    x29 = m1 + m2 + m3
    x30 = x1 + x2 + x3
    x33 = (x29 / 3 - m1)
    x34 = (x29 / 3 - m2)
    x35 = (x29 / 3 - m3)
    x42 = (x30 / 3 - x1)
    x43 = (x30 / 3 - x2)
    x44 = (x30 / 3 - x3)
    x51 = (x1**2 + x2**2)
    x52 = (x1**2 + x3**2)

    return (
        x1, x2, x3, m1, m2, m3, m1**2, m2**2, m3**2, x1**2, x2**2, x3**2,
        m1**3, m2**3, m3**3, x1**3, x2**3, x3**3,
        sqrt(m1), sqrt(m2), sqrt(m3), sqrt(x1), sqrt(x2), sqrt(x3),
        # NOTE(review): x26 divides a radius by a mass (x3/m1) while its
        # neighbours are m2/m1, x2/x1 and x3/x1 -- possibly meant m3/m1.
        # Left unchanged so regenerated data matches the existing CSV; confirm.
        m2 / m1, x3 / m1, x2 / x1, x3 / x1, x29, x30,
        ((m1**2 + m2**2 + m3**2) / 3.)**2, ((x1**2 + x2**2 + x3**2) / 3.)**2,
        x33, x34, x35, abs(x33), abs(x34), abs(x35), x33**2, x34**2, x35**2,
        x42, x43, x44, abs(x42), abs(x43), abs(x44), x42**2, x43**2, x44**2,
        x51, x52, sqrt(x51), sqrt(x52),
    )


# The training table is cached on disk; the (slow, networked) generation below
# only runs when the file does not exist yet.
if not os.path.exists(file_name_train):
    print("Training data will be generated\n")

    heusler_df = pd.read_csv(
        file_name, header=0,
        usecols=['Materials-ID', '4a-site', '4b-site', '4c-site'])
    data = []

    # SECURITY: the API key is no longer hard-coded in the notebook. Export it
    # as the PMG_MAPI_KEY environment variable before running. When the
    # variable is unset, None is passed and pymatgen falls back to its own
    # configuration (PMG_MAPI_KEY in ~/.pmgrc.yaml). The key that used to be
    # committed here should be treated as leaked and regenerated.
    with MPRester(os.environ.get('PMG_MAPI_KEY')) as m:
        # Iterate the four columns in lockstep instead of chained
        # `df[col][idx]` look-ups.
        for material_id, site_4a, site_4b, site_4c in zip(
                heusler_df['Materials-ID'], heusler_df['4a-site'],
                heusler_df['4b-site'], heusler_df['4c-site']):
            mat_data = m.get_data(material_id)
            if not mat_data:
                # Fail with the offending id rather than a bare IndexError.
                raise ValueError(
                    "No Materials Project data returned for " + str(material_id))
            lat = m.get_structure_by_material_id(material_id)
            lat_const = lat.lattice.abc

            # The total magnetic moment column is disabled (see the commented
            # column list below), so site_properties['magmom'] is not read
            # here any more; it was unused and raises KeyError when absent.
            e1 = pg.Element(site_4a)
            e2 = pg.Element(site_4b)
            e3 = pg.Element(site_4c)

            features = _site_features(
                e1.atomic_radius, e2.atomic_radius, e3.atomic_radius,
                e1.atomic_mass, e2.atomic_mass, e3.atomic_mass)

            data.append(features + (
                mat_data[0]["formation_energy_per_atom"],  # mag_moment,
                lat_const[0], lat_const[1], lat_const[2],
            ))

    # 54 descriptor columns followed by the formation energy and the lattice
    # constants.
    columns = ['x' + str(i) for i in range(1, 55)]
#     columns.extend(['form_energy_per_atom', 'total_magnetic_moment','a', 'b', 'c'])
    columns.extend(['form_energy_per_atom', 'a', 'b', 'c'])
    df_train = pd.DataFrame(data, columns=columns)

    df_train.to_csv(file_name_train, index=False)

Training data will be generated



## Machine Learning
Gradient-boosted trees are used for the regression task. Cross-validation is the first step.

### Cross-Validation
Five-fold cross-validation will be performed to obtain a more reliable estimate of the model's performance.

In [4]:
# NOTE(review): recent scikit-learn versions reject `random_state` on KFold
# unless `shuffle=True` is also passed -- adjust before re-enabling.
# from sklearn.model_selection import KFold
# kf = KFold(n_splits=5, random_state=44)

# Read the table through the path variable set by the generation cell, so the
# file that is written and the file that is loaded cannot drift apart.
df = pd.read_csv(file_name_train)

print (df.head())



     x1    x2    x3          x4          x5          x6            x7  \
0  1.60  1.45  1.35   44.955912  118.710000  196.966569   2021.034024   
1  1.45  1.25  1.10    6.941000   26.981539   28.085500     48.177481   
2  1.60  1.45  1.35   44.955912  121.760000   58.693400   2021.034024   
3  1.75  1.45  1.35  168.934210  118.710000  196.966569  28538.767308   
4  1.50  1.45  1.35   24.305000  121.760000   63.546000    590.733025   

             x8            x9     x10  ...       x49       x50     x51  \
0  14092.064100  38795.829304  2.5600  ...  0.000278  0.013611  4.6625   
1    728.003425    788.795310  2.1025  ...  0.000278  0.027778  3.6650   
2  14825.497600   3444.915204  2.5600  ...  0.000278  0.013611  4.6625   
3  14092.064100  38795.829304  3.0625  ...  0.004444  0.027778  5.1650   
4  14825.497600   4038.094116  2.2500  ...  0.000278  0.006944  4.3525   

      x52       x53       x54  form_energy_per_atom         a         b  \
0  4.3825  2.159282  2.093442            