## Introduction
Where did the data come from? (TODO: document the source of `assets/HalfHeusler.csv`.)

Each material ID needs to be prefixed with "mp-". The cell below does this and writes the result to a new CSV file.

In [60]:
# NOTE: everything in this cell is commented out, so nothing here executes.
# It appears to be an earlier way of producing 'assets/Heusler compound.csv'
# through a project-local helper (utils.editdf.EditFile) — TODO confirm it is
# superseded by the next cell and delete it if so.
# import os
# import sys
# module_path = os.path.abspath(os.path.join(''))
# if module_path not in sys.path:
#     sys.path.append(module_path)
    
# print (module_path)

# from utils.editdf import EditFile 

# ed = EditFile()
# ed.generateFile('assets/HalfHeusler.csv', 'assets/Heusler compound.csv')


In [61]:
import os.path
import pandas as pd

# Build 'assets/Heusler compound.csv' from the raw table by rewriting every
# numeric Materials-ID as the string "mp-<number>".
path_to_file = 'assets/HalfHeusler.csv'
file_name = 'assets/Heusler compound.csv'
id_column = 'Materials-ID'

# The output file doubles as a cache marker: the conversion is skipped when
# the formatted file already exists.
if not os.path.exists(file_name):
    print ("A formatted csv file is produced\n")
    df = pd.read_csv(path_to_file)
    print (df.head())

    # Address the id column by name for both the removal and the re-insertion.
    # (Previously the ids were read by name but the column that was dropped and
    # re-created was chosen by position, which deletes the wrong column if the
    # id is ever not the first one.) pop() removes the column and assigning it
    # back appends it as the last column, so the output layout is unchanged.
    df[id_column] = "mp-" + df.pop(id_column).astype(str)
    print (df.head())

    df.to_csv(file_name, index = False)

A formatted csv file is produced

   Materials-ID Chemical-Fomula 4c-site 4a-site 4b-site
0          2894          ScSnAu      Au      Sc      Sn
1          3161          LiAlSi      Si      Li      Al
2          3432          ScNiSb      Ni      Sc      Sb
3          3462          TmSnAu      Au      Tm      Sn
4          3522          MgCuSb      Cu      Mg      Sb
  Chemical-Fomula 4c-site 4a-site 4b-site Materials-ID
0          ScSnAu      Au      Sc      Sn      mp-2894
1          LiAlSi      Si      Li      Al      mp-3161
2          ScNiSb      Ni      Sc      Sb      mp-3432
3          TmSnAu      Au      Tm      Sn      mp-3462
4          MgCuSb      Cu      Mg      Sb      mp-3522


- Data used: lattice parameters, calculated atomic radii and atomic numbers (the code below reads `Element.Z`, not atomic masses).
- Retrieved with the Python library `pymatgen`.

In [62]:
'''
The Element class is located in the core subpackage inside the periodic_table module.
The link to the API documentation is below.

    https://pymatgen.org/pymatgen.core.periodic_table.html#pymatgen.core.periodic_table.Element

Similarly the material project APIs are hosted in the following module.

    https://pymatgen.org/pymatgen.ext.matproj.html?highlight=mprester#module-pymatgen.ext.matproj
'''

import pymatgen.core as pg
from pymatgen.ext.matproj import MPRester
from math import sqrt
import sys

file_name_train = 'assets/Training data.csv'

# Radius substituted when pymatgen has no calculated atomic radius for the
# 4a-site element. The value is carried over unchanged; its origin is not
# documented in this notebook.
FALLBACK_4A_RADIUS = 2.16


def _heusler_features(x1, x2, x3, m1, m2, m3):
    """Return the 54 descriptors (columns x1..x54) for one compound.

    x1, x2, x3 are the calculated atomic radii and m1, m2, m3 the atomic
    numbers (Element.Z) of the 4a-, 4b- and 4c-site elements, in that order.
    NOTE(review): the notebook text speaks of atomic *masses*, but the values
    passed in are atomic numbers — confirm which one is intended.
    """
    x29 = m1+m2+m3                  # sum of atomic numbers
    x30 = x1+x2+x3                  # sum of radii
    # Deviation of each site from the mean atomic number / mean radius.
    x33 = (x29/3 -m1)
    x34 = (x29/3 -m2)
    x35 = (x29/3 -m3)
    x42 = (x30/3 -x1)
    x43 = (x30/3 -x2)
    x44 = (x30/3 -x3)
    x51 = (x1**2 + x2**2)
    x52 = (x1**2 + x3**2)

    # NOTE(review): the second ratio below is x3/m1 (a radius over an atomic
    # number); by symmetry with its neighbours it looks like m3/m1 was meant.
    # Left as-is so regenerated data matches any existing training file.
    return (x1,x2,x3,m1,m2,m3,m1**2,m2**2,m3**2,x1**2,x2**2,x3**2,
            m1**3,m2**3,m3**3,x1**3,x2**3,x3**3,
            sqrt(m1),sqrt(m2),sqrt(m3),sqrt(x1),sqrt(x2),sqrt(x3),
            m2/m1, x3/m1, x2/x1,x3/x1, x29,x30,
            ((m1**2+m2**2+m3**2)/3.)**2, ((x1**2+x2**2+x3**2)/3.)**2,
            x33, x34, x35, abs(x33), abs(x34), abs(x35), x33**2, x34**2, x35**2,
            x42, x43, x44, abs(x42), abs(x43), abs(x44), x42**2, x43**2, x44**2,
            x51, x52,sqrt(x51), sqrt(x52))


# The training file doubles as a cache marker: the (network-bound) generation
# only runs when the file does not exist yet.
if (not os.path.exists(file_name_train)):
    print ("Training data will be generated\n")

    heusler_df = pd.read_csv(file_name, header=0, usecols= ['Materials-ID', '4a-site', '4b-site', '4c-site'])
    data = []

    # The API key is read from the environment instead of being hard-coded in
    # the notebook; when PMG_MAPI_KEY is unset, None is passed and pymatgen
    # falls back to its own PMG_MAPI_KEY setting. The key that used to be
    # embedded here should be treated as leaked and regenerated.
    # The context manager closes the underlying HTTP session when done.
    with MPRester(os.environ.get('PMG_MAPI_KEY')) as m:
        for idx in heusler_df.index:
            material_id = heusler_df.loc[idx, 'Materials-ID']
            lat = m.get_structures(material_id)[0].lattice.abc
            mat_data = m.get_data(material_id)

            e1 = pg.Element(heusler_df.loc[idx, '4a-site'])
            e2 = pg.Element(heusler_df.loc[idx, '4b-site'])
            e3 = pg.Element(heusler_df.loc[idx, '4c-site'])

            # Only the 4a-site radius has a fallback; a missing 4b/4c radius
            # (None) makes the feature arithmetic raise TypeError.
            x1 = e1.atomic_radius_calculated
            if x1 is None:
                x1 = FALLBACK_4A_RADIUS
            x2 = e2.atomic_radius_calculated
            x3 = e3.atomic_radius_calculated

            features = _heusler_features(x1, x2, x3, e1.Z, e2.Z, e3.Z)
            # Targets: formation energy per atom and the three lattice lengths.
            data.append(features + (mat_data[0]["formation_energy_per_atom"],
                                    lat[0], lat[1], lat[2]))

    # Column names x1..x54 for the descriptors, followed by the four targets.
    columns = ['x'+str(i) for i in range(1, 55)]
    columns.extend(['form_energy_per_atom','a', 'b', 'c'])
    df_train = pd.DataFrame(data, columns = columns)

    df_train.to_csv(file_name_train, index = False)

Training data will be generated



## Machine Learning
Gradient-boosted trees are selected for the regression task. Cross-validation is the first step.

### Cross-Validation
Five-fold cross-validation will be performed to obtain a more reliable estimate of the model's performance.

In [63]:
# Cross-validation splitter (not enabled yet). scikit-learn's KFold only
# accepts random_state together with shuffle=True, so the flag is included
# for when this is switched on.
# from sklearn.model_selection import KFold
# kf = KFold(n_splits=5, shuffle=True, random_state=44)

# Load the feature table written by the data-generation cell. The path
# variable defined there is reused so the two cells cannot drift apart.
df = pd.read_csv(file_name_train)

print (df.head())



     x1    x2    x3  x4  x5  x6    x7    x8    x9     x10  ...       x49  \
0  1.84  1.45  1.74  21  50  79   441  2500  6241  3.3856  ...  0.051378   
1  1.67  1.18  1.11   3  13  14     9   169   196  2.7889  ...  0.019600   
2  1.84  1.33  1.49  21  51  28   441  2601   784  3.3856  ...  0.049878   
3  2.22  1.45  1.74  69  50  79  4761  2500  6241  4.9284  ...  0.124844   
4  1.45  1.33  1.45  12  51  29   144  2601   841  2.1025  ...  0.006400   

        x50     x51     x52       x53       x54  form_energy_per_atom  \
0  0.004011  5.4881  6.4132  2.342669  2.532430             -0.760831   
1  0.044100  4.1813  4.0210  2.044823  2.005243             -0.191266   
2  0.004011  5.1545  5.6057  2.270352  2.367636             -0.963594   
3  0.004011  7.0309  7.9560  2.651584  2.820638             -0.835058   
4  0.001600  3.8714  4.2050  1.967587  2.050610             -0.335699   

          a         b         c  
0  4.611608  4.611608  4.611608  
1  4.199121  4.199121  4.199121  
2 