# Saving best model
Saving model with the best perfomance XGboost in a form of json files as well as nexessary for the preprocessing MinMax scaler

In [1]:
import pandas as pd
import numpy as np
from pymatgen.core import Composition

import warnings
warnings.filterwarnings('ignore')

seed = 0
np.random.seed(seed)

from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition import ElementProperty, ElementFraction
from matminer.featurizers import composition as cf
from matminer.featurizers.base import MultipleFeaturizer

from sklearn.preprocessing import MinMaxScaler

from pickle import dump

In [2]:
df = pd.read_csv('data/raw_data.csv')
df = df.rename(columns={'Tc (K)':'T_c', 'Name': 'formula'})
df = df.drop(['Source', 'Page'], axis=1)
df = df[df['T_c'] > 2]
df = df.groupby('formula', as_index=False).median()

In [4]:
feature_calculators = MultipleFeaturizer([cf.Stoichiometry(),
                                          cf.ElementProperty.from_preset("magpie"),
                                          cf.ValenceOrbital(props=['avg']),
                                          cf.IonProperty(fast=True)])


df = StrToComposition().featurize_dataframe(df, 'formula')
df = feature_calculators.featurize_dataframe(df, col_id='composition')

ep = ElementFraction()

df = ep.featurize_dataframe(df, 'composition')

HBox(children=(HTML(value='StrToComposition'), FloatProgress(value=0.0, max=2522.0), HTML(value='')))




HBox(children=(HTML(value='MultipleFeaturizer'), FloatProgress(value=0.0, max=2522.0), HTML(value='')))




HBox(children=(HTML(value='ElementFraction'), FloatProgress(value=0.0, max=2522.0), HTML(value='')))




In [5]:
feature_labels = feature_calculators.feature_labels()
original_count = len(df)
df = df[~ df[feature_labels].isnull().any(axis=1)]
df.columns = [column_name.replace(" ", "_") for column_name in df.columns]
for column in df.columns[9:-110]:
    df.rename(columns={column:column[11:]}, inplace=True)

In [43]:
df

Unnamed: 0,formula,T_c,composition,0-norm,2-norm,3-norm,5-norm,7-norm,10-norm,minimum_Number,...,Hg,Tl,Pb,Bi,Th,U,Np,Pu,Am,Cm
0,(Ce0.7La0.3)3Al11,3.70,"(Ce, La, Al)",3,0.802483,0.787675,0.785755,0.785715,0.785714,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,(Ce0.9La0.1)3Al11,5.40,"(Ce, La, Al)",3,0.809321,0.789574,0.785854,0.785720,0.785714,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,(Co0.38Fe0.62)2P,459.00,"(Co, Fe, P)",3,0.588331,0.498544,0.443835,0.426933,0.418187,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,(Co0.4Fe0.6)2P,453.00,"(Co, Fe, P)",3,0.584998,0.493242,0.435713,0.416972,0.406636,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,(Co0.4Mn0.6)2P,445.00,"(Co, Mn, P)",3,0.584998,0.493242,0.435713,0.416972,0.406636,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2517,ZrMn2H3.6,146.50,"(Zr, Mn, H)",3,0.642110,0.578487,0.551282,0.546728,0.545607,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2518,ZrMn2H3.8,147.00,"(Zr, Mn, H)",3,0.648394,0.587843,0.563403,0.559719,0.558915,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2519,ZrMn3.8H3.6,133.00,"(Zr, Mn, H)",3,0.634424,0.557188,0.506784,0.487389,0.473625,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2520,ZrZn1.9,26.35,"(Zr, Zn)",2,0.740376,0.685579,0.660381,0.656215,0.655279,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# check and remove constant columns
def search_constant_columns(data):
    constant_columns = []
    for col in data.columns:
        if col not in ['T_c', 'formula', 'composition']:
            if data[col].std() == 0:
                constant_columns += [col]
    return constant_columns

constant_columns = search_constant_columns(df)

# remove constant columns in the dataset
df.drop(constant_columns, axis=1, inplace=True)


print('Constant columns:', constant_columns)
print(f'Removed {len(constant_columns)}/{len(df.columns)} constant columns')

Constant columns: ['minimum_GSbandgap', 'He', 'Ne', 'Ar', 'Kr', 'Tc', 'Xe', 'Pm', 'Po', 'At', 'Rn', 'Fr', 'Ra', 'Ac', 'Pa', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr']
Removed 22/229 constant columns


In [7]:
# check and remove duplicates columns
def search_duplicates(data):
    groups = data.columns.to_series().groupby(data.dtypes).groups
    duplicate_columns = []
    for type_of_col, name_of_col in groups.items():
        cs = data[name_of_col].columns
        vs = data[name_of_col]
        for i in range(len(cs)):
            ia = vs.iloc[:,i].values
            for j in range(i + 1, len(cs)):
                ja = vs.iloc[:,j].values
                if np.array_equal(ia, ja):
                    duplicate_columns += [cs[i]]
                    break
    return duplicate_columns

duplicate_columns = search_duplicates(df)
print('Before:', df.shape)
df.drop(duplicate_columns, axis=1, inplace=True)
print('After:', df.shape)
print(duplicate_columns)
print(f'Removed {len(duplicate_columns)}/{len(df.columns)} duplicating columns')

Before: (2520, 229)
After: (2520, 224)
['mean_NsValence', 'mean_NpValence', 'mean_NdValence', 'mean_NfValence', 'maximum_GSbandgap']
Removed 5/224 duplicating columns


In [8]:
X = df.drop(['formula', 'T_c', 'composition'], axis=1)
y = df.T_c

In [39]:
for i in df.columns:
    print(f'"{i}"', end=',')

"formula","T_c","composition","0-norm","2-norm","3-norm","5-norm","7-norm","10-norm","minimum_Number","maximum_Number","range_Number","mean_Number","avg_dev_Number","mode_Number","minimum_MendeleevNumber","maximum_MendeleevNumber","range_MendeleevNumber","mean_MendeleevNumber","avg_dev_MendeleevNumber","mode_MendeleevNumber","minimum_AtomicWeight","maximum_AtomicWeight","range_AtomicWeight","mean_AtomicWeight","avg_dev_AtomicWeight","mode_AtomicWeight","minimum_MeltingT","maximum_MeltingT","range_MeltingT","mean_MeltingT","avg_dev_MeltingT","mode_MeltingT","minimum_Column","maximum_Column","range_Column","mean_Column","avg_dev_Column","mode_Column","minimum_Row","maximum_Row","range_Row","mean_Row","avg_dev_Row","mode_Row","minimum_CovalentRadius","maximum_CovalentRadius","range_CovalentRadius","mean_CovalentRadius","avg_dev_CovalentRadius","mode_CovalentRadius","minimum_Electronegativity","maximum_Electronegativity","range_Electronegativity","mean_Electronegativity","avg_dev_Electrone

In [10]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [20]:
scaler

MinMaxScaler()

# XGBoost

In [11]:

from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, ShuffleSplit, KFold
from xgboost import XGBRegressor

xgb_model = XGBRegressor(random_state=seed)

num_folds = 10

cv = ShuffleSplit(n_splits=num_folds, random_state=seed)

parameters_xgb = {'max_depth' : [4, 6, 8, 10],
              'n_estimators' : [50, 100, 200, 500],
             'learning_rate' : [0.01, 0.05, 0.1, 0.2]}

xgb_grid_search = GridSearchCV(
                     estimator=xgb_model,
                     param_grid=parameters_xgb,
                     scoring='r2',
                     cv=cv,
                     n_jobs=-1,
                     verbose=10)

In [12]:
%%time
xgb_grid_search.fit(X, y)

print(f'Best score: {xgb_grid_search.best_score_}')
print(f'Best model: {xgb_grid_search.best_params_}')

Fitting 10 folds for each of 64 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   24.2s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   52.3s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  8

Best score: 0.8610162193525179
Best model: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 500}
CPU times: user 27.1 s, sys: 329 ms, total: 27.4 s
Wall time: 35min 17s


In [14]:
xgb_best = xgb_grid_search.best_estimator_

xgb_best.save_model('xgb_best_model.json')

scaler = dump(scaler, open('scaler.pkl', 'wb'))