# Test Energy Ordering
Determine whether our machine learning models correctly order polymorphs by their relative stability.

In [1]:
%matplotlib inline
from tqdm import tqdm_notebook as tqdm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn import metrics
from glob import glob
import numpy as np
import pandas as pd
import json
import os

## Load in the Data
Read in the data from JSON to create a DataFrame

In [2]:
def load_data(path):
    """Read a dataset JSON file from disk and return its labelled entries as a DataFrame.

    Parameters
    ----------
    path : str
        Path to a JSON file whose top-level ``'entries'`` key holds the records.
        The records must provide ``'properties'``, ``'poscar'`` and ``'name'``
        fields (these columns are dropped) and a ``'class'`` mapping that has a
        ``'measured'`` key.

    Returns
    -------
    pandas.DataFrame
        One row per entry whose measured class is not null. The ``'class'``
        column holds the ``'measured'`` value itself rather than the mapping.
        Rows keep their original index labels, so the index can contain gaps
        where unlabelled entries were removed.
    """

    # Context manager guarantees the file handle is closed, even if parsing fails
    with open(path) as fp:
        raw = json.load(fp)

    data = pd.DataFrame(raw['entries']).drop(['properties', 'poscar', 'name'], axis='columns')

    # Replace each nested {'measured': ...} record with the bare measured value
    data['class'] = data['class'].apply(lambda x: x['measured'])

    # Entries without a measured value cannot be used for training or testing
    return data[data['class'].notnull()]

In [3]:
%%time
# Load every JSON dataset found in ../datasets, keyed by its file name with the
# 5-character ".json" suffix stripped (e.g. "quat-heuslers.json" -> "quat-heuslers")
data = {
    os.path.basename(json_path)[:-5]: load_data(json_path)
    for json_path in glob(os.path.join('..', 'datasets', '*.json'))
}

CPU times: user 29.7 s, sys: 1.2 s, total: 30.9 s
Wall time: 55.2 s


## Run the Test
Our test works by withholding all 3 polymorphs of a given composition as part of the test set, training a model on the remaining data, and then assessing how well the model ranks the withheld polymorphs. As we have ~30k unique compositions, we hold out 10% of the compositions at a time.

In [4]:
# Regression pipeline: impute missing attribute values (library-default settings),
# then fit a random forest on all available cores.
# The forest is stochastic, so its seed is pinned; without it a Restart & Run All
# of this multi-hour experiment would not reproduce the saved predictions.
model = Pipeline([
    ('imputer', Imputer()),
    ('rf', RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0)),
])

In [5]:
# Placeholder column that the cross-validation loop fills in fold by fold.
# It starts as NaN rather than a plausible value such as 1, so that any entry the
# loop never predicts stays visibly missing instead of looking like a real
# prediction. NaN also makes the column float, whatever the dtype of 'class' is.
data['quat-heuslers']['predicted_class'] = np.nan

In [6]:
%%time
training_set_sizes = []

# Every fold is drawn from the quaternary-Heusler table; this is the same object
# that is stored in `data`, so predictions written below land in `data` as well
qh_data = data['quat-heuslers']
n_splits = 10  # each fold holds out roughly 10% of the compositions

# Grouping by composition keeps all polymorphs of one composition in the same fold
folds = GroupKFold(n_splits=n_splits).split(qh_data['attributes'].tolist(),
                                            qh_data['class'],
                                            qh_data['composition'])

# `split` returns a generator with no length, so tell tqdm how many folds to expect
for train_ind, test_ind in tqdm(folds, total=n_splits):
    # GroupKFold yields *positional* row numbers. load_data() removes unlabelled
    # rows without resetting the index, so index labels need not match positions;
    # the rows must therefore be selected with .iloc, not label-based .loc
    qh_train = qh_data.iloc[train_ind]
    training_set_sizes.append(len(qh_train))
    qh_test = qh_data.iloc[test_ind]

    # Append rest of OQMD to QH training set
    train_data = pd.concat([qh_train, data['oqmd-no-heusler'], data['heuslers']])
    train_X = np.array(train_data['attributes'].tolist(), dtype=np.float32)
    test_X = np.array(qh_test['attributes'].tolist(), dtype=np.float32)

    # Train and test the model
    model.fit(train_X, train_data['class'])
    # qh_test.index holds index *labels*, so label-based .loc is correct here
    qh_data.loc[qh_test.index, 'predicted_class'] = model.predict(test_X)


CPU times: user 22h 35min 55s, sys: 41.4 s, total: 22h 36min 36s
Wall time: 5h 50min 39s


In [7]:
# Report the mean number of quaternary-Heusler entries per training fold
# (%d truncates the floating-point mean to a whole number)
print('Training sets, on averaged, included %d QH entries.' % (np.mean(training_set_sizes),))

Training sets, on averaged, included 86569 QH entries.


## Save Results to Disk
Save results to disk for later analysis

In [8]:
# Write the quaternary-Heusler table, including the cross-validated
# 'predicted_class' column, to CSV; the DataFrame index is not written
data['quat-heuslers'].to_csv(path_or_buf='qh-cv-data.csv', index=False)