In [1]:
import argparse
import shutil
import sys
import time
import random
from random import sample
import csv
import functools
import json
import os
import warnings
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
from sklearn import metrics
from torch.autograd import Variable
from torch.optim.lr_scheduler import MultiStepLR
from pymatgen.core.structure import Structure

from data import CIFData, collate_pool, get_train_val_test_loader
from model import CrystalGraphConvNet
from tools import AtomInitializer, AtomCustomJSONInitializer, Normalizer, GaussianDistance
from training import train, validate, mae, cust_loss

In [2]:
# Graph-construction settings shared by every compound.
max_num_nbr = 12  # neighbours kept per atom
radius = 8        # neighbour search cutoff; also the upper bound of the distance expansion
dmin = 0          # lower bound of the distance expansion
step = 0.2        # spacing passed to the distance expansion

# Compound identifiers: first column of the header-less spreadsheet.
id_table = pd.read_excel('Unique IDs.xlsx', header=None).values
IDs = [row[0] for row in id_table]

# Per-element initialization vectors read from atom_init.json.
atomic_features = AtomCustomJSONInitializer('./cifs_dataset31/atom_init.json')

# Filled in later: compound id -> [atom_fea, nbr_fea, nbr_fea_idx].
crystal_features = {}

In [3]:
# Build the graph inputs for each compound: per-atom feature vectors, expanded
# neighbour distances and neighbour indices, stored under the compound id.

# The expansion parameters are the same for every compound, so build it once.
GDF = GaussianDistance(dmin=dmin, dmax=radius, step=step)

for cif_id in IDs:
    crystal = Structure.from_file('./cifs_dataset31/' + cif_id + '.cif')

    # One feature row per site, looked up by atomic number.
    atom_fea = np.vstack([atomic_features.get_atom_fea(site.specie.number)
                          for site in crystal])

    # Neighbours of every site within `radius`, nearest first
    # (entry[1] is used as the distance, entry[2] as the neighbour's site index).
    all_nbrs = crystal.get_all_neighbors(radius, include_index=True)
    all_nbrs = [sorted(nbrs, key=lambda entry: entry[1]) for nbrs in all_nbrs]

    nbr_fea_idx, nbr_fea = [], []
    for nbrs in all_nbrs:
        if len(nbrs) < max_num_nbr:
            warnings.warn('{} not find enough neighbors to build graph. '
                            'If it happens frequently, consider increase '
                            'radius.'.format(cif_id))

        # Keep the nearest `max_num_nbr` neighbours; pad short lists with
        # index 0 and a distance just beyond the cutoff (radius + 1).
        kept = nbrs[:max_num_nbr]
        n_pad = max_num_nbr - len(kept)
        nbr_fea_idx.append([entry[2] for entry in kept] + [0] * n_pad)
        nbr_fea.append([entry[1] for entry in kept] + [radius + 1.] * n_pad)

    nbr_fea_idx, nbr_fea = np.array(nbr_fea_idx), np.array(nbr_fea)
    nbr_fea = GDF.expand(nbr_fea)

    atom_fea = torch.Tensor(atom_fea)
    nbr_fea = torch.Tensor(nbr_fea)
    nbr_fea_idx = torch.LongTensor(nbr_fea_idx)
    crystal_features[cif_id] = [atom_fea, nbr_fea, nbr_fea_idx]



In [5]:
# Use the GPU only when it is both requested and actually present.
use_cuda = True
cuda_available = torch.cuda.is_available()
is_cuda = use_cuda and cuda_available
cuda_available

False

In [6]:
# Wrap the precomputed crystal graphs in a dataset and split it into
# train / validation / test loaders.
dataset = CIFData(crystal_features, './cifs_dataset31')
collate_fn = collate_pool

# 80 / 10 / 10 split by ratio; the explicit sizes are passed as None
# (presumably so the ratios decide the split -- confirm in data.py).
split_options = dict(
    train_ratio=0.8,
    val_ratio=0.1,
    test_ratio=0.1,
    train_size=None,
    val_size=None,
    test_size=None,
    return_test=True,
)

# NOTE(review): if num_workers is forwarded to torch's DataLoader, 0 means the
# data is loaded in the main process (no worker subprocesses), not "all workers".
loading_options = dict(
    batch_size=128,
    num_workers=0,
    pin_memory=is_cuda,
)

train_loader, val_loader, test_loader = get_train_val_test_loader(
    dataset=dataset,
    collate_fn=collate_fn,
    **split_options,
    **loading_options,
)

In [7]:
# Sample up to 500 data points at random from the dataset.
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the dataset size.
n_samples = min(500, len(dataset))
sample_data_list = [dataset[i] for i in sample(range(len(dataset)), n_samples)]

In [8]:
# Collate the random sample into one batch and fit the normalizers on it.
# The third return value is not needed here.
sample_input, sample_target, _sample_rest = collate_pool(sample_data_list)
normalizer_target = Normalizer(sample_target)
# Element 4 of the collated input feeds the crystal-feature normalizer
# (presumably the additional crystal features -- confirm in collate_pool).
normalizer_crystal = Normalizer(sample_input[4])

# Keep only the first component of the first data point; it is used to read
# the feature dimensions when the model is built.
structures, _first_second, _first_third = dataset[0]

In [None]:
# Feature dimensions, read from the last axis of the first data point:
#   structures[0] -> atomic feature vector
#   structures[1] -> neighbor feature vector
#   structures[3] -> additional crystal features
orig_atom_fea_len, nbr_fea_len, crystal_fea_len = (
    structures[k].shape[-1] for k in (0, 1, 3)
)

# Regression network.
model = CrystalGraphConvNet(
    orig_atom_fea_len,
    nbr_fea_len,
    atom_fea_len=50,        # first linear transformation before the convolutions
    n_conv=3,               # number of convolution layers
    h_fea_len=100,          # units in the first fully connected hidden layer
    n_h=2,                  # number of fully connected layers
    crystal_fea_len=crystal_fea_len,
    classification=False,   # regression
)

if is_cuda:
    model.cuda()

optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.1)
# The learning rate drops by 10x at epoch 100, which the 10-epoch run
# configured below never reaches.
scheduler = MultiStepLR(optimizer, milestones=[100], gamma=0.1)

# range(start_epoch, end_epoch) covers epochs 1..10.
start_epoch = 1
end_epoch = 11

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Write `state` to `filename`; if `is_best`, also copy it to 'model_best.pth.tar'.

    The notebook calls this helper and later loads 'model_best.pth.tar', but
    never defines or imports it, so it is defined here.

    Args:
        state: dictionary passed to torch.save.
        is_best: whether this checkpoint has the lowest validation MAE so far.
        filename: path of the most recent checkpoint.
    """
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')


# Must be initialised once, before the loop: resetting it every epoch would
# make every epoch look like the best one and overwrite the best checkpoint.
best_mae_error = 1e10

for epoch in range(start_epoch, end_epoch):

    # train for one epoch
    train(train_loader, model, optimizer, epoch, normalizer_target, normalizer_crystal)

    # evaluate on validation set
    mae_error = validate(val_loader, model, normalizer_target, normalizer_crystal)

    # NaN is the only value that is not equal to itself.
    if mae_error != mae_error:
        print('Exit due to NaN')
        sys.exit(1)

    scheduler.step()

    # remember the best mae_error and save checkpoint
    is_best = mae_error < best_mae_error
    best_mae_error = min(mae_error, best_mae_error)

    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'best_mae_error': best_mae_error,
        'optimizer': optimizer.state_dict(),
        'normalizer': normalizer_target.state_dict(),
    }, is_best)

# Reload the weights saved as 'model_best.pth.tar' and evaluate them on the
# held-out test set; validate(..., test=True) returns two values here,
# bound to `test` and `target`.
print('---------Evaluate Model on Test Set---------------')
best_checkpoint = torch.load('model_best.pth.tar')
best_weights = best_checkpoint['state_dict']
model.load_state_dict(best_weights)
test, target = validate(
    test_loader,
    model,
    normalizer_target,
    normalizer_crystal,
    test=True,
)

Epoch: [1][0/3309]	Time 1.377 (1.377)	Data 0.896 (0.896)	Loss 8.4212 (8.4212)	MAE 275.358 (275.358)


In [None]:
# Plot actual vs predicted (both still in log10 space) and compute the MAPE
# in the original units.
plt.scatter(target, test)
plt.xlabel('Actual (log10)')
plt.ylabel('Predicted (log10)')

# Undo the log10 transform under new names. `target` itself is left untouched
# so that re-running this cell, or any later cell that uses `target`, does not
# exponentiate the values a second time.
pred = 10**test
target_linear = 10**target

# Accumulate the absolute percentage error sample by sample.
total = 0
for i in range(len(pred)):
    total += (abs(target_linear[i] - pred[i]) / target_linear[i]) * 100
mape = total / len(pred)

# Each target entry is indexed as a length-1 sequence, hence the [0].
print("The MAPE is " + str(round(mape[0], 2)) + " %.")

In [None]:
# Save the predicted results
import xlsxwriter

# Undo the log10 transform.
# NOTE(review): this assumes `test` and `target` still hold the log-space
# values returned by validate(); confirm no earlier cell has overwritten them.
transform_y_pred = 10**test
transform_y_test = 10**target

actual = transform_y_test.tolist()
prediction = transform_y_pred.tolist()

# The context manager closes (and therefore writes) the workbook even if one
# of the cell writes raises.
with xlsxwriter.Workbook('Full dataset (CGCNN test results).xlsx') as workbook:
    worksheet = workbook.add_worksheet()

    # One row per sample: column 0 = actual, column 1 = prediction.
    # Each `actual` entry is a one-element list, each `prediction` entry a scalar.
    for row, (actual_entry, predicted_value) in enumerate(zip(actual, prediction)):
        worksheet.write(row, 0, actual_entry[0])
        worksheet.write(row, 1, predicted_value)