In [None]:
# Project root: the folder that holds the Main notebook and the python files.
# Edit this path to match the local checkout before running the notebook.
root_directory = 'C:/Users/U S VAITESSWAR/Desktop/For training/CGCNN'
# Sub-folder of the project root; used below as the location of atom_init.json
# and passed to CIFData.
root_directory_2 = ''.join((root_directory, '/cifs_dataset31'))

In [None]:
import csv
import functools
import json
import os
import random
import warnings

import numpy as np
import pandas as pd
import torch
from pymatgen.core.structure import Structure

class AtomInitializer:
    """
    Common base for objects that map an atom type to its feature vector.

    The mapping lives in ``self._embedding`` (atom type -> vector); this base
    class starts with an empty table and leaves filling it to subclasses or
    to ``load_state_dict``.
    !!! Use one AtomInitializer per dataset !!!
    """

    def __init__(self, atom_types):
        # Duplicates in `atom_types` collapse because a set is stored.
        self._embedding = {}
        self.atom_types = set(atom_types)

    def get_atom_fea(self, atom_type):
        """Return the stored vector for ``atom_type``.

        The membership check is an ``assert``, so an unknown type raises
        AssertionError.
        """
        assert atom_type in self.atom_types
        return self._embedding[atom_type]

    def load_state_dict(self, state_dict):
        """Adopt ``state_dict`` as the embedding table.

        The known atom types and the reverse lookup used by ``decode`` are
        rebuilt from the new table. The reverse lookup uses the values as
        dictionary keys, so they must be hashable.
        """
        self._embedding = state_dict
        self.atom_types = set(state_dict.keys())
        self._decodedict = {value: key for key, value in state_dict.items()}

    def state_dict(self):
        """Return the embedding table itself (not a copy)."""
        return self._embedding

    def decode(self, idx):
        """Map a stored value back to its atom type.

        The reverse table is built on first use and then cached on the
        instance.
        """
        reverse = getattr(self, '_decodedict', None)
        if reverse is None:
            reverse = {value: key for key, value in self._embedding.items()}
            self._decodedict = reverse
        return reverse[idx]

class AtomCustomJSONInitializer(AtomInitializer):
    """
    Atom initializer whose feature vectors come from a JSON file.

    The file holds a dictionary that maps an element number (stored as a
    string key) to a list of numbers, the feature vector of that element.

    Parameters
    ----------
    elem_embedding_file: str
        The path to the .json file
    """

    def __init__(self, elem_embedding_file):
        with open(elem_embedding_file) as handle:
            raw_table = json.load(handle)

        # JSON object keys are always strings; turn them back into integer
        # element numbers.
        by_number = {int(number): vector
                     for number, vector in raw_table.items()}

        # Register the element numbers with the base class, which also
        # creates the (empty) embedding table.
        super().__init__(set(by_number))

        # Store every feature list as a float numpy array.
        self._embedding.update(
            (number, np.array(vector, dtype=float))
            for number, vector in by_number.items())

In [None]:
class GaussianDistance:
    """
    Gaussian basis expansion of interatomic distances.
    Unit: angstrom
    """

    def __init__(self, dmin, dmax, step, var=None):
        """
        Parameters
        ----------
        dmin: float
          Minimum interatomic distance (first Gaussian centre)
        dmax: float
          Maximum interatomic distance
        step: float
          Spacing between consecutive Gaussian centres
        var: float, optional
          Width used in the exponent; falls back to ``step`` when omitted
        """
        assert dmin < dmax
        assert dmax - dmin > step
        # Centres run from dmin in increments of `step`; the stop value passed
        # to arange is dmax + step.
        self.filter = np.arange(dmin, dmax + step, step)
        self.var = step if var is None else var

    def expand(self, distances):
        """
        Apply the Gaussian distance filter to a numpy distance array

        Parameters
        ----------
        distances: np.array, n-d
          A distance array of any shape

        Returns
        -------
        expanded_distance: np.array, (n+1)-d
          Same leading shape as the input, with a new last dimension of
          length len(self.filter)
        """
        # A trailing axis on the distances broadcasts against the centres.
        offsets = distances[..., np.newaxis] - self.filter
        return np.exp(-(offsets ** 2) / (self.var ** 2))

In [None]:
# Crystal identifiers: the first column of the header-less spreadsheet
# (the relative path is resolved against the current working directory).
IDs = [row[0] for row in pd.read_excel('Unique IDs.xlsx', header=None).values]

# Per-element feature vectors loaded from atom_init.json.
atomic_features = AtomCustomJSONInitializer(root_directory_2 + '/atom_init.json')

# Filled by the feature-extraction loop:
# crystal ID -> [atom_fea, nbr_fea, nbr_fea_idx].
crystal_features = {}

# Graph-construction settings.
max_num_nbr, radius = 12, 8   # neighbours kept per atom, neighbour search cutoff
dmin, step = 0, 0.2           # start and spacing of the Gaussian distance filter

In [None]:
# Build the graph inputs for every crystal ID and cache them in
# crystal_features as [atom_fea, nbr_fea, nbr_fea_idx].

# The Gaussian filter depends only on dmin / radius / step, so it is built
# once instead of once per crystal.
GDF = GaussianDistance(dmin = dmin, dmax = radius, step = step)

for cif_id in IDs:
    # NOTE(review): the original read from an undefined name `directory`.
    # root_directory_2 (the folder that also holds atom_init.json) is assumed
    # to be where the .cif files live -- confirm.
    crystal = Structure.from_file('{}/{}.cif'.format(root_directory_2, cif_id))

    # One row of element features per site, looked up by atomic number.
    atom_fea = np.vstack([atomic_features.get_atom_fea(site.specie.number)
                          for site in crystal])

    # Neighbours of every site within `radius`, sorted by element 1 of each
    # neighbour record (used below as the distance).
    all_nbrs = crystal.get_all_neighbors(radius, include_index=True)
    all_nbrs = [sorted(nbrs, key=lambda x: x[1]) for nbrs in all_nbrs]

    nbr_fea_idx, nbr_fea = [], []
    for nbr in all_nbrs:
        if len(nbr) < max_num_nbr:
            warnings.warn('{} not find enough neighbors to build graph. '
                            'If it happens frequently, consider increase '
                            'radius.'.format(cif_id))

        # Keep the first max_num_nbr neighbours; pad short lists with index 0
        # and a distance just beyond the cutoff. (The original padded with
        # `self.radius`, which is undefined at module level.)
        kept = nbr[:max_num_nbr]
        n_missing = max_num_nbr - len(kept)
        nbr_fea_idx.append([x[2] for x in kept] + [0] * n_missing)
        nbr_fea.append([x[1] for x in kept] + [radius + 1.] * n_missing)

    nbr_fea_idx, nbr_fea = np.array(nbr_fea_idx), np.array(nbr_fea)
    # Expand each scalar distance into its Gaussian-basis vector.
    nbr_fea = GDF.expand(nbr_fea)
    atom_fea = torch.Tensor(atom_fea)
    nbr_fea = torch.Tensor(nbr_fea)
    nbr_fea_idx = torch.LongTensor(nbr_fea_idx)
    crystal_features[cif_id] = [atom_fea,nbr_fea,nbr_fea_idx]

In [None]:
import os

# root_directory is the location which contains the python files (Data & Model).
# Changing the working directory here also means every relative path used after
# this point (checkpoints, test_results.csv, the .xlsx export) resolves against
# root_directory. Presumably this is also what lets the `from Data import ...` /
# `from Model import ...` lines below find the local modules -- confirm.
os.chdir(root_directory)

import argparse
import shutil
import sys
import time
import warnings
from random import sample

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn import metrics
from torch.autograd import Variable
from torch.optim.lr_scheduler import MultiStepLR

from Data import CIFData, collate_pool, get_train_val_test_loader
from Model import CrystalGraphConvNet

In [None]:
# Master switch: set to False to keep everything on the CPU.
use_cuda = True
# True only when CUDA was requested and PyTorch reports it as available.
is_cuda = torch.cuda.is_available() if use_cuda else False
# Trailing expression so the notebook displays whether CUDA is available.
torch.cuda.is_available()

In [None]:
class Normalizer(object):
    """Normalize a Tensor and restore it later.

    The mean and standard deviation are taken along dim 0 of the sample
    tensor given at construction. Entries whose standard deviation is exactly
    zero (a feature that is constant across the sample) are scaled by 1
    instead, so ``norm`` does not divide by zero and produce NaN/inf.
    """

    def __init__(self, tensor):
        """tensor is taken as a sample to calculate the mean and std"""
        self.mean = torch.mean(tensor,dim = 0)
        self.std = torch.std(tensor,dim = 0)

    def _safe_std(self):
        """Return ``self.std`` with exact zeros replaced by ones."""
        std = torch.as_tensor(self.std)
        return torch.where(std == 0, torch.ones_like(std), std)

    def norm(self, tensor):
        """Return ``(tensor - mean) / std`` using the zero-safe std."""
        return (tensor - self.mean) / self._safe_std()

    def denorm(self, normed_tensor):
        """Invert ``norm``: ``normed_tensor * std + mean`` (zero-safe std)."""
        return normed_tensor * self._safe_std() + self.mean

    def state_dict(self):
        """Return the raw statistics as ``{'mean': ..., 'std': ...}``."""
        return {'mean': self.mean,
                'std': self.std}

    def load_state_dict(self, state_dict):
        """Restore statistics from a dict produced by ``state_dict``."""
        self.mean = state_dict['mean']
        self.std = state_dict['std']

In [None]:
def cust_loss(output, target_var):
    """Custom training loss: 10 raised to the mean squared error.

    Parameters
    ----------
    output: torch.Tensor
      Model predictions
    target_var: torch.Tensor
      Targets, broadcastable against ``output``

    Returns
    -------
    torch.Tensor
      Scalar ``10 ** mean((target_var - output) ** 2)``; equals 1 for a
      perfect prediction.
    """
    squared_error = (target_var - output).pow(2)
    return 10 ** squared_error.mean()

In [None]:
def train(train_loader, model, optimizer, epoch, normalizer_target, normalizer_crystal):
    """Run one training epoch over ``train_loader``.

    Parameters
    ----------
    train_loader: iterable
      Yields ``(input, target, _)`` batches; the third item is ignored.
      ``input`` is indexed 0..4 -- presumably atom features, neighbour
      features, neighbour indices, per-crystal atom index tensors and crystal
      features, as produced by collate_pool (TODO confirm).
    model: torch.nn.Module
      Called as ``model(*input)``
    optimizer: torch optimizer
      Stepped once per batch
    epoch: int
      Only used in the progress print-out
    normalizer_target, normalizer_crystal: Normalizer
      Applied to the target and to ``input[4]`` respectively

    Progress is printed every 100 batches; nothing is returned.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    mae_errors = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (batch_input, target, _) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # normalize crystal features and target (both on the CPU)
        crystal_fea = normalizer_crystal.norm(batch_input[4])
        target_normed = normalizer_target.norm(target)

        if is_cuda:
            input_var = (batch_input[0].cuda(non_blocking=True),
                         batch_input[1].cuda(non_blocking=True),
                         batch_input[2].cuda(non_blocking=True),
                         [crys_idx.cuda(non_blocking=True) for crys_idx in batch_input[3]],
                         crystal_fea.cuda(non_blocking=True))
            # The target must live on the same device as the model output,
            # otherwise the loss below fails with a device mismatch.
            target_var = target_normed.cuda(non_blocking=True)
        else:
            input_var = (batch_input[0], batch_input[1], batch_input[2],
                         batch_input[3], crystal_fea)
            target_var = target_normed

        # compute output
        output = model(*input_var)

        # Computing loss
        loss = cust_loss(output, target_var)

        # measure accuracy and record loss (metrics are computed on the CPU
        # against the un-normalized target)
        mae_error = mae(normalizer_target.denorm(output.data.cpu()), target)
        losses.update(loss.item(), target.size(0))
        mae_errors.update(mae_error, target.size(0))

        # compute gradient and do the optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 100 == 0: # print frequency = 100

            print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'MAE {mae_errors.val:.3f} ({mae_errors.avg:.3f})'.format(
                    epoch, i, len(train_loader), batch_time=batch_time,
                    data_time=data_time, loss=losses, mae_errors=mae_errors)
                )
                
def validate(val_loader, model, normalizer_target, normalizer_crystal, test = False):
    """Evaluate ``model`` on ``val_loader`` without updating it.

    Parameters
    ----------
    val_loader: iterable
      Yields ``(input, target, batch_cif_ids)`` batches; ``input`` is indexed
      0..4 and passed to the model as ``model(*input)`` (layout presumably
      defined by collate_pool -- TODO confirm).
    model: torch.nn.Module
    normalizer_target, normalizer_crystal: Normalizer
      Applied to the target and to ``input[4]`` respectively
    test: bool
      When True, per-sample predictions are collected, written to
      'test_results.csv' as (cif_id, target, prediction) rows, and returned.

    Returns
    -------
    ``(test_preds, test_targets)`` as two lists when ``test`` is True,
    otherwise the running average of ``mae`` over the loader.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    mae_errors = AverageMeter()

    if test:
        test_targets = []
        test_preds = []
        test_cif_ids = []

    # switch to evaluate mode
    model.eval()

    end = time.time()

    for i, (batch_input, target, batch_cif_ids) in enumerate(val_loader):
        crystal_fea = normalizer_crystal.norm(batch_input[4])
        target_normed = normalizer_target.norm(target)

        if is_cuda:
            input_var = (batch_input[0].cuda(non_blocking=True),
                         batch_input[1].cuda(non_blocking=True),
                         batch_input[2].cuda(non_blocking=True),
                         [crys_idx.cuda(non_blocking=True) for crys_idx in batch_input[3]],
                         crystal_fea.cuda(non_blocking=True))
            # Keep the target on the same device as the model output.
            target_var = target_normed.cuda(non_blocking=True)
        else:
            input_var = (batch_input[0],
                         batch_input[1],
                         batch_input[2],
                         batch_input[3],
                         crystal_fea)
            target_var = target_normed

        # No gradients are needed for evaluation, so the forward pass and the
        # loss are computed without building the autograd graph.
        with torch.no_grad():
            # compute output
            output = model(*input_var)

            # Computing loss
            loss = cust_loss(output, target_var)

        # measure accuracy and record loss
        prediction = normalizer_target.denorm(output.data.cpu())
        mae_error = mae(prediction, target)
        losses.update(loss.item(), target.size(0))
        mae_errors.update(mae_error, target.size(0))

        if test:
            test_preds += prediction.view(-1).tolist()
            test_targets += target.view(-1).tolist()
            test_cif_ids += batch_cif_ids

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 100 == 0:
                print('Test: [{0}/{1}]\t' # print frequency = 100
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'MAE {mae_errors.val:.3f} ({mae_errors.avg:.3f})'.format(
                      i, len(val_loader), batch_time=batch_time, loss=losses,
                      mae_errors=mae_errors))

    if not test:
        return mae_errors.avg

    star_label = '**'
    import csv
    # newline='' is what the csv module requires of the file object; without
    # it every row is followed by a blank line on Windows.
    with open('test_results.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        for cif_id, target_value, pred in zip(test_cif_ids, test_targets,
                                              test_preds):
            writer.writerow((cif_id, target_value, pred))

    print(' {star} MAE {mae_errors.avg:.3f}'.format(star=star_label,mae_errors=mae_errors))
    return test_preds,test_targets

In [None]:
def mae(prediction, target):
    """
    Mean absolute percentage error between prediction and target.

    Both inputs are first mapped through ``10 ** x``; the error is then
    expressed relative to the (exponentiated) target, in percent.

    Parameters
    ----------
    prediction: torch.Tensor (N, 1)
    target: torch.Tensor (N, 1)

    Returns
    -------
    torch.Tensor
      Scalar mean of ``|10**target - 10**prediction| / 10**target * 100``
    """
    actual = 10 ** target
    predicted = 10 ** prediction
    percent_error = ((actual - predicted) / actual) * 100
    return percent_error.abs().mean()

class AverageMeter:
    """Tracks the latest value together with a weighted running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, the average, the weighted sum and the count."""
        for field in ('val', 'avg', 'sum', 'count'):
            setattr(self, field, 0)

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialize ``state`` to ``filename`` with ``torch.save``.

    When ``is_best`` is true the freshly written file is additionally copied
    to 'model_best.pth.tar' in the current working directory.
    """
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')


def adjust_learning_rate(optimizer, epoch, k):
    """Sets the learning rate to the initial LR decayed by 10 every k epochs

    The original read the initial LR from a global ``args`` object that does
    not exist in this notebook. The initial LR is now taken per parameter
    group: on the first call the group's current ``lr`` is remembered under
    the ``'initial_lr'`` key, and later calls decay from that stored value.

    Parameters
    ----------
    optimizer: object with a ``param_groups`` list of dicts holding ``'lr'``
    epoch: int
      Current epoch number
    k: int
      Number of epochs between successive 10x decays (must be exactly ``int``)
    """
    assert type(k) is int
    decay = 0.1 ** (epoch // k)
    for param_group in optimizer.param_groups:
        initial_lr = param_group.setdefault('initial_lr', param_group['lr'])
        param_group['lr'] = initial_lr * decay

In [None]:
# Wrap the pre-computed crystal graphs in the project's dataset class.
dataset = CIFData(crystal_features, root_directory_2)
collate_fn = collate_pool

# 80 / 10 / 10 split by ratio; the explicit *_size overrides are left unset
# and the test loader is requested as a third return value.
split_settings = dict(
    train_ratio=0.8,
    val_ratio=0.1,
    test_ratio=0.1,
    train_size=None,
    val_size=None,
    test_size=None,
    return_test=True,
)

# NOTE(review): num_workers is presumably forwarded to torch's DataLoader,
# where 0 means "load in the main process" -- the original "All workers"
# remark looks inaccurate; confirm against Data.py.
train_loader, val_loader, test_loader = get_train_val_test_loader(
    dataset=dataset,
    collate_fn=collate_fn,
    batch_size=128,
    num_workers=0,
    pin_memory=is_cuda,
    **split_settings)

In [None]:
# Sampling up to 500 data points at random (without replacement) from the
# dataset. random.sample raises ValueError when asked for more items than
# exist, so smaller datasets are used in full.
if len(dataset) < 500:
    warnings.warn('Dataset has less than 500 data points; '
                  'using all of them to estimate the normalization statistics.')
sample_data_list = [dataset[i]
                    for i in sample(range(len(dataset)), min(500, len(dataset)))]

In [None]:
# Collate the random sample into one batch; the third item returned by
# collate_pool is not needed here.
sample_input, sample_target, _ = collate_pool(sample_data_list)
# Keep only the first item of the first dataset entry; its tensors are used
# further down to read off the feature lengths for the model.
structures, _, _ = dataset[0] # Extracting only the first element of the data set
# Normalization statistics come from the sampled batch, not the whole dataset.
normalizer_target = Normalizer(sample_target)
# sample_input[4] is the same slot that train/validate normalize as
# "crystal features" -- presumably the per-crystal feature matrix; confirm
# against collate_pool.
normalizer_crystal = Normalizer(sample_input[4])

In [None]:
# Feature widths are read off the first dataset entry (`structures`).
orig_atom_fea_len = structures[0].shape[-1] # Number of features in the atomic feature vector
nbr_fea_len = structures[1].shape[-1] # Number of features in the neighbor feature vector
crystal_fea_len = structures[3].shape[-1] # Number of additional crystal features

model = CrystalGraphConvNet(orig_atom_fea_len, nbr_fea_len,
                            atom_fea_len = 50, # First layer of linear transformation before convolution
                            n_conv = 3, # Number of convolution layers
                            h_fea_len = 100, # Number of units in first hidden layer of fully connected network
                            n_h = 2, # Number of fully connected layers
                            crystal_fea_len = crystal_fea_len,
                            classification = False) # Regression

if is_cuda:
    model.cuda()

# The loss is cust_loss (called inside train / validate), not nn.MSELoss.
optimizer = optim.Adam(model.parameters(), 0.0001, weight_decay = 0.1)
# LR is multiplied by 0.1 once the scheduler has been stepped 100 times;
# with the 10 epochs below that milestone is never reached.
scheduler = MultiStepLR(optimizer, milestones = [100],gamma = 0.1)

start_epoch = 1
end_epoch = 11 # 10 epochs

# Best validation error seen so far. This must be initialised once, before
# the loop: resetting it every epoch (as the original did) marks every epoch
# as "best", so model_best.pth.tar would always be the last epoch.
best_mae_error = 1e10

for epoch in range(start_epoch, end_epoch):

    # train for one epoch
    train(train_loader, model, optimizer, epoch, normalizer_target, normalizer_crystal)

    # evaluate on validation set
    mae_error = validate(val_loader, model, normalizer_target, normalizer_crystal)

    # NaN is the only value that is not equal to itself.
    if mae_error != mae_error:
        print('Exit due to NaN')
        sys.exit(1)

    scheduler.step()

    # remember the best mae_error and save checkpoint
    is_best = mae_error < best_mae_error
    best_mae_error = min(mae_error, best_mae_error)

    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'best_mae_error': best_mae_error,
        'optimizer': optimizer.state_dict(),
        'normalizer': normalizer_target.state_dict(),
        # The model input is normalized with this second normalizer, so its
        # statistics are stored as well (additional key; existing keys are
        # unchanged).
        'normalizer_crystal': normalizer_crystal.state_dict(),
    }, is_best)

# test best model
print('---------Evaluate Model on Test Set---------------')
best_checkpoint = torch.load('model_best.pth.tar')
model.load_state_dict(best_checkpoint['state_dict'])
# validate(..., test=True) returns (predictions, targets) in that order.
test, target = validate(test_loader, model, normalizer_target, normalizer_crystal, test=True)

In [None]:
import xlsxwriter

# Export the test-set results, one row per sample:
# first column = target, second column = prediction.
# The context manager closes the workbook even if a write fails part-way
# through, instead of relying on reaching an explicit close() call.
with xlsxwriter.Workbook('Full dataset (CGCNN test results).xlsx') as workbook:
    worksheet = workbook.add_worksheet()

    # Iterate over the data and write it out row by row
    for row, (target_value, predicted_value) in enumerate(zip(target, test)):
        worksheet.write(row, 0, target_value)
        worksheet.write(row, 1, predicted_value)