In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from fastai.vision.all import *
from fastai.callback import *
import numpy as np
import pandas as pd
import PIL
from rdkit import Chem
from rdkit.Chem import Draw, AllChem
from pathlib import Path
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, r2_score
import warnings
# Suppress every warning category for the rest of the session (this also hides
# deprecation notices from the libraries imported above).
warnings.filterwarnings("ignore")
# CUDA device handle. The visible cells below pass the string 'cuda' directly
# rather than using this variable.
device = torch.device("cuda")

In [None]:
def random_seed(seed_value):
    """Seed the Python, NumPy and PyTorch random generators with one value.

    When CUDA is available the GPU generators are seeded too, and cuDNN is put
    into deterministic mode with its benchmark autotuner switched off.

    Args:
        seed_value: Seed handed unchanged to every generator.
    """
    # Function-scope imports: the helper works regardless of what the
    # notebook has imported at module level.
    import random
    import numpy as np
    import torch

    # CPU-side generators.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    if not torch.cuda.is_available():
        return

    # GPU-side generators (current device, then all devices).
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Trade cuDNN speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


random_seed(2023)

In [None]:
import torch
import numpy as np
from torch.utils.data import Dataset
from torchvision.transforms import Compose, Normalize, RandomRotation, ToTensor

class AugmentedFloatArrayDataset(Dataset):
    """Map-style dataset over image arrays with optional random-rotation augmentation.

    Every item is converted with ``ToTensor`` (H x W x C array -> C x H x W
    tensor) and then normalised per channel. ``ToTensor`` rescales only uint8 /
    PIL input, so float arrays keep the values they were built with.

    With ``augment=True`` the tensor is additionally rotated by a random angle
    drawn from [0, 360] degrees. The corners exposed by the rotation are
    filled, channel by channel, with that channel's top-left pixel value, so a
    constant channel stays constant after rotation.

    Args:
        arrays: Indexable collection of 3-channel H x W x C image arrays
            (the notebook passes 224 x 224 x 3 float32 arrays).
        targets: Indexable collection of scalar regression targets aligned
            with ``arrays``.
        augment: Apply the random rotation when True.
    """

    # Mean/std values conventionally used with ImageNet-pretrained torchvision models.
    _NORM_MEAN = [0.485, 0.456, 0.406]
    _NORM_STD = [0.229, 0.224, 0.225]

    def __init__(self, arrays, targets, augment=False):
        self.arrays = arrays
        self.targets = targets
        self.augment = augment
        # Stateless transforms are built once here instead of on every
        # __getitem__ call.
        self.to_tensor = ToTensor()
        self.normalize = Normalize(mean=self._NORM_MEAN, std=self._NORM_STD)
        # NOTE: __getitem__ does not use this pipeline (its rotation has no
        # per-sample fill value). It is kept unchanged only so that code
        # reading ``dataset.transform`` keeps working.
        self.transform = Compose([
            ToTensor(),
            RandomRotation(degrees=(0, 360)),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        """Return the number of samples."""
        return len(self.arrays)

    def __getitem__(self, idx):
        """Return ``(image_tensor, target_tensor)`` for sample ``idx``."""
        image = self.to_tensor(self.arrays[idx])

        if self.augment:
            # The fill depends on the sample, so the rotation transform has to
            # be created per item. Plain floats are passed instead of 0-dim
            # tensors; torchvision converts the fill to floats either way.
            corner_fill = [float(image[channel, 0, 0]) for channel in range(3)]
            image = RandomRotation(degrees=(0, 360), fill=corner_fill)(image)

        image = self.normalize(image)
        target = torch.tensor(self.targets[idx], dtype=torch.float)
        return image, target

In [None]:
# NOTE(review): train_x, train_y, val_x and val_y are not assigned at top level
# in any cell above this one; in the visible cells they exist only as locals of
# train_model. This cell therefore presumably relies on arrays left over from an
# earlier interactive session -- confirm before running the notebook top to bottom.
# A later cell may reuse the global `dls` created here; check before removing it.
train_dataset = AugmentedFloatArrayDataset(train_x, train_y, augment=True)
valid_dataset = AugmentedFloatArrayDataset(val_x, val_y, augment=False)
dls = DataLoaders.from_dsets(train_dataset, valid_dataset, bs=16, device='cuda')

In [None]:
def _ion_pair_features(frame, p_max, t_max):
    """Convert every row of ``frame`` into a 224 x 224 x 3 feature image.

    Channel 0 is the grey-scale (RGB mean) drawing of the cation stacked on top
    of the drawing of the anion. Channels 1 and 2 are constant planes holding
    the row's ``P`` and ``T`` divided by ``p_max`` and ``t_max``.

    Args:
        frame: DataFrame with columns 'new_cation', 'new_anion', 'P', 'T'
            and 'CO2-exp'.
        p_max: Divisor applied to the 'P' column.
        t_max: Divisor applied to the 'T' column.

    Returns:
        Tuple ``(features, targets)`` of float32 arrays with shapes
        (n, 224, 224, 3) and (n,).

    Raises:
        ValueError: If RDKit cannot parse one of the SMILES strings.
    """
    features, targets = [], []
    rows = zip(frame.index, frame['new_cation'], frame['new_anion'],
               frame['P'], frame['T'], frame['CO2-exp'])
    for label, cation, anion, pressure, temperature, target in rows:
        halves = []
        for role, smiles in (('new_cation', cation), ('new_anion', anion)):
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # Fail with the offending row instead of an opaque drawing error.
                raise ValueError(f'Row {label}: cannot parse {role} SMILES {smiles!r}')
            # size is (width, height): each ion fills one 112-pixel-high half.
            halves.append(np.array(Draw.MolToImage(mol, size=(224, 112), fitImage=True)))
        img = np.concatenate(halves, axis=0) / 255
        gray = (img[:, :, 0] + img[:, :, 1] + img[:, :, 2]) / 3
        p_plane = np.full((224, 224), pressure).astype(np.float64) / p_max
        t_plane = np.full((224, 224), temperature).astype(np.float64) / t_max
        features.append(np.stack((gray, p_plane, t_plane), axis=2))
        targets.append(target)
    return np.array(features).astype('float32'), np.array(targets).astype('float32')


def train_model(traindata, valdata, seed,group):
    """Train a DenseNet-121 regressor on one train/validation fold.

    Both frames are turned into image features, a learner is created with an
    MSE loss, a learning rate is chosen with ``lr_find`` (the ``valley``
    suggestion) and the model is fine-tuned for up to 3000 epochs with early
    stopping on ``valid_loss`` (patience 100). ``SaveModelCallback`` writes the
    checkpoint under the name ``d121_CO2_{group}.model``.

    Args:
        traindata: Training DataFrame (columns 'new_cation', 'new_anion',
            'P', 'T', 'CO2-exp').
        valdata: Validation DataFrame with the same columns.
        seed: Value passed to ``random_seed`` before building the data
            loaders, before ``lr_find`` and before ``fine_tune``.
        group: Fold identifier used in the log line and checkpoint name.

    Raises:
        ValueError: If a SMILES string in either frame cannot be parsed.
    """
    # Scale factors come from the training fold only and are reused for the
    # validation fold; computing them once avoids a full column scan per row.
    p_max = traindata['P'].max()
    t_max = traindata['T'].max()
    train_x, train_y = _ion_pair_features(traindata, p_max, t_max)
    val_x, val_y = _ion_pair_features(valdata, p_max, t_max)

    random_seed(seed)
    train_dataset = AugmentedFloatArrayDataset(train_x, train_y, augment=True)
    valid_dataset = AugmentedFloatArrayDataset(val_x, val_y, augment=False)
    dls = DataLoaders.from_dsets(train_dataset, valid_dataset, bs=64, device='cuda')

    print(f'Training starting for {group}')
    learn = vision_learner(dls, densenet121, pretrained=True, metrics=rmse,
                           normalize=False, n_out=1, loss_func=MSELossFlat())
    random_seed(seed)
    lrs = learn.lr_find(suggest_funcs=(minimum, steep, valley, slide))
    random_seed(seed)
    with learn.no_bar(), learn.no_logging():
        learn.fine_tune(3000, lrs.valley, cbs=[SaveModelCallback(fname= f'd121_CO2_{group}.model'),
                                               EarlyStoppingCallback(monitor='valid_loss', patience=100)])

In [None]:
# One seed shared by all folds.
seed =2024
# Train one model per fold (groups 1 to 5); each fold reads its own train and
# validation CSV from the data/ directory.
# NOTE: `traindata` and `valdata` stay bound to the last fold's frames after the
# loop finishes; later cells may read them, so keep the names.
for i in range(1,6):
    traindata = pd.read_csv(f'data/train_{i}_group_co2.csv')
    valdata = pd.read_csv(f'data/val_{i}_group_co2.csv')
    train_model(traindata, valdata, seed=seed, group =i)

In [None]:
# Build the test-set features: grey-scale cation/anion drawing in channel 0,
# scaled P and T as constant planes in channels 1 and 2.
test_data = pd.read_csv('data/test_group_co2.csv')

# This cell used to divide by `traindata[...]`.max(), i.e. by whatever frame the
# training loop left behind (the fold-5 training data when cells run in order).
# Loading that fold explicitly gives the same scale factors without requiring
# the training loop to have run in this kernel session.
# NOTE(review): each fold's model was trained with its own fold's maxima; if
# those differ between folds, features should be rebuilt per model -- confirm.
scale_reference = pd.read_csv('data/train_5_group_co2.csv')
p_max = scale_reference['P'].max()
t_max = scale_reference['T'].max()

test_images = []
test_targets = []
for i in test_data.index:
    halves = []
    for column in ('new_cation', 'new_anion'):
        smiles = test_data[column][i]
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Fail with the offending row instead of an opaque drawing error.
            raise ValueError(f'Row {i}: cannot parse {column} SMILES {smiles!r}')
        # size is (width, height): each ion fills one 112-pixel-high half.
        halves.append(np.array(Draw.MolToImage(mol, size=(224, 112), fitImage=True)))
    img = np.concatenate(halves, axis=0) / 255
    gray = (img[:, :, 0] + img[:, :, 1] + img[:, :, 2]) / 3
    p_plane = np.full((224, 224), test_data['P'][i]).astype(np.float64) / p_max
    t_plane = np.full((224, 224), test_data['T'][i]).astype(np.float64) / t_max
    test_images.append(np.stack((gray, p_plane, t_plane), axis=2))
    test_targets.append(test_data['CO2-exp'][i])
test_x = np.array(test_images).astype('float32')
test_y = np.array(test_targets).astype('float32')

In [None]:
# Wrap the test arrays without augmentation: items are only converted to
# tensors and normalised, never rotated.
test_dataset = AugmentedFloatArrayDataset(test_x, test_y, augment=False)

In [None]:
# Accumulate, over the five fold models, the per-sample predictions (and the
# targets) on the test set. Both arrays hold SUMS over the folds when the loop
# finishes.
n_test = len(test_dataset)  # sized from the data instead of a hard-coded 1324
true_final = np.zeros(shape=(n_test,))
pred_final = np.zeros(shape=(n_test,))

# vision_learner needs a DataLoaders object to build the learner. Build one from
# the test set so this cell does not depend on a `dls` left over from another
# cell or an earlier session.
inference_dls = DataLoaders.from_dsets(test_dataset, test_dataset, bs=64, device='cuda')

for g in range(1,6):
    # Same architecture and head as in training; the weights are then replaced
    # by the checkpoint saved for fold g.
    learn = vision_learner(inference_dls, densenet121, pretrained=True, metrics=rmse,
                           normalize=False, n_out=1, loss_func=MSELossFlat())
    learn = learn.load(f'd121_CO2_{g}.model')
    true = []
    test_pred = []
    for x, y in test_dataset:
        true.append(y)
        pred = learn.predict(x.unsqueeze(0))[1][0].numpy()
        test_pred.append(pred)
    true = np.array(true)
    test_pred = np.array(test_pred)
    true_final = true_final + true
    pred_final = pred_final + test_pred

In [None]:
# Both accumulators are sums over the five fold models: dividing by five gives
# the ensemble-mean prediction and recovers the per-sample target.
test_pred = pred_final / 5
test_true = true_final / 5