In [None]:
! pip install -U kaleido 
! pip install joblib optuna torch numpy 

In [None]:
import joblib
import random
import pandas as pd
import os
import optuna 
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
from torchmetrics import R2Score
import numpy as np
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [None]:
# Per-fold index lookup: FOLD_DICT[fold]["train"] / ["test"] are used as positional row
# selectors into USE_DF. joblib.load unpickles the file, so only load it from a trusted source.
FOLD_DICT = joblib.load("../input/perov-fold-data/fold_data_export.z")
# Scaled training table; it must contain TAR_COL, and every other column is used as a feature.
USE_DF = pd.read_csv("../input/perov-scaled-data/scaled_trainable.csv")
# Name of the regression target column.
TAR_COL = "JV_default_PCE_numeric"
# Upper bound on training epochs per run (early stopping can end a run sooner).
EPOCHS = 700
# Number of cross-validation folds; presumably matches the keys 0..K_FOLD-1 of FOLD_DICT — TODO confirm.
K_FOLD = 20
# Input width given to the MLP; presumably the number of non-target columns in USE_DF — TODO confirm.
IN_FEATURES = 103 

In [None]:
# Dataset wrapper that exposes dataframe rows to a DataLoader
class CustDataset(Dataset):
    """Map-style dataset built from a dataframe that contains ``TAR_COL``.

    The ``TAR_COL`` column is stored as ``labels``; all remaining columns are
    stored row-aligned as ``features``. Both are float64 numpy arrays.
    """

    def __init__(self, df):
        target = df[TAR_COL]
        predictors = df.drop([TAR_COL], axis=1)
        self.labels = target.to_numpy(dtype=np.float64)
        self.features = predictors.to_numpy(dtype=np.float64)

    def classes(self):
        """Return the full label array."""
        return self.labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` as a new numpy array."""
        return np.array(self.labels[idx])

    def get_batch_features(self, idx):
        """Return the feature row(s) selected by ``idx`` as a new numpy array."""
        return np.array(self.features[idx])

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair for ``idx``."""
        return self.get_batch_features(idx), self.get_batch_labels(idx)


In [None]:
# Fully connected regression network
class MLP(torch.nn.Module):
    """Feed-forward regressor: in_feature -> 128 -> 64 -> 32 -> 1.

    ReLU follows each hidden layer, and batch normalisation is applied to the
    activated output of the first hidden layer. Every linear layer is
    initialised with Xavier-uniform weights and zero biases.
    """

    def __init__(self, in_feature):
        super().__init__()
        self.in_feature = in_feature
        self.hid1 = nn.Linear(in_feature, 128)
        # Registered on the module but not applied in forward().
        self.dropout = nn.Dropout(0.25)
        self.batchnorm1 = nn.BatchNorm1d(128)
        self.hid2 = nn.Linear(128, 64)
        self.hid3 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)

        # Same initialisation order as the layer definitions above.
        for layer in (self.hid1, self.hid2, self.hid3, self.output):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Map a (batch, in_feature) input to a (batch, 1) prediction."""
        hidden = self.batchnorm1(torch.relu(self.hid1(x)))
        hidden = torch.relu(self.hid2(hidden))
        hidden = torch.relu(self.hid3(hidden))
        return self.output(hidden)

In [None]:
# Factory indirection so callers do not name the concrete model class

def build_model(in_features):
    """Return a newly constructed MLP that accepts ``in_features`` inputs."""
    network = MLP(in_features)
    return network
    

In [None]:
def save_optuna_plots(study, dirname, width=1200, height=800):
    """Render the Optuna diagnostic plots for ``study`` and save them as JPEG files.

    Writes ``optim_hist.jpg``, ``intermediate.jpg``, ``parallel.jpg`` and
    ``plot_slice.jpg`` into ``./{dirname}``, creating the directory if needed.

    Args:
        study: Optuna study to visualise.
        dirname: Output directory, relative to the current working directory.
        width: Exported image width in pixels.
        height: Exported image height in pixels.
    """
    out_dir = f"./{dirname}"
    # Create once, up front; exist_ok also makes re-running the cell safe.
    os.makedirs(out_dir, exist_ok=True)

    plots = {
        "optim_hist": optuna.visualization.plot_optimization_history(study),
        "intermediate": optuna.visualization.plot_intermediate_values(study),
        "parallel": optuna.visualization.plot_parallel_coordinate(study),
        "plot_slice": optuna.visualization.plot_slice(study),
    }
    for name, figure in plots.items():
        # write_image takes pixel dimensions, so these must be large enough to read.
        figure.write_image(f"{out_dir}/{name}.jpg", width=width, height=height)

In [None]:
def train_eval(params, model, fold, trial):
    """Train ``model`` on one fold and return its validation loss for Optuna.

    Args:
        params: dict with "optimizer" (name of a class in ``torch.optim``)
            and "learning_rate".
        model: network to train; moved to the GPU when one is available.
        fold: key into ``FOLD_DICT`` selecting the train/test row indices.
        trial: Optuna trial that receives the per-epoch validation loss so the
            study's pruner can act on it. Pass ``None`` to skip reporting.

    Returns:
        Mean of the per-batch validation MSE from the last epoch that ran.

    Raises:
        optuna.TrialPruned: if the pruner decides to stop this trial early.
    """
    # Loading data
    train_index = FOLD_DICT[fold]["train"]
    test_index = FOLD_DICT[fold]["test"]
    train_loader = DataLoader(CustDataset(USE_DF.iloc[train_index, :]), batch_size=32, shuffle=False)
    val_loader = DataLoader(CustDataset(USE_DF.iloc[test_index, :]), batch_size=32, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    criterion = nn.MSELoss()
    optimizer = getattr(optim, params["optimizer"])(model.parameters(), lr=params["learning_rate"])

    # Custom early stopping on the summed validation loss
    patience = 5
    best_val_loss = float("inf")
    counter = 0
    loss_list = []  # defined up front so the return value exists even if no epoch runs

    for epoch_num in range(EPOCHS):
        # model.eval() below persists across iterations; without this call BatchNorm
        # would stay in inference mode for every epoch after the first.
        model.train()
        for train_x, train_y in train_loader:
            train_x = train_x.to(device).float()
            train_y = train_y.to(device).float().reshape(-1, 1)

            optimizer.zero_grad()
            batch_loss = criterion(model(train_x), train_y)
            batch_loss.backward()
            optimizer.step()

        # Validation pass
        model.eval()
        loss_list = []
        with torch.no_grad():
            for val_x, val_y in val_loader:
                val_x = val_x.to(device).float()
                val_y = val_y.to(device).float().reshape(-1, 1)
                loss_list.append(criterion(model(val_x), val_y).item())

        loss = sum(loss_list)
        epoch_loss = float(np.mean(loss_list))
        print(f"Validating:[{epoch_num+1}/{EPOCHS}] LOSS: {epoch_loss}")

        # Feed the pruner; without a report the study's pruner never has data to act on.
        if trial is not None:
            trial.report(epoch_loss, epoch_num)
            if trial.should_prune():
                raise optuna.TrialPruned()

        if loss < best_val_loss:
            best_val_loss = loss
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print(f"[==] Early Stopping at {loss}")
                break

    main_loss = np.mean(loss_list)
    return main_loss

In [None]:
# Optuna objective: one trial is one training run on the pre-selected fold
def objective(trial):
    """Sample hyperparameters, train a new model and return its validation loss.

    Reads the module-level ``TRIAL_FOLD``, which must be set before the study runs.

    Args:
        trial: Optuna trial used to sample "learning_rate" and "optimizer".

    Returns:
        The validation loss returned by ``train_eval`` (lower is better).
    """
    params = {
        # The range spans four orders of magnitude; sampling it uniformly would put
        # about 99% of the draws above 1e-3, so sample in log space instead.
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"]),
    }

    model = build_model(IN_FEATURES)
    return train_eval(params, model, TRIAL_FOLD, trial)

In [None]:
# Search for the best hyperparameters on one randomly chosen fold
NUM_TRIALS = 30
TRIAL_FOLD = random.choice(range(K_FOLD))
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(),
    pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=NUM_TRIALS)
# Export the study diagnostics as images under ./optuna_plots
save_optuna_plots(study, "optuna_plots")


In [None]:
# Keep the best trial's sampled hyperparameters for the cross-validation run below;
# the bare expression on the last line displays them as the cell output.
best_param = study.best_params
best_param

In [None]:
# Train and evaluate one cross-validation fold with the tuned hyperparameters
def main_train(best_param, model, fold):
    """Train ``model`` on one fold and score it on that fold's held-out rows.

    Args:
        best_param: dict with "optimizer" (name of a class in ``torch.optim``)
            and "learning_rate".
        model: network to train; moved to the GPU when one is available.
        fold: key into ``FOLD_DICT`` selecting the train/test row indices.

    Returns:
        ``(error, model)``. ``error`` maps "mse_error", "mae_error",
        "rmse_error" and "r2_score" to values computed over all validation
        samples of the last epoch that ran (NaN when there were none).
        ``model`` is the trained network, left in eval mode.
    """
    # Loading data
    train_index = FOLD_DICT[fold]["train"]
    test_index = FOLD_DICT[fold]["test"]
    train_loader = DataLoader(CustDataset(USE_DF.iloc[train_index, :]), batch_size=32, shuffle=False)
    val_loader = DataLoader(CustDataset(USE_DF.iloc[test_index, :]), batch_size=32, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    criterion = nn.MSELoss()
    optimizer = getattr(optim, best_param["optimizer"])(model.parameters(), lr=best_param["learning_rate"])

    # Custom early stopping on the summed validation loss
    patience = 5
    best_val_loss = float("inf")
    counter = 0

    # Predictions/targets of the most recent validation pass
    preds, targets = [], []

    for epoch_num in range(EPOCHS):
        # model.eval() below persists across iterations; without this call BatchNorm
        # would stay in inference mode for every epoch after the first.
        model.train()
        for train_x, train_y in train_loader:
            train_x = train_x.to(device).float()
            train_y = train_y.to(device).float().reshape(-1, 1)

            optimizer.zero_grad()
            batch_loss = criterion(model(train_x), train_y)
            batch_loss.backward()
            optimizer.step()

        # Validation pass
        model.eval()
        batch_losses = []
        preds, targets = [], []
        with torch.no_grad():
            for val_x, val_y in val_loader:
                val_x = val_x.to(device).float()
                val_y = val_y.to(device).float().reshape(-1, 1)

                output = model(val_x)
                batch_losses.append(criterion(output, val_y).item())
                preds.append(output.cpu())
                targets.append(val_y.cpu())

        loss = sum(batch_losses)
        print(f"Validating:[{epoch_num+1}/{EPOCHS}] LOSS: {float(np.mean(batch_losses))}")

        if loss < best_val_loss:
            best_val_loss = loss
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print(f"[==] Early Stopping at {loss}")
                break

    # Metrics are computed once over the whole validation fold. Averaging per-batch
    # RMSE / R^2 is not equal to the fold-level value and breaks on tiny final batches.
    if preds:
        y_pred = torch.cat(preds).numpy().astype(np.float64).ravel()
        y_true = torch.cat(targets).numpy().astype(np.float64).ravel()
        residual = y_true - y_pred
        ss_res = np.sum(residual ** 2)
        ss_tot = np.sum((y_true - y_true.mean()) ** 2)

        sq_mean_loss = np.mean(residual ** 2)
        abs_mean_loss = np.mean(np.abs(residual))
        r_sq_mean_loss = np.sqrt(sq_mean_loss)
        # R^2 is undefined when the targets have zero variance (e.g. a single sample).
        r2_value = 1.0 - ss_res / ss_tot if ss_tot > 0 else float("nan")
    else:
        sq_mean_loss = abs_mean_loss = r_sq_mean_loss = r2_value = float("nan")

    error = {
        "mse_error": sq_mean_loss,
        "mae_error": abs_mean_loss,
        "rmse_error": r_sq_mean_loss,
        "r2_score": r2_value,
    }
    return error, model

In [None]:
def save_cv_model(i, model_name, model, optim, losses, output_path="./"):
    """Save one cross-validation fold's model and metrics to disk.

    Creates ``{output_path}/{i}_{model_name}_{optim}/`` (including missing
    parent directories) if it does not exist, then writes the model to
    ``{i}_model.z`` with ``torch.save`` and the metrics to ``losses_{i}.txt``.

    Args:
        i: Fold index; used in the directory name and both file names.
        model_name: Model label used in the directory name.
        model: Object to serialise with ``torch.save``.
        optim: Optimizer label used in the directory name.
        losses: Metrics to record; written via ``str(losses)``.
        output_path: Parent directory for the per-fold folder.
    """
    fold_dir = os.path.join(output_path, f"{i}_{model_name}_{optim}")
    os.makedirs(fold_dir, exist_ok=True)

    torch.save(model, os.path.join(fold_dir, f"{i}_model.z"))

    # The file is named after this call's own fold index rather than a global
    # variable, so the function works outside the notebook's training loop.
    with open(os.path.join(fold_dir, f"losses_{i}.txt"), "w") as file:
        file.write(f" mse_loss :: {str(losses)}")

In [None]:
# Labels used in the per-fold output directory names
model_name = "MLP"
optim_name = best_param["optimizer"]

# Train a new model on every fold with the tuned hyperparameters, then save it with its metrics
for fold in range(K_FOLD):
    dum_model = build_model(IN_FEATURES)
    print(f"Training for fold [{fold+1}/{K_FOLD}] started ")
    error, model = main_train(best_param, dum_model, fold)
    print(f"Saving data for fold [{fold+1}/{K_FOLD}]")
    save_cv_model(fold, model_name, model, optim_name, error, output_path="./")