In [1]:
#Self-hyperparam selection: https://link.springer.com/article/10.1007/s11063-024-11578-0
#Self-pruning: https://github.com/skarifahmed/seMLP/blob/main/src/Prune.py

In [2]:
from utils import Utils
from color import color 
import pandas as pd
import numpy as np
import os
# libraries
import joblib

# scale features
from sklearn import preprocessing
from sklearn import impute
# classifier
from sklearn.ensemble import ExtraTreesClassifier
# scoring metrics
from sklearn.metrics import confusion_matrix, matthews_corrcoef

# custom scripts
import sys
sys.path.insert(0, "%s" % "CV/")
from sklearn.model_selection import train_test_split, GridSearchCV, GroupShuffleSplit, StratifiedShuffleSplit, cross_validate, StratifiedKFold
from sklearn.metrics import roc_curve, auc, recall_score, accuracy_score, precision_score, confusion_matrix, make_scorer, matthews_corrcoef, jaccard_score

[1mHello World ![0m


In [3]:
site_path = "/Users/sanjanayasna/csc334/MLP_MAHOMES/sites_calculated_features.txt"

In [4]:
#read in feature set:
sites = pd.read_csv(site_path)
sites = sites.set_index('SITE_ID',drop=True)

# The following labels need to be changed after looking over literature (see Feehan, Franklin, Slusky 2021)
change_site_labels = ["5zb8_0", "6aci_0", "6oq7_0", "6pjv_1", "6q55_0",
                      "6q55_2", "6rmg_0", "6rtg_0", "6rw0_0", "6v77_0"]

# The following sites are removed due to unkopwn correct labels (see Feehan, Franklin, Slusky 2021)
sites.loc[sites.index.isin(change_site_labels), 'Catalytic']=True
remove_sites = ["6mf0_1", "6okh_0", "6qwo_0", "6r9n_0"]
sites=sites.loc[~sites.index.isin(remove_sites)]

#print shape of dataset
print(color.BOLD + "All features:" + color.END)
print("sites: %s \tcolumns: %s"%(sites.shape[0], sites.shape[1]))
sizes = sites.groupby(["Set", "Catalytic"]).size()
print(sizes)

[1mAll features:[0m
sites: 3981 	columns: 485
Set   Catalytic
data  False        2636
      True          829
test  False         345
      True          171
dtype: int64


In [5]:
#save_models toggel
save_models = False
#pkl output path
pkl_out = r'/Users/sanjanayasna/csc334/MLP_MAHOMES/pkl'

In [6]:
sites.head()

Unnamed: 0_level_0,Catalytic,MetalCodes,MetalAtoms,fa_atr_Sum_3.5,fa_rep_Sum_3.5,fa_sol_Sum_3.5,fa_intra_atr_xover4_Sum_3.5,fa_intra_rep_xover4_Sum_3.5,fa_intra_sol_xover4_Sum_3.5,lk_ball_Sum_3.5,...,geom_cn8,geom_cn9,geom_Filled,geom_PartFilled,geom_AvgN,geom_AvgO,geom_AvgS,geom_AvgOther,SC_vol_perc,Set
SITE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6s9z_0,True,1,1,-33.20757,20.22373,26.34441,-1.88617,0.46054,2.14096,14.05052,...,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.910384,test
6g5l_0,True,1,1,-27.04899,39.17134,22.76555,-1.71942,0.45999,2.05517,12.94894,...,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.862189,test
6hwz_0,True,1,1,-27.30433,35.04867,23.45195,-1.62146,0.35902,1.91231,13.06378,...,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.991431,test
6qww_0,True,1,1,-25.36664,12.54178,27.17902,-1.14349,0.22087,1.68091,11.47631,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.864546,test
6qww_1,False,1,1,-30.53159,8.99318,27.77842,-1.00782,0.39657,1.04229,13.23736,...,0.0,0.0,1.0,0.0,0.0,4.0,0.0,1.0,0.990893,test


In [7]:
#Get scaled features
data_scaled, Tsites_scaled = Utils.get_scaled_features(sites =sites, pkl_out=pkl_out, save_models=save_models)
#Print stats
print(color.BOLD + "All scaled data-set features:" + color.END)
print("sites: %s \tcolumns: %s"%(data_scaled.shape[0], data_scaled.shape[1]))
print(data_scaled.groupby(["Catalytic"]).size())

print(color.BOLD + "\nAll scaled T-metal-site features:" + color.END)
print("sites: %s \tcolumns: %s"%(Tsites_scaled.shape[0], Tsites_scaled.shape[1]))
print(Tsites_scaled.groupby(["Catalytic"]).size())

[1mAll scaled data-set features:[0m
sites: 3465 	columns: 484
Catalytic
False    2636
True      829
dtype: int64
[1m
All scaled T-metal-site features:[0m
sites: 516 	columns: 484
Catalytic
False    345
True     171
dtype: int64


In [8]:
dir = "/Users/sanjanayasna/csc334/MLP_MAHOMES/data/"
#save the scaled data
data_scaled.to_csv(os.path.join(dir, "data_scaled.csv"))
Tsites_scaled.to_csv(os.path.join(dir, "Tsites_scaled.csv"))

In [9]:
#set feature set type
MAHOMES_feature_set = "AllMeanSph"

In [10]:
#Well sampled training data
#X is train
#y is target for train
X, y = Utils.get_training_data(MAHOMES_feature_set, random_seed = 1, data_scaled= data_scaled)
 ## prepare test-set
testX = Tsites_scaled.copy()
testY = testX['Catalytic']; del testX['Catalytic']
testX = Utils.feature_subset(testX, MAHOMES_feature_set, noBSA=True)

## get multiple predictions for test-set w/ diff random seeds
test_site_preds = {'actual': pd.Series(testY, index=testX.index)}

#Overview:
# X: training data
# y: target for training data
# testX: test data
# testY: target for test data

In [11]:
#Train 
init_features = len(X.columns)
print(init_features)

181


In [41]:
#Prelim mlp
#Possible avenue for bias and weight matrix initialization:
## Initialize weights using Xavier uniform initialization
# init.xavier_uniform_(linear_layer.weight)
 
# ## Initialize bias to zero
# init.zeros_(linear_layer.bias)
#---------------------------------
import torch
from torch import nn
from torch.utils.data import DataLoader
class MLP(nn.Module):  # nn.Module is the base class for all models in PyTorch
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(init_features, 181),
            nn.ReLU(),
            nn.Linear(181, 90),
            nn.ReLU(),
            nn.Linear(90, 1),
            #try to make output binary (0 or 1)
        )
    def forward(self, x):
     #   x =  self.layers(x)
        return self.layers(x)

In [42]:
#Loads to torch tensors
class dataLoader:
    #Use ONLY train data 
    def __init__(self, X, y):
        #converts x and y to numpy arr so they can be torch tensor
        if not torch.is_tensor(X) and not torch.is_tensor(y):
            X = X.to_numpy()
            y = y.to_numpy()
        #x_train
        # if not torch.is_tensor(X):
        #     self.X = torch.from_numpy(X)
        # #y_train
        # if not torch.is_tensor(y):
        #     self.y = torch.from_numpy(y)
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)
    def get_trainloader(dataset):
        return torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=True, num_workers=1)
    def get_testloader(dataset):
        return torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=True, num_workers=1)
    #to get lenght, for enumerator use
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [14]:
#to set num samples variable for dataset
num_samples = len(X)

In [15]:
import torch.utils.data.sampler as sampler
#Will use subsetRandomSampler (which assumes a shuffle=trfue data loading argument)

In [43]:
#initialize dataloader with random sampling of size 10 
dataset = dataLoader(X, y)
trainloader = torch.utils.data.DataLoader(dataset, batch_size=10, num_workers=0, shuffle = True)
testloader = torch.utils.data.DataLoader(dataset, batch_size=10, num_workers=0, shuffle=True)

In [44]:
#mlp init
mlp = MLP()
#set loss function and gradient descet optimizer
loss_function = nn.L1Loss()
optimizer = torch.optim.Adagrad(mlp.parameters(), lr=1e-4)

In [18]:
#check that enumerate works
enumerate(trainloader, 0)

<enumerate at 0x13b6d38d0>

In [45]:
#train for this many epochs
for epoch in range(0,10):
    print(f'Starting Epoch {epoch+1}')

    current_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        inputs, targets = data
        inputs, targets = inputs.float(), targets.float()
        targets = targets.reshape((targets.shape[0], 1))
        optimizer.zero_grad()

        outputs = mlp(inputs)

        loss = loss_function(outputs, targets)

        loss.backward()

        optimizer.step()

        current_loss += loss.item()
        if i%10 == 0:
            print(f'Loss after mini-batch %5d: %.3f'%(i+1, current_loss/500))
            current_loss = 0.0
    
    print(f'Epoch {epoch+1} done')

Starting Epoch 1
Loss after mini-batch     1: 0.001
Loss after mini-batch    11: 0.005
Loss after mini-batch    21: 0.005
Loss after mini-batch    31: 0.006
Loss after mini-batch    41: 0.005
Loss after mini-batch    51: 0.005
Loss after mini-batch    61: 0.006
Loss after mini-batch    71: 0.005
Loss after mini-batch    81: 0.004
Loss after mini-batch    91: 0.007
Loss after mini-batch   101: 0.005
Loss after mini-batch   111: 0.004
Loss after mini-batch   121: 0.003
Loss after mini-batch   131: 0.004
Loss after mini-batch   141: 0.005
Loss after mini-batch   151: 0.005
Loss after mini-batch   161: 0.006
Loss after mini-batch   171: 0.005
Loss after mini-batch   181: 0.005
Loss after mini-batch   191: 0.006
Loss after mini-batch   201: 0.005
Loss after mini-batch   211: 0.005
Loss after mini-batch   221: 0.004
Loss after mini-batch   231: 0.007
Loss after mini-batch   241: 0.005
Loss after mini-batch   251: 0.005
Loss after mini-batch   261: 0.005
Loss after mini-batch   271: 0.005
Los

In [None]:
test_data = torch.from_numpy(testX.to_numpy()).float()
test_targets = torch.from_numpy(testY.to_numpy()).float()
print("Test data outputs look like this", test_targets)

In [48]:
#Run mlp model on test data
mlp.eval() 

MLP(
  (layers): Sequential(
    (0): Linear(in_features=181, out_features=181, bias=True)
    (1): ReLU()
    (2): Linear(in_features=181, out_features=90, bias=True)
    (3): ReLU()
    (4): Linear(in_features=90, out_features=1, bias=True)
    (5): ReLU()
  )
)

In [49]:
from sklearn.metrics import mean_squared_error, r2_score
with torch.no_grad():
    outputs = mlp(test_data)
    predicted_labels = outputs.squeeze().tolist()

predicted_labels = np.array(predicted_labels)
test_targets = np.array(test_targets)

mse = mean_squared_error(test_targets, predicted_labels)
r2 = r2_score(test_targets, predicted_labels)
print("Mean Squared Error:", mse)
print("R2 Score:", r2)
# Mean Squared Error: 0.18271737790066617
# R2 Score: 0.17536060337896386

Mean Squared Error: 0.33139534953873107
R2 Score: -0.4956522703341628
