MLP Tool

In [1]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

from numpy import genfromtxt

# Load data into a Pandas DataFrame
x_data = genfromtxt('../data/embs/embspKA-stef/1layerNN-1L1NLbias258/601Oembs.csv',delimiter=',',encoding='utf-8-sig',skip_header=1)

# Split data into X and y
X = x_data[:,0:128]
#128 --> pka, NMR 
#138 onwards --> electron density 
y = x_data[:,128]

label = x_data[:,129:133]

#label to split as well
# Assuming your dataset is stored in X and y
# X should contain the first 128 features, and y should contain the corresponding target values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test, label_train, label_test = train_test_split(X, y, label, test_size=0.2, random_state=50)

use_train_for_test = False 


# Standardize the data
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1))

# Create an MLPRegressor model
model = MLPRegressor(hidden_layer_sizes=(200,200), activation='logistic', random_state=42)

# Train the model
model.fit(X_train_scaled, y_train_scaled)


if use_train_for_test == True:
    # Make predictions
    predictions_scaled = model.predict(X_train_scaled)
    predictions = scaler_y.inverse_transform(predictions_scaled)

    # Evaluate the model
    mse = mean_squared_error(y_train, predictions)
    r2 = r2_score(y_train, predictions)
else:
    # Make predictions
    predictions_scaled = model.predict(X_test_scaled)
    predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1))

    # Evaluate the model
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

import math
print("Mean Square-root Error:", math.sqrt(mse))
print(r2)

  y = column_or_1d(y, warn=True)


Mean Square-root Error: 1.4925829589642705
0.819563985498875




Test the ones from 5000-6000 

In [21]:
test_data = genfromtxt('../data/embs/old/model1-10000/layer6/Oembslayer6-withcarbacid.csv',delimiter=',',encoding='utf-8-sig',skip_header=1)

print(test_data)
# Split data into X and y
#FOR CARBON 5000-6000
#X_test = test_data[23769:29442,0:128]
#y_test = test_data[23769:29442,137]

#FOR HYDRO 5000-6000
#NOTE the last H in this set is the carb acid triply fluorinated! 
X_test = test_data[5871:7004,0:128]
#y_test = test_data[36244:44601,137]

#FOR Oxy 5000-6000
#NOTE the last H in this set is the carb acid triply fluorinated! 
predictions_scaled = model.predict(X_test)
predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1))


#FOR NMR

y_test = predictions

[[ 6.73119770e-01 -9.51074620e-01  7.91478950e-01 ...  5.00000000e+00
  -2.04606960e+03             nan]
 [-5.69457640e-01  1.96116492e-01  1.64892563e+00 ...  5.00000000e+00
  -2.04749130e+03             nan]
 [-1.32256100e-02 -3.45614160e-01  6.69564300e-01 ...  5.00000000e+00
  -2.04654080e+03             nan]
 ...
 [-6.57988785e-01 -1.54054826e-01  2.38495001e+00 ...  5.00000000e+00
  -2.04825590e+03             nan]
 [ 1.14769748e+00 -3.87425200e-02  1.13655392e+00 ...  5.00000000e+00
  -2.04804270e+03             nan]
 [ 2.69633990e-02  1.01960132e+00  1.01233864e+00 ...  5.00000000e+00
  -2.04791310e+03             nan]]


In [33]:
print(len(y_test))

import numpy as np

testfile_path = '../carbacidd1-p2.csv'
pert_test_data = genfromtxt(testfile_path,delimiter=',',encoding='utf-8-sig',skip_header=0)
O_test_data = pert_test_data[pert_test_data[:,129] == 8]

print(len(O_test_data))

1133
1133


In [34]:
import numpy as np


# Split data into X and y
X_test = O_test_data[:,0:128]
#128 --> pka, NMR 
#138 onwards --> electron density 

predictions_scaled = model.predict(X_test)
predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1))

# Evaluate the model
mse = mean_squared_error(y_test.reshape(-1,1), predictions)
r2 = r2_score(y_test, predictions)

import math
print("Mean Square-root Error:", math.sqrt(mse))
print(r2)

pred_vs_true = np.hstack((predictions,y_test.reshape(-1,1)))


Mean Square-root Error: 1.3932548787099177
0.913173752657863


In [35]:
#Carb acid H, eV
print(predictions[-2])

[2.78975962]


In [13]:
import numpy as np
np.savetxt('../data/perturbation/targetall-useavg-subavg/qm9-5000-6000-all/mlp/Haewcarbacid/d1-p1-predvstru.csv',pred_vs_true,delimiter=',')

In [None]:
Carbacid ae over perts

1 --> -16.84337022
2 --> -16.84757487
3 --> -16.92961348
4 --> -16.91248546

In [None]:
import numpy as np

print(np.shape(label_train))
print(np.shape(predictions))

print(predictions)

for i in range(np.shape(predictions)[1]):

    if use_train_for_test == True: 
        save_predictions = np.column_stack((predictions[:,i],y_train[:,i],label_train))
    elif use_train_for_test == False: 
        save_predictions = np.column_stack((predictions[:,i],y_test[:,i],label_test))
        
    save_pred_filepath = '../data/embs/embsdensity-ince/mlptestpredictions%s.csv'  %(str(i)) 
    np.savetxt(save_pred_filepath,save_predictions,delimiter=',')


In [None]:
import numpy as np

save_pred_filepath = '../data/embs/embsdensity-ince/mlptestpredictions300-300-300relu.csv'

predictions = np.column_stack((y_pred,y_test,label_test))

np.savetxt(save_pred_filepath,predictions,delimiter=',')