In [1]:
!pip install numpy==1.24.4



In [3]:
!pip install pandas



In [4]:
!pip install seaborn matplotlib



In [5]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import emoji
%matplotlib inline

In [6]:
#importing rdkit
!pip install rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit import RDConfig
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit import DataStructs
from rdkit.Chem.Subshape import SubshapeBuilder, SubshapeAligner, SubshapeObjects



In [7]:
!pip install scikit-learn



In [8]:
#importing sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

In [11]:
# Load the CSV file
df = pd.read_csv('reduced.csv')


In [12]:
# Extract Solute SMILES (column 0) and Solvent SMILES (column 3)
solute_smiles = df.iloc[:, 0]
solvent_smiles = df.iloc[:, 3]

In [14]:
from rdkit.Chem import rdFingerprintGenerator

In [15]:
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

In [17]:
%%time
solute_fps = []
for smi in solute_smiles:
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol:
            fp = morgan_generator.GetFingerprint(mol).ToBitString()
            solute_fps.append(list(map(int, fp)))
        else:
            solute_fps.append([0]*2048)
    except:
        solute_fps.append([0]*2048)

solute_fps_df = pd.DataFrame(solute_fps)

CPU times: user 55.7 s, sys: 1.74 s, total: 57.5 s
Wall time: 57.6 s


In [18]:
# Generate Solvent Morgan Fingerprints
solvent_fps = []
for smi in solvent_smiles:
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol:
            fp = morgan_generator.GetFingerprint(mol).ToBitString()
            solvent_fps.append(list(map(int, fp)))
        else:
            solvent_fps.append([0]*2048)
    except:
        solvent_fps.append([0]*2048)

solvent_fps_df = pd.DataFrame(solvent_fps)

In [19]:
temperature = df.iloc[:, 1] 

In [20]:
X = pd.concat([solute_fps_df, temperature, solvent_fps_df], axis=1)

In [21]:
y = df.iloc[:, 5].to_numpy()

In [22]:
# Final outputs:
print("Feature matrix X shape:", X.shape)
print("Target vector y shape:", y.shape)

Feature matrix X shape: (50000, 4097)
Target vector y shape: (50000,)


In [23]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Split the data randomly using train_test_split()
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=991)

In [26]:
X_train.columns = X_train.columns.astype(str)

In [27]:
X_test.columns = X_test.columns.astype(str)

In [29]:
from sklearn.linear_model import LinearRegression

In [30]:
reg = LinearRegression()
reg.fit(X_train, Y_train)


In [31]:
# Predictions using trained linnear regression model(reg)
Y_pred_train = reg.predict(X_train)  #predict for training data
Y_pred_test = reg.predict(X_test)    #predict for testing data

In [32]:
# Metrics
print('Linear Regression - Train R2: %.2f' % r2_score(Y_train, Y_pred_train))
print('Linear Regression - Test R2: %.2f' % r2_score(Y_test, Y_pred_test))


Linear Regression - Train R2: 0.49
Linear Regression - Test R2: 0.39


In [33]:
# MAE
print('MAE Train:', mean_absolute_error(Y_train, Y_pred_train))
print('MAE Test:', mean_absolute_error(Y_test, Y_pred_test))

MAE Train: 1.1214852862374778
MAE Test: 1.2374293641461747


In [34]:
%%time
# Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)
Y_pred_train = model.predict(X_train)
Y_pred_test = model.predict(X_test)

CPU times: user 47min 9s, sys: 6.21 s, total: 47min 16s
Wall time: 47min 23s


In [35]:
print('Random Forest - Train R2: %.2f' % r2_score(Y_train, Y_pred_train))
print('Random Forest - Test R2: %.2f' % r2_score(Y_test, Y_pred_test))

Random Forest - Train R2: 0.97
Random Forest - Test R2: 0.83


In [36]:
# MAE
print('MAE Train:', mean_absolute_error(Y_train, Y_pred_train))
print('MAE Test:', mean_absolute_error(Y_test, Y_pred_test))


MAE Train: 0.1208977381571946
MAE Test: 0.355659083058048
