In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
# Adjust the filename/path if necessary
df = pd.read_csv('dataset_complete.csv')
df.head()


Unnamed: 0,index,smiles,mu,alpha,homo,lumo,gap,r2,zpve,U0,...,H,G,Cv,num_func_groups,num_atoms,O,C,N,H.1,F
0,3895,O=C1C=CON=N1,3.3067,46.55,-0.263,-0.0607,0.2023,965.3567,0.049003,-375.430225,...,-375.420213,-375.469933,24.943,31,7,2,3,2,2,0
1,3896,O=C1C=NOC=N1,2.4177,46.66,-0.2701,-0.0861,0.184,567.5171,0.055941,-375.359188,...,-375.353122,-375.388412,17.987,24,7,2,3,2,2,0
2,3897,O=C1C=NON=C1,0.2386,48.24,-0.2681,-0.1008,0.1674,577.0287,0.05495,-375.30706,...,-375.300964,-375.336105,18.514,12,7,2,3,2,2,0
3,3898,O=C1C=NON=N1,1.5473,45.28,-0.2714,-0.12,0.1514,556.6358,0.04133,-391.337881,...,-391.331328,-391.367925,19.012,11,7,2,2,3,1,0
4,3899,O=C1N=CON=N1,1.7309,41.79,-0.2961,-0.057,0.2391,913.3757,0.038631,-391.518453,...,-391.508608,-391.558096,23.419,14,7,2,2,3,1,0


In [5]:
# Choose your thermodynamic target; here we use enthalpy "H"
target = 'H'



feature_cols = ['num_func_groups','O','C','N','H.1','F']

X = df[feature_cols]
y = df[target]

print(f"Number of features: {len(feature_cols)}")


Number of features: 6


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)


In [7]:
ridge_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model' , Ridge(alpha=1.0))
])
ridge_pipe.fit(X_train, y_train)


In [8]:
rf_pipe = Pipeline([
    ('scaler', StandardScaler()),  
    ('model' , RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        random_state=42
    ))
])
rf_pipe.fit(X_train, y_train)


In [10]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate(name, model, X_t, y_t):
    y_pred = model.predict(X_t)
    mae  = mean_absolute_error(y_t, y_pred)
    mse  = mean_squared_error(y_t, y_pred)      # this returns MSE
    rmse = np.sqrt(mse)                         # take square root
    r2   = r2_score(y_t, y_pred)
    print(f"{name}: MAE={mae:.3f}, RMSE={rmse:.3f}, R²={r2:.3f}")

print("Test-set performance:")
evaluate('Ridge Regression', ridge_pipe, X_test, y_test)
evaluate('Random Forest'   , rf_pipe   , X_test, y_test)


Test-set performance:
Ridge Regression: MAE=0.059, RMSE=0.202, R²=1.000
Random Forest: MAE=0.062, RMSE=0.447, R²=1.000


In [11]:
# Compute train metrics
evaluate('Ridge (train)', ridge_pipe, X_train, y_train)
evaluate('RF    (train)', rf_pipe,    X_train, y_train)

from sklearn.model_selection import cross_val_score
cv_mae = -cross_val_score(ridge_pipe, X, y, cv=5, scoring='neg_mean_absolute_error').mean()
print("5-fold CV MAE (Ridge):", cv_mae)


Ridge (train): MAE=0.057, RMSE=0.191, R²=1.000
RF    (train): MAE=0.049, RMSE=0.406, R²=1.000
5-fold CV MAE (Ridge): 0.5503366008429907


In [13]:
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np

outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=1)

param_grid = {'model__alpha': [0.01, 0.1, 1, 10, 100]}

nested_scores = cross_val_score(
    GridSearchCV(ridge_pipe, param_grid, cv=inner_cv, scoring='neg_mean_absolute_error'),
    X, y,
    cv=outer_cv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)

print("Nested CV MAE (Ridge):", np.mean(-nested_scores), "±", np.std(-nested_scores))


Nested CV MAE (Ridge): 0.026753521955215815 ± 0.0004561153308187549


In [15]:
from sklearn.model_selection import RepeatedKFold

rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=0)

scores = cross_val_score(
    GridSearchCV(ridge_pipe, param_grid, cv=inner_cv, scoring='neg_mean_absolute_error'),
    X, y,
    cv=rkf,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)
print("Repeated CV MAE (Ridge):", np.mean(-scores), "±", np.std(-scores))


Repeated CV MAE (Ridge): 0.026753957030544354 ± 0.00033569980320170026
