In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

file = Path(r"dG_RE.xlsx")
df = pd.read_excel(file, engine="openpyxl")

ligand_labels = df["ligand"].dropna().unique()
radical_labels = df["radical"].dropna().unique()
nucleophile_labels = df["nucleophile"].dropna().unique()

rs_root = 2020

In [127]:
fix_df = df.loc[((df["ligand"]=="L4") & (df["radical"]=="R11")) | ((df["radical"]=="R11") & (df["nucleophile"]=="N61")) | ((df["nucleophile"]=="N61") & (df["ligand"]=="L4")), :].copy(deep=True)
fix_df

Unnamed: 0,ligand,radical,nucleophile,dG
16,L6,R11,N61,-162.63854
66,L7,R11,N61,-186.956415
116,L14,R11,N61,-164.848001
166,L10,R11,N61,-148.2943
201,L4,R16,N61,-165.341223
206,L4,R20,N61,-162.52245
211,L4,R5,N61,-158.289899
215,L4,R11,N10,-111.169583
216,L4,R11,N61,-157.630386
217,L4,R11,N1,-149.010288


In [128]:
other_df = df.drop(fix_df.index)
other_df

Unnamed: 0,ligand,radical,nucleophile,dG
0,L6,R16,N10,-117.004794
1,L6,R16,N61,-168.488811
2,L6,R16,N1,-160.958697
3,L6,R16,N7,-131.844766
4,L6,R16,N2,-155.905990
...,...,...,...,...
244,L4,R94,N2,
245,L4,R6,N10,-100.875917
247,L4,R6,N1,-136.676589
248,L4,R6,N7,-117.616616


In [129]:
# Populate the coefficient matrix and the right-hand side vector

def get_x_y(df):
    # Create the coefficient matrix and the right-hand side vector
    coefficient_matrix = []
    G = []
    for _, row in df.iterrows():
        if pd.notna(row["dG"]):
            i = np.where(ligand_labels == row["ligand"])[0][0]              # index of fixed ligand
            j = np.where(radical_labels == row["radical"])[0][0]            # index of fixed radical
            k = np.where(nucleophile_labels == row["nucleophile"])[0][0]    # index of fixed nucleophile
            coefficients = [0] * (
                len(ligand_labels) + len(radical_labels) + len(nucleophile_labels)
            )
            coefficients[i] = 1
            coefficients[len(ligand_labels) + j] = 1
            coefficients[len(ligand_labels) + len(radical_labels) + k] = 1
            coefficient_matrix.append(coefficients)
            G.append(row["dG"])

    coefficient_matrix = np.array(coefficient_matrix)
    G = np.array(G)
    G = -G
    return coefficient_matrix, G

In [130]:
# For the ternary linear equation G = L + R + N, there exists a scenario where constant values are assigned to the independent variables (L, R, N), allowing potential linear shifts in their absolute magnitudes.
# For instance, under transformations L' = L + 10, R' = R - 3, N' = N - 3, the equation G = L' + R' + N' still holds true. Different methodologies or initial guesses may lead to variations in these constant value assignments.
# Nevertheless, the relative values of LFER parameters within L, R, and N remain fundamentally consistent.
# Therefore, we adopt the strategy of fixing a single parameter value to obtain precise and stable absolute numerical solutions for L, R, and N.

def solve(matrix, values, fixed_x_val, fixed_y_val, fixed_z_val, mechanism):
    fixed_l_index = np.where(ligand_labels == "L4")[0][0]
    match mechanism:
        case "RE":
            fixed_r_index = len(ligand_labels) + np.where(radical_labels == "R11")[0][0]
            fixed_n_index = len(ligand_labels) + len(radical_labels) + np.where(nucleophile_labels == "N61")[0][0]
        case "RS":
            fixed_r_index = len(ligand_labels) + np.where(radical_labels == "R16")[0][0]
            fixed_n_index = len(ligand_labels) + len(radical_labels) + np.where(nucleophile_labels == "N7")[0][0]
        case "IP":
            fixed_r_index = len(ligand_labels) + np.where(radical_labels == "R94")[0][0]
            fixed_n_index = len(ligand_labels) + len(radical_labels) + np.where(nucleophile_labels == "N10")[0][0]
           
    fixed_indices = [fixed_l_index, fixed_r_index, fixed_n_index]
    fixed_values = [fixed_x_val, fixed_y_val, fixed_z_val]
    A_fixed = matrix[:, fixed_indices]
    adjusted_G = values - np.dot(A_fixed, fixed_values)

    free_columns_mask = np.ones(matrix.shape[1], dtype=bool)
    free_columns_mask[fixed_indices] = False
    A_free = matrix[:, free_columns_mask]

    try:
        params_free, _, _, _ = np.linalg.lstsq(A_free, adjusted_G, rcond=None)
    except np.linalg.LinAlgError:
        print("The matrix cannot be solved.")
        exit()

    solution = np.zeros(matrix.shape[1])
    solution[fixed_indices] = fixed_values
    solution[free_columns_mask] = params_free

    results = pd.DataFrame({
        "Conponent": [ligand_labels[i] for i in range(len(ligand_labels))]
                + [radical_labels[i] for i in range(len(radical_labels))]
                + [nucleophile_labels[i] for i in range(len(nucleophile_labels))],
        "Value": np.concatenate((
            solution[:len(ligand_labels)],
            solution[len(ligand_labels):len(ligand_labels)+len(radical_labels)],
            solution[-len(nucleophile_labels):]
        ))
    })

    return results
    
    print(f"The results have been saved to 'solution_output_{mechanism}.csv'")

In [131]:
from sklearn.model_selection import KFold

mechanism = "RE"
fixed_l_val = 79.9817167545207  # L_RE of L4
fixed_r_val = 48.5196042780403  # R_RE of R11
fixed_n_val = 28.6304108032927  # N_RE of N61

rss = [rs_root * i for i in range(1, 11)]  # random states
fit_results: list[pd.DataFrame] = []
training_r2s = []
training_maes = []
training_rmses = []
test_r2s = []
test_maes = []
test_rmses = []
dG_results = []
idxs_1 = []
idxs_2 = []

for rs in rss:
    kf = KFold(n_splits=5, shuffle=True, random_state=rs)
    test_dfs = []
    for fold, (train_idx, test_idx) in enumerate(kf.split(other_df)):

        train_df, test_df = other_df.iloc[train_idx, :], other_df.iloc[test_idx, :]
        combined_df = pd.concat([train_df, fix_df], axis=0)

        x_matrix, values = get_x_y(train_df)
        fit_result = solve(
            x_matrix, values, fixed_l_val, fixed_r_val, fixed_n_val, mechanism
        )

        mapping = fit_result.set_index("Conponent")["Value"].to_dict()

        # combine the fit result L, R, N values
        combined_df[["L", "R", "N"]] = combined_df[
            ["ligand", "radical", "nucleophile"]
        ]
        combined_df.loc[:, ["L", "R", "N"]] = combined_df.loc[:, ["L", "R", "N"]].replace(mapping)
        combined_df[["L", "R", "N"]] = combined_df[["L", "R", "N"]].astype(float)
        combined_df.loc[:, "dG_pred"] = combined_df.loc[:, ["L", "R", "N"]].sum(axis=1).mul(-1)

        test_df[["L", "R", "N"]] = test_df[
            ["ligand", "radical", "nucleophile"]
        ]
        test_df.loc[:, ["L", "R", "N"]] = test_df.loc[:, ["L", "R", "N"]].replace(mapping)
        test_df[["L", "R", "N"]] = test_df[["L", "R", "N"]].astype(float)
        test_df.loc[:, "dG_pred"] = test_df.loc[:, ["L", "R", "N"]].sum(axis=1).mul(-1)
        
        combined_notna_df = combined_df.dropna(axis=0, how="any")
        training_r2 = r2_score(combined_notna_df["dG"], combined_notna_df["dG_pred"])
        training_mae = mean_absolute_error(combined_notna_df["dG"], combined_notna_df["dG_pred"])
        training_rmse = root_mean_squared_error(combined_notna_df["dG"], combined_notna_df["dG_pred"])

        test_notna_df = test_df.dropna(axis=0, how="any")
        test_r2 = r2_score(test_notna_df["dG"], test_notna_df["dG_pred"])
        test_mae = mean_absolute_error(test_notna_df["dG"], test_notna_df["dG_pred"])
        test_rmse = root_mean_squared_error(test_notna_df["dG"], test_notna_df["dG_pred"])

        test_dfs.append(test_df)

        training_r2s.append(training_r2)
        training_maes.append(training_mae)
        training_rmses.append(training_rmse)
        test_r2s.append(test_r2)
        test_maes.append(test_mae)
        test_rmses.append(test_rmse)

        idxs_1.append(rs//rs_root)
        idxs_2.append(fold)

        fit_results.append(fit_result)

        # fit_result.to_csv(fr"RE_fit_results\solution_output_{mechanism}_round-{rs//1000}_fold-{fold}.csv", index=False)
    dG_result = pd.concat(test_dfs, axis=0)
    dG_results.append(dG_result)

  combined_df.loc[:, ["L", "R", "N"]] = combined_df.loc[:, ["L", "R", "N"]].replace(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[["L", "R", "N"]] = test_df[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[["L", "R", "N"]] = test_df[
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[["L", "R", "N"]] = test_df[
  te

In [132]:
performance_df = pd.DataFrame({"Round": idxs_1, "Fold": idxs_2, "Training R2": training_r2s, "Traing MAE": training_maes, "Training RMSE": training_rmses, "Test R2": test_r2s, "Test_MAE": test_maes, "Test_RMSE": test_rmses})
performance_df

Unnamed: 0,Round,Fold,Training R2,Traing MAE,Training RMSE,Test R2,Test_MAE,Test_RMSE
0,1,0,0.990807,1.693693,2.216434,0.99544,1.3685,1.632463
1,1,1,0.991264,1.654067,2.142212,0.993482,1.565424,2.029195
2,1,2,0.992284,1.597462,2.060861,0.989087,1.801503,2.353746
3,1,3,0.992353,1.581004,2.092478,0.989191,1.810338,2.151995
4,1,4,0.991613,1.565404,2.061458,0.991403,1.881008,2.426176
5,2,0,0.990894,1.588374,2.147754,0.991582,1.981938,2.429858
6,2,1,0.991987,1.579335,2.10051,0.990672,1.837307,2.165659
7,2,2,0.992187,1.554406,2.051886,0.989994,1.950375,2.314643
8,2,3,0.990911,1.745498,2.231772,0.995633,1.218336,1.468528
9,2,4,0.992078,1.601284,2.065302,0.987374,1.820318,2.417


In [133]:
performance_df.loc[:, ["Test R2", "Test_MAE","Test_RMSE"]].mean().round(3)

Test R2      0.991
Test_MAE     1.746
Test_RMSE    2.186
dtype: float64

In [134]:
r2s = []
maes = []
rmses = []
for l, d in performance_df.groupby("Round"):
    r2 = d.loc[:, "Test R2"].mean()
    mae = d.loc[:, "Test_MAE"].mean()
    rmse = d.loc[:, "Test_RMSE"].mean()
    r2s.append(r2)
    maes.append(mae)
    rmses.append(rmse)


print(np.array(r2s).mean())
print(np.array(maes).mean())
print(np.array(rmses).mean())
print(np.array(r2s).std())
print(np.array(maes).std())
print(np.array(rmses).std())

0.9910048977318832
1.7456310320408577
2.1859143535611993
0.0005025677014960855
0.03981000289383016
0.04439572410627668
