In [21]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import LeaveOneOut, KFold

# Load the LFER parameter table from the workbook next to this notebook and
# preview it. The preview shows the columns No, N(RE), N(RS) and N(IP), with
# N(RS) missing for some rows.
lfer_workbook = 'LFER_Parameters_N.xlsx'
df = pd.read_excel(lfer_workbook)
df

Unnamed: 0,No,N(RE),N(RS),N(IP)
0,N65,0.116255,,-1.028377
1,N66,-11.737400,,-16.217872
2,N70,-5.690091,,-8.080957
3,N69,-9.566844,,-10.297948
4,N68,-6.803292,,-5.296697
...,...,...,...,...
70,N56,5.065422,78.4,12.006250
71,N9,-6.935069,71.0,16.747084
72,N59,1.562037,75.7,12.714708
73,N58,-4.491547,70.7,8.369205


In [22]:
# Keep only the two columns used in the regression, restricted to rows where
# both N(RE) and N(RS) are present (the original row index is preserved).
paired_columns = ["N(RE)", "N(RS)"]
df_cut = df[paired_columns].dropna(how="any")

In [23]:
# KFold CV
#
# Repeated K-fold cross-validation of the linear fit N(RS) ~ N(RE).
# Each seed reshuffles the fold assignment; a fresh LinearRegression is fitted
# on the training folds and scored on the held-out fold. Per-fold scores are
# collected in `kf_results_df`; the printed figures are the mean over repeats
# of each repeat's mean fold score.

N_SPLITS = 5  # number of folds per repeat

X = np.array(df_cut["N(RE)"])
Y = np.array(df_cut["N(RS)"].astype(float))
# scikit-learn estimators expect a 2-D (n_samples, n_features) design matrix;
# reshape once instead of on every fit/predict call.
X_2d = X.reshape(-1, 1)

# One shuffle seed per repeat: 2025, 6075, 10125, 14175, 18225.
random_states = np.array(range(1,11,2)) * 2025

mean_r2s = []
mean_maes = []
mean_rmses = []
fold_results = []
fold_frames = []  # one per-repeat DataFrame, concatenated once after the loop

for rs in random_states:
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=rs)
    r2s = []
    maes = []
    rmses = []
    for train_idx, test_idx in kf.split(X_2d):
        model = LinearRegression()
        model.fit(X_2d[train_idx], Y[train_idx])
        y_test = Y[test_idx]
        y_pred = model.predict(X_2d[test_idx])
        r2s.append(r2_score(y_test, y_pred))
        maes.append(mean_absolute_error(y_test, y_pred))
        rmses.append(root_mean_squared_error(y_test, y_pred))
    mean_r2s.append(np.mean(r2s))
    mean_maes.append(np.mean(maes))
    mean_rmses.append(np.mean(rmses))
    # Fold numbers are derived from the scores actually collected, so they
    # cannot drift from N_SPLITS.
    fold_result = {
        "fold": list(range(1, len(r2s) + 1)),
        "R-squared": r2s,
        "MAE": maes,
        "RMSE": rmses,
    }
    fold_results.append(fold_result)
    fold_frame = pd.DataFrame(fold_result)
    # Record the seed that was really used for this repeat rather than
    # re-deriving it from the loop position.
    fold_frame.insert(0, "random_state", int(rs))
    fold_frames.append(fold_frame)

mean_r2_scores_of_5_times = np.mean(mean_r2s)
mean_mae_scores_of_5_times = np.mean(mean_maes)
mean_rmse_scores_of_5_times = np.mean(mean_rmses)

print("Mean R-squared score: {:.3f}".format(mean_r2_scores_of_5_times))
print("Mean absolute error: {:.1f}".format(mean_mae_scores_of_5_times))
print("Root mean squared error: {:.1f}".format(mean_rmse_scores_of_5_times))

# Single concatenation instead of growing a DataFrame inside a loop.
kf_results_df = pd.concat(fold_frames, axis=0, ignore_index=True)

Mean R-squared score: 0.960
Mean absolute error: 1.4
Root mean squared error: 1.7


In [24]:
# Show the per-fold K-fold scores: one row per (random_state, fold) pair with
# its R-squared, MAE and RMSE.
kf_results_df

Unnamed: 0,random_state,fold,R-squared,MAE,RMSE
0,2025,1,0.970019,1.41101,1.656291
1,2025,2,0.969668,1.370669,1.559975
2,2025,3,0.943963,1.313404,2.053294
3,2025,4,0.980995,1.07031,1.293793
4,2025,5,0.946048,1.71324,1.98268
5,6075,1,0.973086,1.355188,1.742389
6,6075,2,0.94397,1.588178,1.735604
7,6075,3,0.988104,0.812076,0.898441
8,6075,4,0.983144,1.01787,1.379084
9,6075,5,0.914824,2.008637,2.514319


In [25]:
# LOO-CV
#
# Leave-one-out cross-validation of the linear fit N(RS) ~ N(RE).
# Every sample is predicted by a model trained on all remaining samples, and
# the metrics are computed once over the pooled held-out predictions.
#
# The seed list and score accumulators that this cell used to re-initialise
# were never read here (LeaveOneOut is deterministic) and only wiped the
# K-fold results, so they are no longer reset.

X = np.array(df_cut["N(RE)"])
Y = np.array(df_cut["N(RS)"].astype(float))
# scikit-learn estimators expect a 2-D (n_samples, n_features) design matrix.
X_2d = X.reshape(-1, 1)

y_preds = []
y_tests = []
loo = LeaveOneOut()
for train_idx, test_idx in loo.split(X_2d):
    model = LinearRegression()
    model.fit(X_2d[train_idx], Y[train_idx])
    # Each test split holds exactly one sample; store plain scalars so the
    # pooled targets/predictions are flat 1-D sequences.
    y_preds.append(model.predict(X_2d[test_idx])[0])
    y_tests.append(Y[test_idx][0])
r2 = r2_score(y_tests, y_preds)
mae = mean_absolute_error(y_tests, y_preds)
rmse = root_mean_squared_error(y_tests, y_preds)

print("R-squared score: {:.3f}".format(r2))
print("Mean absolute error: {:.1f}".format(mae))
print("Root mean squared error: {:.1f}".format(rmse))

R-squared score: 0.967
Mean absolute error: 1.4
Root mean squared error: 1.7


In [26]:
# Refit the regression on every paired sample and report the fitted slope
# (coef_) followed by the intercept.
all_features = X.reshape(-1, 1)
model = LinearRegression().fit(all_features, Y)
for fitted_parameter in (model.coef_, model.intercept_):
    print(fitted_parameter)

[0.7203451]
75.04721418496464
