In [1]:
from sklearn.datasets import make_regression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Synthetic regression problem: 10 samples, 3 target columns, fixed seed.
X, y = make_regression(n_samples=10, n_targets=3, random_state=1)

# Wrap a seeded GradientBoostingRegressor in MultiOutputRegressor, fit it on (X, y)
# and show the predictions for the same X it was fitted on (in-sample, not a held-out score).
(
    MultiOutputRegressor(estimator=GradientBoostingRegressor(random_state=0))
    .fit(X, y)
    .predict(X)
)

array([[-154.75474165, -147.03498585,  -50.03812219],
       [   7.12165031,    5.12914884,  -81.46081961],
       [-187.8948621 , -100.44373091,   13.88978285],
       [-141.62745778,   95.02891072, -191.48204257],
       [  97.03260883,  165.34867495,  139.52003279],
       [ 123.92529176,   21.25719016,   -7.84253   ],
       [-122.25193977,  -85.16443186, -107.12274212],
       [ -30.170388  ,  -94.80956739,   12.16979946],
       [ 140.72667194,  176.50941682,  -17.50447799],
       [ 149.37967282,  -81.15699552,   -5.72850319]])

In [2]:
import pandas as pd

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge

from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from scipy import stats

import os

In [4]:
# Load the dataset; the path is relative, so it resolves against the kernel's working directory.
df_data = pd.read_csv("../../selected_dataset/esm1v.csv")

# The two columns below are the regression targets; every remaining column is kept as a feature.
target_columns = ["activity", "expression"]
response = df_data[target_columns]
df_to_train = df_data.drop(columns=target_columns)

In [5]:
# Split features and targets together so their rows stay paired.
# test_size=0.3 reserves 30% for evaluation; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_to_train,
    response,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Multi-target model: a seeded HistGradientBoostingRegressor wrapped in MultiOutputRegressor.
multi_hist = MultiOutputRegressor(
    estimator=HistGradientBoostingRegressor(random_state=0)
)

# fit() returns the fitted estimator, so training and held-out prediction can be chained.
multi_hist_predictions = multi_hist.fit(X_train, y_train).predict(X_test)

# Last expression of the cell: display the fitted estimator.
multi_hist

In [9]:
# Display the array returned by multi_hist.predict(X_test).
# NOTE(review): columns presumably follow the target order used at fit time
# (activity, expression) - confirm before labelling them downstream.
multi_hist_predictions

array([[-0.36273384, -0.19862028],
       [-0.01042983, -0.06312559],
       [-0.37855973, -0.25631505],
       ...,
       [-0.62205083, -0.11524784],
       [-0.65491905, -0.40539248],
       [-0.67511719, -0.24900412]])

In [22]:
# Give the held-out targets a fresh 0..n-1 index. Presumably this is so they line up
# row-by-row with the positional prediction array when the two are concatenated later.
# drop=True discards the old index directly, rather than inserting it as an extra column
# that then has to be filtered out again.
# The explicit column selection keeps exactly the two targets, in a fixed order.
y_test_data = y_test.reset_index(drop=True)[["activity", "expression"]]

In [23]:
# Put the predictions and the true targets side by side in one frame.
# NOTE(review): the prediction column labels assume the model outputs follow the target
# order (activity, expression) - confirm against how the targets were selected.
# NOTE(review): axis=1 concatenation pairs rows by index label, so y_test_data is expected
# to carry a 0..n-1 index matching the freshly built prediction frame - confirm.
df_predictions = pd.concat(
    [
        pd.DataFrame(
            data=multi_hist_predictions,
            columns=["activity_predict_hist", "expression_predict_hist"],
        ),
        y_test_data,
    ],
    axis=1,
)
df_predictions

Unnamed: 0,activity_predict_hist,expression_predict_hist,activity,expression
0,-0.362734,-0.198620,-0.521377,-0.476279
1,-0.010430,-0.063126,0.002142,-0.025383
2,-0.378560,-0.256315,-0.699729,-0.470856
3,-0.615668,0.026981,-0.677578,0.100492
4,-0.104281,-0.095388,0.071708,0.132686
...,...,...,...,...
1904,-0.616069,0.094498,-0.699621,0.059900
1905,-0.405431,-0.292712,-0.399827,-0.292246
1906,-0.622051,-0.115248,-0.716196,-0.250583
1907,-0.654919,-0.405392,-0.694500,-0.399782


In [25]:
# Evaluate the "activity" target only: pull the true and predicted series out once.
activity_true = df_predictions["activity"]
activity_pred = df_predictions["activity_predict_hist"]

# Regression error metrics (sklearn argument order is y_true, y_pred).
r2_value = r2_score(activity_true, activity_pred)
mse_value = mean_squared_error(activity_true, activity_pred)
rmse_value = root_mean_squared_error(activity_true, activity_pred)

# Rank correlation; element [0] of the result is the coefficient (the p-value is not kept).
spearman_value = stats.spearmanr(activity_pred, activity_true)[0]

# Collected in the order: R2, MSE, RMSE, Spearman.
row = [r2_value, mse_value, rmse_value, spearman_value]
row

[0.6044944577719198,
 0.029989605173532156,
 0.17317507087780318,
 0.7720965371952571]