In [35]:
import pandas as pd

from sklearn import linear_model

from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from scipy import stats

import os
import numpy as np

import warnings
# Suppresses every Python warning for the rest of the session, which also
# hides anything the model fitting or metric calls below would report.
# NOTE(review): consider narrowing this to specific warning categories.
warnings.filterwarnings("ignore")

In [42]:
import seaborn as sns
import matplotlib.pyplot as plt
import vapeplot
import numpy as np

# Global plot styling for the notebook.
# Colour cycle: vapeplot's 'cool' palette.
vapeplot.set_palette('cool')
# Axes: no grid lines, white background.
plt.rc('axes', grid=False, facecolor="white")
# Default font size for all text elements.
plt.rcParams.update({'font.size': 18})

In [16]:
def get_performances_by_task(dataset, real_column, predict_column):
    """Score one column of ``dataset`` against another with four metrics.

    NOTE: the parameter names are the reverse of how the values are used.
    ``dataset[real_column]`` is handed to the sklearn metrics as ``y_pred``
    and ``dataset[predict_column]`` as ``y_true``. For the asymmetric metric
    (R2) to be meaningful, pass the *prediction* column name first and the
    *observed* column name second.

    Parameters
    ----------
    dataset : column-indexable table (e.g. a pandas DataFrame)
    real_column : label of the column used as the predictions (``y_pred``)
    predict_column : label of the column used as the ground truth (``y_true``)

    Returns
    -------
    list
        ``[r2, mse, rmse, spearman_rho]`` in exactly that order.
    """
    used_as_pred = dataset[real_column]
    used_as_true = dataset[predict_column]

    # The three sklearn regression metrics share the same keyword interface.
    scores = [
        metric(y_true=used_as_true, y_pred=used_as_pred)
        for metric in (r2_score, mean_squared_error, root_mean_squared_error)
    ]

    # spearmanr returns (correlation, p-value); only the correlation is kept.
    rho = stats.spearmanr(used_as_pred, used_as_true)[0]
    scores.append(rho)

    return scores


In [38]:
# Fit two multi-task linear models (ElasticNet and Lasso) on every encoder
# dataset, score them on a 30% hold-out split, and collect one metrics table
# per encoder in list_df_summary_performances.
DATASET_DIR = "../../selected_dataset/"
RESPONSE_COLUMNS = ["activity", "expression"]

list_df_summary_performances = []

# os.listdir() returns entries in arbitrary order and may contain non-dataset
# entries (e.g. ".ipynb_checkpoints"). Sorting makes the row order of the
# summary reproducible; the extension filter keeps read_csv away from
# directories and unrelated files.
encoder_files = sorted(
    entry for entry in os.listdir(DATASET_DIR)
    if entry.lower().endswith(".csv")
)

for encoder in encoder_files:

    print("Processing: ", encoder)
    # splitext strips only the final extension, so names that contain dots
    # ("model.v2.csv") are not truncated to their first segment.
    name_encoder = os.path.splitext(encoder)[0]

    df_data = pd.read_csv(os.path.join(DATASET_DIR, encoder))

    response = df_data[RESPONSE_COLUMNS]
    df_to_train = df_data.drop(columns=RESPONSE_COLUMNS)

    X_train, X_test, y_train, y_test = train_test_split(
        df_to_train, response, random_state=42, test_size=0.3
    )

    # NOTE(review): the output recorded in this notebook shows both models
    # predicting the same constant for every displayed test row, which is what
    # a model with all coefficients shrunk to zero would produce. Confirm
    # whether alpha=0.1 is too strong for these (unscaled) features.
    clf_elastic = linear_model.MultiTaskElasticNet(alpha=0.1)
    clf_elastic.fit(X=X_train, y=y_train)

    clf_lasso = linear_model.MultiTaskLasso(alpha=0.1)
    clf_lasso.fit(X=X_train, y=y_train)

    predictions_elastic = clf_elastic.predict(X=X_test)
    predictions_lasso = clf_lasso.predict(X=X_test)

    # Prediction columns are named after the response columns the models were
    # fitted on, in the same order.
    df_predictions_elastic = pd.DataFrame(
        data=predictions_elastic,
        columns=[f"{task}_predict_elastic" for task in RESPONSE_COLUMNS],
    )
    df_predictions_lasso = pd.DataFrame(
        data=predictions_lasso,
        columns=[f"{task}_predict_lasso" for task in RESPONSE_COLUMNS],
    )

    df_predictions = pd.concat([df_predictions_elastic, df_predictions_lasso], axis=1)
    # .values discards y_test's shuffled index so the observed values line up
    # positionally with the freshly indexed prediction frames.
    for task in RESPONSE_COLUMNS:
        df_predictions[task] = y_test[task].values

    # performances
    # get_performances_by_task takes the prediction column first and the
    # observed column second (its first column argument is used as y_pred).
    df_summary = pd.DataFrame(
        {
            f"{task}_{model}": get_performances_by_task(
                df_predictions, f"{task}_predict_{model}", task
            )
            for task in RESPONSE_COLUMNS
            for model in ("lasso", "elastic")
        }
    )
    df_summary["metrics"] = ["r2_value", "mse_value", "rmse_value", "spearman_value"]
    df_summary["method"] = name_encoder

    list_df_summary_performances.append(df_summary)


Processing:  prottrans_t5_uniref.csv
Processing:  prottrans_albert.csv
Processing:  esm1v.csv
Processing:  prottrans_t5_xlu50.csv
Processing:  prottrans_bert.csv
Processing:  prottrans_t5bdf.csv
Processing:  prottrans_xlnet.csv


In [43]:
# Preview the prediction table left over from the last loop iteration
# (df_predictions is reassigned for every encoder, so only the final one remains).
df_predictions

Unnamed: 0,activity_predict_elastic,expression_predict_elastic,activity_predict_lasso,expression_predict_lasso,activity,expression
0,-0.330259,-0.174485,-0.330259,-0.174485,-0.521377,-0.476279
1,-0.330259,-0.174485,-0.330259,-0.174485,0.002142,-0.025383
2,-0.330259,-0.174485,-0.330259,-0.174485,-0.699729,-0.470856
3,-0.330259,-0.174485,-0.330259,-0.174485,-0.677578,0.100492
4,-0.330259,-0.174485,-0.330259,-0.174485,0.071708,0.132686
...,...,...,...,...,...,...
1904,-0.330259,-0.174485,-0.330259,-0.174485,-0.699621,0.059900
1905,-0.330259,-0.174485,-0.330259,-0.174485,-0.399827,-0.292246
1906,-0.330259,-0.174485,-0.330259,-0.174485,-0.716196,-0.250583
1907,-0.330259,-0.174485,-0.330259,-0.174485,-0.694500,-0.399782


In [41]:
# Stack the per-encoder metric tables into one long table and write it to disk.
SUMMARY_PATH = "../../results_selected_models/results_multioutput/summary_performances.csv"

# dropna() removes every metric row that contains a NaN in any column.
# NOTE(review): a Spearman correlation on a constant prediction column is
# expected to be NaN, so such rows would disappear from the export without
# notice. Confirm that this is intended.
df_summary = pd.concat(list_df_summary_performances, axis=0).dropna()

# to_csv() does not create missing folders; make sure the target exists so a
# fresh checkout can run the notebook end to end.
os.makedirs(os.path.dirname(SUMMARY_PATH), exist_ok=True)
df_summary.to_csv(SUMMARY_PATH, index=False)