In [11]:
import pandas as pd
from joblib import load

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef

import random

import warnings
warnings.filterwarnings("ignore")

In [2]:
df_data = pd.read_csv("../../selected_strategies/single_double/prottrans_t5_uniref.csv")

df_positive = df_data[df_data["type_interaction"] == "single"]
df_positive["type_interaction"] = 1

df_negative = df_data[df_data["type_interaction"] == "double"]
df_negative["type_interaction"] = 0

df_data_negative_shuffle = shuffle(df_negative, random_state=42)
df_data_negative_to_train = df_data_negative_shuffle[:len(df_positive)]

df_to_train = pd.concat([df_positive, df_data_negative_to_train], axis=0)

response = df_to_train["type_interaction"]
df_to_train = df_to_train.drop(columns=["type_interaction"])

X_train_uniref, X_test_uniref, y_train_uniref, y_test_uniref = train_test_split(df_to_train, response, random_state=42, test_size=0.3)


In [3]:
df_data = pd.read_csv("../../selected_strategies/single_double/prottrans_t5_xlu50.csv")

df_positive = df_data[df_data["type_interaction"] == "single"]
df_positive["type_interaction"] = 1

df_negative = df_data[df_data["type_interaction"] == "double"]
df_negative["type_interaction"] = 0

df_data_negative_shuffle = shuffle(df_negative, random_state=42)
df_data_negative_to_train = df_data_negative_shuffle[:len(df_positive)]

df_to_train = pd.concat([df_positive, df_data_negative_to_train], axis=0)

response = df_to_train["type_interaction"]
df_to_train = df_to_train.drop(columns=["type_interaction"])

X_train_xlu50, X_test_xlu50, y_train_xlu50, y_test_xlu50 = train_test_split(df_to_train, response, random_state=42, test_size=0.3)

In [5]:
df_values = pd.DataFrame()
df_values['response_uniref'] = y_test_uniref
df_values['response_xlu50'] = y_test_xlu50

df_values

Unnamed: 0,response_uniref,response_xlu50
117,1,1
132,1,1
154,1,1
245,1,1
84,1,1
...,...,...
31,1,1
113,1,1
1224,0,0
512,0,0


In [6]:
model1 = load("../../trained_models/single_double/ExtraTreesClassifier_prottrans_t5_uniref.joblib")
model2 = load("../../trained_models/single_double/ExtraTreesClassifier_prottrans_t5_xlu50.joblib")
model3 = load("../../trained_models/single_double/GaussianProcessClassifier_prottrans_t5_xlu50.joblib")

In [7]:
predictions_model1 = model1.predict(X_test_uniref)
predictions_model2 = model2.predict(X_test_xlu50)
predictions_model3 = model3.predict(X_test_xlu50)

In [9]:
df_predictions = pd.DataFrame()
df_predictions['model_1'] = predictions_model1
df_predictions['model_2'] = predictions_model2
df_predictions['model_3'] = predictions_model3

df_predictions.to_csv("demo_predictions.csv", index=False)

In [16]:
def create_voting_process(column_list, df_predictions):
    response_voting = []

    for index in df_predictions.index:
        count_0 = 0
        count_1 = 0

        for column in column_list:
            if df_predictions[column][index] == 1:
                count_1+=1
            if df_predictions[column][index] == 0:
                count_0+=1
        
        if count_0>count_1:
            response_voting.append(0)
        elif count_1>count_0:
            response_voting.append(1)
        else:
            response_voting.append(random.choice([0,1]))
    
    return response_voting

In [19]:
columns = ["model_1", "model_2"]
name_column = "model_1-model_2"

df_predictions[name_column] = create_voting_process(columns, df_predictions)

columns = ["model_1", "model_3"]
name_column = "model_1-model_3"

df_predictions[name_column] = create_voting_process(columns, df_predictions)

columns = ["model_2", "model_3"]
name_column = "model_2-model_3"

df_predictions[name_column] = create_voting_process(columns, df_predictions)

columns = ["model_1", "model_2", "model_3"]
name_column = "model_1-model_2-model_3"

df_predictions[name_column] = create_voting_process(columns, df_predictions)

In [20]:
df_predictions

Unnamed: 0,model_1,model_2,model_3,model_1-model_2,model_1-model_3,model_2-model_3,model_1-model_2-model_3
0,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
156,0,0,0,0,0,0,0
157,1,1,1,1,1,1,1
158,0,0,0,0,0,0,0
159,0,0,0,0,0,0,0


In [22]:
matrix_data = []
for column in df_predictions.columns:
    test_accuracy = accuracy_score(y_true=y_test_uniref, y_pred=df_predictions[column])
    test_precision = f1_score(y_true=y_test_uniref, y_pred=df_predictions[column])
    test_f1 = precision_score(y_true=y_test_uniref, y_pred=df_predictions[column])
    test_recall = recall_score(y_true=y_test_uniref, y_pred=df_predictions[column])
    test_mcc = matthews_corrcoef(y_true=y_test_uniref, y_pred=df_predictions[column])
    test_cm = confusion_matrix(y_test_uniref, df_predictions[column]).tolist()

    row = [column, test_accuracy, test_precision, test_f1, test_recall, test_mcc, test_cm]
    matrix_data.append(row)

columns = ["Model", "test_accuracy", "test_precision", "test_f1", "test_recall", "test_mcc", "test_cm"]
df_summary = pd.DataFrame(data=matrix_data, columns=columns)


In [23]:
df_summary

Unnamed: 0,Model,test_accuracy,test_precision,test_f1,test_recall,test_mcc,test_cm
0,model_1,0.89441,0.882759,0.927536,0.842105,0.790161,"[[80, 5], [12, 64]]"
1,model_2,0.89441,0.884354,0.915493,0.855263,0.788961,"[[79, 6], [11, 65]]"
2,model_3,0.900621,0.887324,0.954545,0.828947,0.805589,"[[82, 3], [13, 63]]"
3,model_1-model_2,0.900621,0.891892,0.916667,0.868421,0.801063,"[[79, 6], [10, 66]]"
4,model_1-model_3,0.89441,0.881119,0.940299,0.828947,0.791881,"[[81, 4], [13, 63]]"
5,model_2-model_3,0.900621,0.890411,0.928571,0.855263,0.802046,"[[80, 5], [11, 65]]"
6,model_1-model_2-model_3,0.888199,0.875,0.926471,0.828947,0.778361,"[[80, 5], [13, 63]]"
