In [1]:
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, matthews_corrcoef

In [4]:
def get_performances(y_val, y_predict, description):
    """Build one row of classification metrics for a set of predictions.

    Parameters
    ----------
    y_val : array-like
        Ground-truth labels; forwarded as ``y_true`` to every metric.
    y_predict : array-like
        Predicted labels; forwarded as ``y_pred`` to every metric.
    description : object
        Label stored as the first element of the row.

    Returns
    -------
    list
        ``[description, accuracy, precision, f1, recall, mcc]`` in that order.
    """
    # Order matters: it must line up with the column names used when the
    # rows are assembled into a DataFrame.
    metric_functions = (
        accuracy_score,
        precision_score,
        f1_score,
        recall_score,
        matthews_corrcoef,
    )
    scores = [metric(y_true=y_val, y_pred=y_predict) for metric in metric_functions]
    return [description, *scores]

In [5]:
# Load the table of per-sequence estimated properties and preview the first rows.
properties_csv = "../clustering_examples/estimated_properties.csv"
df_data = pd.read_csv(properties_csv)
df_data.head()

Unnamed: 0,sequence,Activity,MW,isoelectric_point,aromaticity,aliphatic_index,boman_index,charge,charge_density,hydrophobic_ratio,instability_index
0,KKKKVVEATYVLV,1,1503.88,10.7646,0.0769,126.9231,0.6792,3.996,0.00266,0.4615,2.7
1,GLPVCGESCFGGSCYTPGCSCTWPICTRD,1,2999.44,5.8203,0.1034,36.8966,0.5838,-0.408,-0.00014,0.3448,79.331
2,MQYKINMYAIVVYDVNVSRQNQIREFLRKYLYHVQRSVFEGEISPS...,1,10913.64,8.9775,0.1319,108.022,1.5576,2.028,0.00019,0.3846,60.4879
3,KQEGRDHDKSKGHFHMIVIHHKGGQAHHG,1,3308.67,10.6753,0.0345,40.3448,2.8534,3.689,0.00111,0.2069,19.3345
4,LAHKSRLYERHM,1,1539.81,11.2881,0.0833,73.3333,3.4217,3.194,0.00207,0.3333,47.6417


In [6]:
# Target vector: the "Activity" column as a plain array.
response = df_data["Activity"].values
# Feature matrix: everything except the raw sequence string and the target.
df_values = df_data.drop(["sequence", "Activity"], axis=1)

In [7]:
# Two-stage hold-out split with a fixed seed:
#   1) set aside 10% of the rows as the test set;
#   2) split the remaining 90% into 75% train / 25% validation
#      (i.e. 67.5% / 22.5% of the full data).
X_rest, X_test, y_rest, y_test = train_test_split(
    df_values, response, test_size=0.1, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=42
)


In [8]:
# Fit every candidate classifier on the same training split.
# Each estimator receives the same explicit seed so that a fresh
# "Restart Kernel -> Run All" reproduces the reported metrics; an unseeded
# decision tree can pick different splits between runs.
adaboost_instance = AdaBoostClassifier(random_state=42).fit(X=X_train, y=y_train)
rf_instance = RandomForestClassifier(random_state=42).fit(X=X_train, y=y_train)
# probability=True enables predict_proba on the fitted SVC.
svc_instance = SVC(probability=True, random_state=42).fit(X=X_train, y=y_train)
dt_instance = DecisionTreeClassifier(random_state=42).fit(X=X_train, y=y_train)
lgbm_instance = LGBMClassifier(random_state=42).fit(X=X_train, y=y_train)
xgboost_instance = XGBClassifier(random_state=42).fit(X=X_train, y=y_train)




[LightGBM] [Info] Number of positive: 3468, number of negative: 3520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000347 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2250
[LightGBM] [Info] Number of data points in the train set: 6988, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496279 -> initscore=-0.014883
[LightGBM] [Info] Start training from score -0.014883


In [9]:
# Predict validation-set labels with each fitted model (one array per model).
(
    val_predictions_adaboost,
    val_predictions_rf,
    val_predictions_dt,
    val_predictions_svc,
    val_predictions_lgbm,
    val_predictions_xgboost,
) = [
    fitted_model.predict(X_val)
    for fitted_model in (
        adaboost_instance,
        rf_instance,
        dt_instance,
        svc_instance,
        lgbm_instance,
        xgboost_instance,
    )
]

In [10]:
# Predict held-out test-set labels with each fitted model (one array per model).
(
    test_predictions_adaboost,
    test_predictions_rf,
    test_predictions_dt,
    test_predictions_svc,
    test_predictions_lgbm,
    test_predictions_xgboost,
) = [
    fitted_model.predict(X_test)
    for fitted_model in (
        adaboost_instance,
        rf_instance,
        dt_instance,
        svc_instance,
        lgbm_instance,
        xgboost_instance,
    )
]

In [11]:
# One metrics row per model, computed against the validation labels.
matrix_val = [
    get_performances(y_val, predictions, model_name)
    for model_name, predictions in (
        ("Adaboost", val_predictions_adaboost),
        ("RF", val_predictions_rf),
        ("DT", val_predictions_dt),
        ("SVC", val_predictions_svc),
        ("LGBM", val_predictions_lgbm),
        ("XGBoost", val_predictions_xgboost),
    )
]

# Column order mirrors the row layout returned by get_performances.
df_performance_val = pd.DataFrame(
    data=matrix_val,
    columns=["description", "accuracy", "precision", "f1", "recall", "mcc"],
)
df_performance_val


Unnamed: 0,description,accuracy,precision,f1,recall,mcc
0,Adaboost,0.64721,0.62621,0.671725,0.724376,0.298451
1,RF,0.727897,0.719766,0.731356,0.743325,0.456091
2,DT,0.672532,0.665282,0.677378,0.689922,0.345375
3,SVC,0.587983,0.551093,0.693095,0.933678,0.245864
4,LGBM,0.721459,0.711921,0.726045,0.740741,0.44335
5,XGBoost,0.725322,0.714403,0.73064,0.747631,0.45121


In [12]:
# One metrics row per model, computed against the held-out test labels.
matrix_test = [
    get_performances(y_test, predictions, model_name)
    for model_name, predictions in (
        ("Adaboost", test_predictions_adaboost),
        ("RF", test_predictions_rf),
        ("DT", test_predictions_dt),
        ("SVC", test_predictions_svc),
        ("LGBM", test_predictions_lgbm),
        ("XGBoost", test_predictions_xgboost),
    )
]

# Column order mirrors the row layout returned by get_performances.
df_performance_test = pd.DataFrame(
    data=matrix_test,
    columns=["description", "accuracy", "precision", "f1", "recall", "mcc"],
)
df_performance_test

Unnamed: 0,description,accuracy,precision,f1,recall,mcc
0,Adaboost,0.65251,0.652104,0.691252,0.735401,0.299976
1,RF,0.747104,0.757194,0.762681,0.768248,0.49211
2,DT,0.689189,0.699647,0.710952,0.722628,0.375253
3,SVC,0.614865,0.584755,0.720392,0.937956,0.264498
4,LGBM,0.72973,0.736749,0.748654,0.760949,0.456822
5,XGBoost,0.738417,0.754128,0.752059,0.75,0.475258


In [13]:
import joblib

In [14]:
# Persist the fitted random forest and XGBoost models to the working directory.
# The final call is left as a bare expression so the notebook shows the
# list of written filenames.
joblib.dump(value=rf_instance, filename="rf_model.joblib")
joblib.dump(value=xgboost_instance, filename="xgboost_model.joblib")

['xgboost_model.joblib']