In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import ast
import numpy as np
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from evaluate_ec import evaluate_ec_predictions

## Loading data

In [2]:
train_df = pd.read_csv("../dataset/all_features/train.csv")
valid_df = pd.read_csv("../dataset/all_features/valid.csv")
test_df = pd.read_csv("../dataset/all_features/test.csv")

In [3]:
train_df

Unnamed: 0,molecular_weight,isoelectric_point,gravy,aromaticity,instability_index,aliphatic_index,boman_index,ec_0,ec_1,ec_2,...,VSTPV2,VSTPV3,VSTPV4,VSTPV5,VSTPV6,Z1,Z2,Z3,Z4,Z5
0,32275.0348,8.347963,-0.154483,0.058621,42.982793,105.862069,0.291586,2,5,1,...,0.198414,0.413690,-0.335655,0.229759,-0.193793,0.191931,-0.526621,-0.395103,-0.531103,0.289207
1,529990.8042,6.424364,-0.162032,0.053542,46.634426,90.848941,0.260615,2,3,2,...,0.087494,0.466179,-0.437199,0.225794,-0.309963,0.170827,-0.700737,-0.225900,-0.406464,0.306325
2,527221.8439,5.875981,-0.186202,0.060612,42.951614,90.877120,0.276119,2,3,2,...,0.096173,0.443808,-0.421132,0.215689,-0.279313,0.188678,-0.649019,-0.242199,-0.456235,0.270306
3,37082.7180,5.953111,-0.295385,0.089231,50.063692,94.553846,0.349231,2,5,1,...,0.117631,0.360800,-0.354215,0.253908,-0.135877,0.168492,-0.332492,-0.314738,-0.447969,0.251938
4,73292.5257,8.690096,0.295475,0.110106,38.304540,110.030166,0.535460,2,3,1,...,0.070995,0.430256,-0.347949,0.126863,-0.318069,-0.412036,-0.599759,-0.247285,-0.336094,0.256787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45755,62736.3791,9.224539,-0.222993,0.094891,37.310420,96.733577,0.360274,3,6,4,...,0.168960,0.359106,-0.272701,0.183175,-0.148960,-0.005420,-0.330237,-0.383011,-0.402555,0.230420
45756,81911.2453,8.032326,-0.263022,0.133813,38.216604,95.251799,0.419914,3,6,4,...,0.129986,0.275353,-0.283281,0.204619,-0.114173,-0.096835,-0.121640,-0.361914,-0.448230,0.190950
45757,25398.5248,5.070572,-0.309459,0.094595,31.677928,91.036036,0.396667,3,1,3,...,0.076937,0.366396,-0.373874,0.182883,-0.177703,-0.019144,-0.322297,-0.230586,-0.340180,0.257568
45758,18860.1921,5.420075,-0.202367,0.053254,45.992367,102.721893,0.323846,3,1,4,...,0.130237,0.415325,-0.423136,0.232604,-0.183432,0.242249,-0.566036,-0.371479,-0.476272,0.235207


## Data Processing

In [4]:
def get_feature_and_label(df):
    ec_cols = [col for col in df.columns if col.startswith('ec_')]
    return df.drop(columns=ec_cols), df[ec_cols]

In [5]:
train_X, train_Y = get_feature_and_label(train_df)
valid_X, valid_Y = get_feature_and_label(valid_df)
test_X, test_Y = get_feature_and_label(test_df)

scaler = MinMaxScaler()
scaler.fit(train_X)
train_X = scaler.transform(train_X)
valid_X = scaler.transform(valid_X)
test_X = scaler.transform(test_X)

In [6]:
train_Y

Unnamed: 0,ec_0,ec_1,ec_2,ec_3
0,2,5,1,61
1,2,3,2,26
2,2,3,2,26
3,2,5,1,83
4,2,3,1,78
...,...,...,...,...
45755,3,6,4,13
45756,3,6,4,12
45757,3,1,3,5
45758,3,1,4,-1


# Modeling

In [7]:
def multioutput_f1_score(y_true, y_pred):
    # Flatten all outputs and compute micro-F1
    return f1_score(y_true.ravel(), y_pred.ravel(), average='micro', zero_division=0)

In [8]:
X_trainval = np.concatenate([train_X, valid_X], axis=0)
y_trainval = np.concatenate([train_Y, valid_Y], axis=0)
split_index = np.concatenate([
    np.full(len(train_X), -1),  # Training samples (-1 means they are used for training)
    np.full(len(valid_X), 0)    # Validation samples (0 means they are used for validation)
])
# Use the list to create PredefinedSplit
pds = PredefinedSplit(test_fold = split_index)

### Single Random Forest

In [None]:
f1_scorer = make_scorer(multioutput_f1_score)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20]
}

# Set up the RandomForestClassifier (no MultiOutputClassifier needed)

rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=pds,
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_trainval, y_trainval)

# Best parameters and best F1 score
print("Best params:", grid_search.best_params_)
print("Best F1:", grid_search.best_score_)

In [10]:
# Evaluate on the test set
# best_params = grid_search.best_params_
# best_model = RandomForestClassifier(random_state=42, **best_params)
best_model = RandomForestClassifier(random_state=42, n_estimators=200, max_depth=None)
best_model.fit(X_trainval, y_trainval)

In [11]:
test_pred = best_model.predict(test_X)
final_f1 = multioutput_f1_score(test_Y.to_numpy(), test_pred)
print("Test F1:", final_f1)
eval_report = evaluate_ec_predictions(test_pred, test_Y.to_numpy(), "Physiochemical + HMM + Peptides + RF")
eval_report

Test F1: 0.40664344212199


Unnamed: 0,Method,Exact Match Accuracy,No EC number found,No Prediction,Position 1 Accuracy,Position 1 Precision,Position 1 Recall,Position 1 F1-Score,Position 2 Accuracy,Position 2 Precision,Position 2 Recall,Position 2 F1-Score,Position 3 Accuracy,Position 3 Precision,Position 3 Recall,Position 3 F1-Score,Position 4 Accuracy,Position 4 Precision,Position 4 Recall,Position 4 F1-Score
0,Physiochemical + HMM + Peptides + RF,7.077374,0.0,0.0,46.13128,0.347138,0.694721,0.351522,41.254125,0.134055,0.411474,0.159142,49.810537,0.073154,0.399611,0.089062,6.295074,0.606769,0.651824,0.6208


In [12]:
metrics = pd.read_csv('../metrics/experiment_results.csv')
report_combined = pd.concat([metrics, eval_report], axis=0, ignore_index=True)
display(report_combined)
report_combined.to_csv('../metrics/experiment_results.csv', index=False)

Unnamed: 0,Method,Exact Match Accuracy,No EC number found,No Prediction,Position 1 Accuracy,Position 1 Precision,Position 1 Recall,Position 1 F1-Score,Position 2 Accuracy,Position 2 Precision,Position 2 Recall,Position 2 F1-Score,Position 3 Accuracy,Position 3 Precision,Position 3 Recall,Position 3 F1-Score,Position 4 Accuracy,Position 4 Precision,Position 4 Recall,Position 4 F1-Score
0,Diamond Benchmark,3.920132,94.101484,94.101484,5.568785,72.116835,3.745772,7.08897,5.330647,48.623788,3.184853,5.853805,4.945961,58.634361,3.92022,7.205686,3.920132,27.519438,4.400288,6.852923
1,Physiochemical + HMM + RF,4.534898,0.0,0.0,42.879844,0.308276,0.72813,0.301557,37.415964,0.106004,0.313204,0.119088,48.453734,0.056081,0.316874,0.060274,4.461557,0.458614,0.523714,0.474899
2,Physiochemical + HMM + MultiRF,3.520352,0.0,0.0,42.060873,0.297715,0.720197,0.285135,36.499205,0.098482,0.356674,0.108555,48.906002,0.064475,0.313242,0.074011,20.180907,0.055158,0.203867,0.072607
3,Physiochemical + HMM + Peptides + RF,7.077374,0.0,0.0,46.13128,0.347138,0.694721,0.351522,41.254125,0.134055,0.411474,0.159142,49.810537,0.073154,0.399611,0.089062,6.295074,0.606769,0.651824,0.6208


### Cascade Modelling

In [14]:
# Dictionary to store the tuned models for each EC column
models = []
ec_columns = ["ec_0", "ec_1", "ec_2", "ec_3"]

cascade_features = X_trainval.copy()

for i in tqdm(range(y_trainval.shape[1])):
    valid_indices = y_trainval[:, i] != -1
    X_trainval_i = cascade_features[valid_indices]
    y_trainval_i = y_trainval[valid_indices, i]

    model = RandomForestClassifier(random_state=42, n_estimators=200, max_depth=None)
    model.fit(X_trainval_i, y_trainval_i)
    models.append(model)

    pred = model.predict(cascade_features).reshape(-1, 1)
    
    cascade_features = np.hstack([cascade_features, pred])

100%|██████████| 4/4 [11:39<00:00, 175.00s/it]


In [15]:
cascade_test_features = test_X.copy()
test_preds_list = []

for i in tqdm(range(len(models))):
    y_pred_col = models[i].predict(cascade_test_features)
    test_preds_list.append(y_pred_col)
    cascade_test_features = np.hstack([cascade_test_features, y_pred_col.reshape(-1, 1)])
test_pred = np.column_stack(test_preds_list)
final_f1 = multioutput_f1_score(test_Y.to_numpy(), test_pred)
print("Test F1:", final_f1)
eval_report = evaluate_ec_predictions(test_pred, test_Y.to_numpy(), "Physiochemical + HMM + Peptides + CascadeRF")
eval_report

  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:01<00:00,  2.35it/s]

Test F1: 0.408660310475492





Unnamed: 0,Method,Exact Match Accuracy,No EC number found,No Prediction,Position 1 Accuracy,Position 1 Precision,Position 1 Recall,Position 1 F1-Score,Position 2 Accuracy,Position 2 Precision,Position 2 Recall,Position 2 F1-Score,Position 3 Accuracy,Position 3 Precision,Position 3 Recall,Position 3 F1-Score,Position 4 Accuracy,Position 4 Precision,Position 4 Recall,Position 4 F1-Score
0,Physiochemical + HMM + Peptides + CascadeRF,9.595404,0.0,0.0,45.862364,0.343595,0.714645,0.34626,39.836206,0.132142,0.396339,0.150091,51.424031,0.095252,0.412118,0.120354,26.341523,0.118571,0.246855,0.135757


In [16]:
metrics = pd.read_csv('../metrics/experiment_results.csv')
report_combined = pd.concat([metrics, eval_report], axis=0, ignore_index=True)
display(report_combined)
report_combined.to_csv('../metrics/experiment_results.csv', index=False)

Unnamed: 0,Method,Exact Match Accuracy,No EC number found,No Prediction,Position 1 Accuracy,Position 1 Precision,Position 1 Recall,Position 1 F1-Score,Position 2 Accuracy,Position 2 Precision,Position 2 Recall,Position 2 F1-Score,Position 3 Accuracy,Position 3 Precision,Position 3 Recall,Position 3 F1-Score,Position 4 Accuracy,Position 4 Precision,Position 4 Recall,Position 4 F1-Score
0,Diamond Benchmark,3.920132,94.101484,94.101484,5.568785,72.116835,3.745772,7.08897,5.330647,48.623788,3.184853,5.853805,4.945961,58.634361,3.92022,7.205686,3.920132,27.519438,4.400288,6.852923
1,Physiochemical + HMM + RF,4.534898,0.0,0.0,42.879844,0.308276,0.72813,0.301557,37.415964,0.106004,0.313204,0.119088,48.453734,0.056081,0.316874,0.060274,4.461557,0.458614,0.523714,0.474899
2,Physiochemical + HMM + MultiRF,3.520352,0.0,0.0,42.060873,0.297715,0.720197,0.285135,36.499205,0.098482,0.356674,0.108555,48.906002,0.064475,0.313242,0.074011,20.180907,0.055158,0.203867,0.072607
3,Physiochemical + HMM + Peptides + RF,7.077374,0.0,0.0,46.13128,0.347138,0.694721,0.351522,41.254125,0.134055,0.411474,0.159142,49.810537,0.073154,0.399611,0.089062,6.295074,0.606769,0.651824,0.6208
4,Physiochemical + HMM + Peptides + CascadeRF,9.595404,0.0,0.0,45.862364,0.343595,0.714645,0.34626,39.836206,0.132142,0.396339,0.150091,51.424031,0.095252,0.412118,0.120354,26.341523,0.118571,0.246855,0.135757


ROC plot, Grid search for SVM, multiple random forest takes previous ec prediction as a feature (tree structrue), Maybe rfe?, make change to BLAST grader to format as RF, Finished PSSM, After getting infomative feature set try different model and tuning them -> ROC curve