In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import ast
import numpy as np
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from evaluate_ec import evaluate_ec_predictions

## Train-test split and encoding for physicochemical features

In [31]:
# Load the precomputed per-sequence feature table from CSV and display it.
fe_df = pd.read_csv("../dataset/physiochemical/output_results.csv")
fe_df

Unnamed: 0,Original,AAC,DC,molecular_weight,isoelectric_point,gravy,aromaticity,instability_index,aliphatic_index,boman_index
0,MQAKILRIATRKSPLAICQACYVCNKLKHYHPHIQTELIPIITTGD...,"[0.0707395498392283, 0.02572347266881029, 0.04...","[0.0, 0.0032258064516129032, 0.003225806451612...",35011.6433,9.364049,-0.041479,0.051447,38.157235,116.720257,0.359100
1,MKKLRIGARDSKLSRIQVDIVARKIKQTLGIECEFVPIKTKGDIDK...,"[0.05517241379310345, 0.02413793103448276, 0.0...","[0.0034602076124567475, 0.0, 0.003460207612456...",32275.0348,8.347963,-0.154483,0.058621,42.982793,105.862069,0.291586
2,MKKELIIGTRSSPLALWQAEFTKAELSRHFPELNITLKLVKTTGDV...,"[0.07051282051282051, 0.01282051282051282, 0.0...","[0.0, 0.0, 0.003215434083601286, 0.02250803858...",34393.4749,6.042860,-0.122436,0.044872,40.237821,101.923077,0.262981
3,MFNRQASGGAGSSGQGAGSSQTASAAPVSAGVGVGGGGGASGAAAG...,"[0.09324104234527687, 0.023615635179153095, 0....","[0.010792099368764, 0.0030543677458766036, 0.0...",529990.8042,6.424364,-0.162032,0.053542,46.634426,90.848941,0.260615
4,MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIV...,"[0.07840297889946214, 0.02399669011170873, 0.0...","[0.008069522036002483, 0.0028967515001034555, ...",527221.8439,5.875981,-0.186202,0.060612,42.951614,90.877120,0.276119
...,...,...,...,...,...,...,...,...,...,...
110217,MVRTRLAISVVLVSTLLLLNVKAKSVDPYKVLGVSKDAKQREIQKA...,"[0.050699300699300696, 0.008741258741258742, 0...","[0.0035026269702276708, 0.0, 0.0, 0.0070052539...",62569.2650,9.518709,-0.490559,0.089161,38.246521,72.237762,0.166154
110218,MPKAPKQQPPEPEWIGDGESTSPSDKVVKKGKKDKKIKKTFFEELA...,"[0.07558859975216853, 0.009913258983890954, 0....","[0.009925558312655087, 0.0012406947890818859, ...",91651.1398,7.208401,-0.883643,0.055762,47.106072,74.361834,0.030607
110219,MTDPHTARTIVGIVGNVISFGLFCAPIPTMVKIWKMKSVSEFKPDP...,"[0.05, 0.020833333333333332, 0.033333333333333...","[0.0041841004184100415, 0.0041841004184100415,...",27209.3824,8.883244,0.681667,0.137500,34.603792,111.583333,0.752250
110220,MAVPASPQHPRGYGILLLTLLLKALATTASACNHLRPQDATFSHDS...,"[0.07772020725388601, 0.031088082901554404, 0....","[0.0, 0.010416666666666666, 0.0052083333333333...",22115.7405,9.055116,-0.532124,0.067358,65.206218,81.502591,0.197876


In [32]:
# Read the train / validation / test splits stored under ../dataset/ec40.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../dataset/ec40/train.csv",
        "../dataset/ec40/valid.csv",
        "../dataset/ec40/test.csv",
    )
)

In [33]:
# Keep a single feature row per sequence so the left merges below cannot
# multiply rows when the same sequence occurs more than once in fe_df.
fe_unique = fe_df.drop_duplicates(subset='Original')

# Per-split views of the feature table (only sequences present in that split).
fe_train = fe_unique[fe_unique['Original'].isin(train_df['sequence'])]
fe_test  = fe_unique[fe_unique['Original'].isin(test_df['sequence'])]
fe_valid = fe_unique[fe_unique['Original'].isin(valid_df['sequence'])]

# Left merges keep every row of the split; validate= makes pandas raise if the
# feature side ever stops being unique per sequence.
train_df = train_df.merge(fe_train, left_on='sequence', right_on='Original', how='left', validate='many_to_one')
test_df  = test_df.merge(fe_test, left_on='sequence', right_on='Original', how='left', validate='many_to_one')
valid_df = valid_df.merge(fe_valid, left_on='sequence', right_on='Original', how='left', validate='many_to_one')

In [34]:

def encode_ec_vector(x):
    """Convert an EC annotation into a list of numeric level components.

    Parameters
    ----------
    x : object
        Usually a string: either a dotted EC number (``"3.1.3.-"``) or the
        text of a Python list of such strings (``"['3.1.3.-']"``), in which
        case only the first entry is encoded. Non-string input yields ``[]``.

    Returns
    -------
    list
        One entry per dot-separated component. ``'-'``, empty and
        unparseable components are encoded as ``-1``.
    """
    if isinstance(x, str):
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Dotted EC strings such as "2.7.11.1" are not Python literals.
            parsed = x

        if isinstance(parsed, list):
            # Only the first annotation is used; an empty list keeps the raw text.
            parsed = parsed[0] if parsed else x
            source_text = str(parsed)
        else:
            source_text = x

        # Short EC strings ("1.2", "5") are valid numeric literals; fall back to
        # the text so they are still split on '.' instead of being discarded.
        if isinstance(parsed, (int, float)) and not isinstance(parsed, bool):
            parsed = source_text
        x = parsed

    parts = x.split('.') if isinstance(x, str) else []

    vec = []
    for part in parts:
        part = part.strip()
        if part in ('-', ''):
            vec.append(-1)
            continue
        try:
            vec.append(int(part))
        except ValueError:
            # Kept from the original: accept float-like tokens, otherwise -1.
            try:
                vec.append(float(part))
            except (ValueError, OverflowError):
                vec.append(-1)
    return vec

def convert_str_to_list(x):
    """Parse a string holding a Python literal; pass anything else through.

    Strings are evaluated with ``ast.literal_eval``. If evaluation fails the
    original string is returned unchanged. Non-string values are returned as-is.
    """
    if not isinstance(x, str):
        return x
    try:
        value = ast.literal_eval(x)
    except Exception:
        value = x
    return value

def expand_column(df, column_names, drop_original=False):
    """Spread list-valued columns into one column per list position.

    For every name in ``column_names`` the values are expanded with
    ``pd.Series`` and appended to the frame as ``<name>_<position>`` columns.
    When ``drop_original`` is true the source column is removed afterwards.
    A new frame is returned.
    """
    result = df
    for name in column_names:
        pieces = result[name].apply(pd.Series)
        pieces.columns = [f"{name}_{position}" for position in pieces.columns]
        result = pd.concat([result, pieces], axis=1)
        if not drop_original:
            continue
        result = result.drop(columns=[name])
    return result

In [35]:
# Replace each split's raw `ec` annotation with its encoded component list.
for split_df in (train_df, valid_df, test_df):
    split_df['ec'] = split_df['ec'].apply(encode_ec_vector)

In [36]:
# AAC and DC are stored as stringified lists in the CSV; parse them back.
cols_to_convert = ['AAC', 'DC']
for split_df in (train_df, valid_df, test_df):
    for col in cols_to_convert:
        split_df[col] = split_df[col].apply(convert_str_to_list)

In [37]:
# Spread the list-valued columns into one scalar column per position.
list_columns = ['ec', 'AAC', 'DC']
train_df, valid_df, test_df = (
    expand_column(split_df, list_columns, drop_original=True)
    for split_df in (train_df, valid_df, test_df)
)

In [38]:
def merge_hmm(df, flag="train"):
    """Left-join the HMM feature file of one split onto ``df``.

    Reads ``../dataset/HMM/<flag>_features.csv``, matches its ``query_name``
    column against ``df['accession']`` and replaces every missing value in the
    merged frame with 0.

    NOTE(review): the fill is applied to all columns, so rows without an HMM
    match also get 0 in any E-value column — confirm this is intended.
    """
    hmm_features = pd.read_csv(f"../dataset/HMM/{flag}_features.csv")
    merged = df.merge(
        hmm_features,
        how="left",
        left_on='accession',
        right_on='query_name',
    )
    return merged.fillna(0)

# Attach the HMM feature file that belongs to each split.
train_df, valid_df, test_df = (
    merge_hmm(split_df, split_name)
    for split_df, split_name in (
        (train_df, "train"),
        (valid_df, "valid"),
        (test_df, "test"),
    )
)

In [39]:
# Show the column index of the validation frame.
display(valid_df.columns)

Index(['accession', 'sequence', 'traintest', 'negative_for', 'mainclass_set',
       'sprot_version', 'len', 'cluster_ID', 'representative', 'Original',
       ...
       'DC_395', 'DC_396', 'DC_397', 'DC_398', 'DC_399', 'query_name',
       'E-value', 'score', 'coverage', 'num_domains'],
      dtype='object', length=446)

In [40]:
# Show the column index of the training frame.
display(train_df.columns)

Index(['accession', 'sequence', 'traintest', 'negative_for', 'mainclass_set',
       'sprot_version', 'len', 'cluster_ID', 'representative', 'Original',
       ...
       'DC_395', 'DC_396', 'DC_397', 'DC_398', 'DC_399', 'query_name',
       'E-value', 'score', 'coverage', 'num_domains'],
      dtype='object', length=446)

In [41]:
# Columns removed from every split before the frames are written out.
drop_columns = [
    'Original', 'traintest', 'negative_for', 'mainclass_set', 'sprot_version',
    'len', 'cluster_ID', 'representative', 'sequence', 'accession', 'query_name',
]
train_df, valid_df, test_df = (
    split_df.drop(columns=drop_columns)
    for split_df in (train_df, valid_df, test_df)
)

In [42]:
# Write each split's feature table to disk without the index column.
for split_df, out_path in (
    (train_df, "../dataset/all_features/train.csv"),
    (test_df, "../dataset/all_features/test.csv"),
    (valid_df, "../dataset/all_features/valid.csv"),
):
    split_df.to_csv(out_path, index=False)

## Loading data

In [2]:
# Reload the saved feature tables for the three splits.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../dataset/all_features/train.csv",
        "../dataset/all_features/valid.csv",
        "../dataset/all_features/test.csv",
    )
)

In [3]:
# Display the reloaded training frame.
train_df

Unnamed: 0,molecular_weight,isoelectric_point,gravy,aromaticity,instability_index,aliphatic_index,boman_index,ec_0,ec_1,ec_2,...,DC_394,DC_395,DC_396,DC_397,DC_398,DC_399,E-value,score,coverage,num_domains
0,32275.0348,8.347963,-0.154483,0.058621,42.982793,105.862069,0.291586,2,5,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.900000e-71,27.0,0.920690,2.0
1,529990.8042,6.424364,-0.162032,0.053542,46.634426,90.848941,0.260615,2,3,2,...,0.001018,0.001425,0.001833,0.001629,0.000204,0.000407,3.000000e-03,717.7,0.476181,57.0
2,527221.8439,5.875981,-0.186202,0.060612,42.951614,90.877120,0.276119,2,3,2,...,0.000828,0.001862,0.002069,0.000621,0.000207,0.000207,4.200000e-02,9.5,0.650393,89.0
3,37082.7180,5.953111,-0.295385,0.089231,50.063692,94.553846,0.349231,2,5,1,...,0.000000,0.000000,0.003086,0.000000,0.000000,0.000000,1.400000e-04,227.6,1.064615,4.0
4,73292.5257,8.690096,0.295475,0.110106,38.304540,110.030166,0.535460,2,3,1,...,0.003021,0.001511,0.000000,0.003021,0.003021,0.001511,8.400000e-03,30.3,0.506787,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45755,62736.3791,9.224539,-0.222993,0.094891,37.310420,96.733577,0.360274,3,6,4,...,0.001828,0.000000,0.000000,0.003656,0.000000,0.003656,1.200000e-05,81.3,1.312044,9.0
45756,81911.2453,8.032326,-0.263022,0.133813,38.216604,95.251799,0.419914,3,6,4,...,0.001441,0.001441,0.000000,0.001441,0.000000,0.002882,6.900000e-04,92.2,2.138129,21.0
45757,25398.5248,5.070572,-0.309459,0.094595,31.677928,91.036036,0.396667,3,1,3,...,0.004525,0.000000,0.000000,0.000000,0.004525,0.000000,8.100000e-04,77.9,2.729730,6.0
45758,18860.1921,5.420075,-0.202367,0.053254,45.992367,102.721893,0.323846,3,1,4,...,0.005952,0.000000,0.000000,0.000000,0.000000,0.005952,1.100000e-01,37.6,1.928994,3.0


## Data Processing

In [4]:
def get_feature_and_label(df):
    """Split a frame into features and labels.

    Columns whose name starts with ``ec_`` are the labels; every other column
    is a feature. Returns ``(features, labels)`` as two DataFrames.
    """
    label_cols = list(filter(lambda name: name.startswith('ec_'), df.columns))
    labels = df[label_cols]
    features = df.drop(columns=label_cols)
    return features, labels

In [5]:
train_X, train_Y = get_feature_and_label(train_df)
valid_X, valid_Y = get_feature_and_label(valid_df)
test_X, test_Y = get_feature_and_label(test_df)

# Learn the min/max range on the training features only, then apply the same
# scaling to all three splits.
scaler = MinMaxScaler().fit(train_X)
train_X, valid_X, test_X = (
    scaler.transform(features) for features in (train_X, valid_X, test_X)
)

In [6]:
# Display the training label columns.
train_Y

Unnamed: 0,ec_0,ec_1,ec_2,ec_3
0,2,5,1,61
1,2,3,2,26
2,2,3,2,26
3,2,5,1,83
4,2,3,1,78
...,...,...,...,...
45755,3,6,4,13
45756,3,6,4,12
45757,3,1,3,5
45758,3,1,4,-1


# Modeling

In [7]:
def multioutput_f1_score(y_true, y_pred):
    """Micro-averaged F1 over all label positions at once.

    Both inputs are flattened to 1-D, so every (sample, position) pair counts
    as one prediction.

    Parameters
    ----------
    y_true, y_pred : array-like
        Label arrays of matching shape. Anything ``np.asarray`` accepts
        (ndarray, DataFrame, nested list) may be passed.

    Returns
    -------
    float
        The value returned by ``f1_score(..., average='micro', zero_division=0)``.
    """
    # np.asarray lets callers pass DataFrames or lists, which have no .ravel().
    flat_true = np.asarray(y_true).ravel()
    flat_pred = np.asarray(y_pred).ravel()
    return f1_score(flat_true, flat_pred, average='micro', zero_division=0)

In [19]:
# Stack training rows first, validation rows second.
X_trainval = np.concatenate((train_X, valid_X))
y_trainval = np.concatenate((train_Y, valid_Y))

# test_fold convention: -1 keeps a row in the training part of every split,
# 0 assigns it to validation fold 0.
split_index = np.full(len(train_X) + len(valid_X), -1)
split_index[len(train_X):] = 0

pds = PredefinedSplit(test_fold=split_index)

### Single Random Forest

In [None]:
# Wrap the flattened micro-F1 so GridSearchCV can use it to rank candidates.
f1_scorer = make_scorer(multioutput_f1_score)

# Candidate hyper-parameters: forest size and maximum tree depth.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20]
}

# The forest is fitted directly on the 2-D label array (no MultiOutputClassifier).
rf = RandomForestClassifier(random_state=42)

# cv=pds evaluates each candidate on the predefined validation fold.
grid_search = GridSearchCV(rf, param_grid, scoring=f1_scorer, cv=pds, n_jobs=-1, verbose=1)
grid_search.fit(X_trainval, y_trainval)

# Report the winning configuration and its validation score.
print("Best params:", grid_search.best_params_)
print("Best F1:", grid_search.best_score_)

In [10]:
# Fit the model that is evaluated on the test set, using train + validation rows.
# NOTE(review): the hyper-parameters are hard-coded here rather than read from
# grid_search.best_params_ — confirm they match the grid-search result.
best_model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=None)
best_model.fit(X_trainval, y_trainval)

In [17]:
# Score the single multi-output forest on the held-out test split.
test_labels = test_Y.to_numpy()
test_pred = best_model.predict(test_X)

final_f1 = multioutput_f1_score(test_labels, test_pred)
print("Test F1:", final_f1)

eval_report = evaluate_ec_predictions(test_pred, test_labels, "Physiochemical + HMM + RF")
eval_report

Test F1: 0.38094364992054763


Unnamed: 0,Method,Exact Match Accuracy,No EC number found,No Prediction,Position 1 Accuracy,Position 1 Precision,Position 1 Recall,Position 1 F1-Score,Position 2 Accuracy,Position 2 Precision,Position 2 Recall,Position 2 F1-Score,Position 3 Accuracy,Position 3 Precision,Position 3 Recall,Position 3 F1-Score,Position 4 Accuracy,Position 4 Precision,Position 4 Recall,Position 4 F1-Score
0,Physiochemical + HMM + RF,4.534898,0.0,0.0,42.879844,0.308276,0.72813,0.301557,37.415964,0.106004,0.313204,0.119088,48.453734,0.056081,0.316874,0.060274,4.461557,0.458614,0.523714,0.474899


In [18]:
# Write this report as the results file (to_csv replaces any existing file at this path).
eval_report.to_csv('../metrics/experiment_results.csv', index=False)

### Multiple Model Prediction

In [20]:
# List of fitted models, one per EC label column, in column order
models = []
ec_columns = ["ec_0", "ec_1", "ec_2", "ec_3"]  # label column names, kept for reference

for i in tqdm(range(y_trainval.shape[1])):
    # 1) Create a new RandomForestClassifier for this label column
    model = RandomForestClassifier(
        random_state=42,
        n_estimators=100,
        max_depth=None
    )
    # 2) Train only on rows whose label at this position is not -1.
    #    NOTE(review): such a model can never predict -1, while the test labels
    #    may still contain -1 — confirm this is the intended evaluation.
    valid_indices = y_trainval[:, i] != -1
    X_trainval_i = X_trainval[valid_indices]
    y_trainval_i = y_trainval[valid_indices, i]
    model.fit(X_trainval_i, y_trainval_i)

    models.append(model)

100%|██████████| 4/4 [04:32<00:00, 68.04s/it]


In [22]:
# Predict every label column with its own model, then stack the columns.
test_preds_list = [
    models[i].predict(test_X)
    for i in tqdm(range(y_trainval.shape[1]))
]
test_pred = np.column_stack(test_preds_list)

test_labels = test_Y.to_numpy()
final_f1 = multioutput_f1_score(test_labels, test_pred)
print("Test F1:", final_f1)

eval_report = evaluate_ec_predictions(test_pred, test_labels, "Physiochemical + HMM + MultiRF")
eval_report

100%|██████████| 4/4 [00:00<00:00,  4.14it/s]

Test F1: 0.3691174673022858





Unnamed: 0,Method,Exact Match Accuracy,No EC number found,No Prediction,Position 1 Accuracy,Position 1 Precision,Position 1 Recall,Position 1 F1-Score,Position 2 Accuracy,Position 2 Precision,Position 2 Recall,Position 2 F1-Score,Position 3 Accuracy,Position 3 Precision,Position 3 Recall,Position 3 F1-Score,Position 4 Accuracy,Position 4 Precision,Position 4 Recall,Position 4 F1-Score
0,Physiochemical + HMM + MultiRF,3.520352,0.0,0.0,42.060873,0.297715,0.720197,0.285135,36.499205,0.098482,0.356674,0.108555,48.906002,0.064475,0.313242,0.074011,20.180907,0.055158,0.203867,0.072607


In [23]:
metrics = pd.read_csv('../metrics/experiment_results.csv')

# Drop rows already logged for this method so that re-running the cell replaces
# the entry instead of appending a duplicate row to the results file.
metrics = metrics[~metrics['Method'].isin(eval_report['Method'])]

report_combined = pd.concat([metrics, eval_report], axis=0, ignore_index=True)
display(report_combined)
report_combined.to_csv('../metrics/experiment_results.csv', index=False)

Unnamed: 0,Method,Exact Match Accuracy,No EC number found,No Prediction,Position 1 Accuracy,Position 1 Precision,Position 1 Recall,Position 1 F1-Score,Position 2 Accuracy,Position 2 Precision,Position 2 Recall,Position 2 F1-Score,Position 3 Accuracy,Position 3 Precision,Position 3 Recall,Position 3 F1-Score,Position 4 Accuracy,Position 4 Precision,Position 4 Recall,Position 4 F1-Score
0,Physiochemical + HMM + RF,4.534898,0.0,0.0,42.879844,0.308276,0.72813,0.301557,37.415964,0.106004,0.313204,0.119088,48.453734,0.056081,0.316874,0.060274,4.461557,0.458614,0.523714,0.474899
1,Physiochemical + HMM + MultiRF,3.520352,0.0,0.0,42.060873,0.297715,0.720197,0.285135,36.499205,0.098482,0.356674,0.108555,48.906002,0.064475,0.313242,0.074011,20.180907,0.055158,0.203867,0.072607


TODO: ROC plot; grid search for SVM; multiple random forests where each takes the previous EC-level prediction as a feature (tree structure); maybe RFE?; change the BLAST grader to use the same format as RF; finished PSSM; after getting an informative feature set, try different models and tune them -> ROC curve