In [25]:
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score
import numpy as np
from catboost import CatBoostClassifier, Pool
import pathlib
import pandas as pd

In [26]:
path = pathlib.Path(os.getcwd())
path = path.parent.absolute()
path_data = os.path.join(path, 'data')
path = os.path.join(path, 'data', 'skempi_v2_preprocessed.csv')
print(path)

g:\Работа\BioCad\мл_биоинф\protein_ddg_prediction\data\skempi_v2_preprocessed.csv


In [27]:
# Load the preprocessed data
df = pd.read_csv(path, sep = ';') 

# Separate features and target 
X = df.drop(columns='ddG_sign')
y = df['ddG_sign']

# Identify categorical features 
categorical_features = [
    'Hold_out_type',
    'Protein 1',
    'Protein 2',
    'iMutation_Location(s)_1',
    'iMutation_Location(s)_2',
    'orig_aa_1',
    'chain_1',
    'residue_num_1',
    'mut_aa_1',
    'orig_aa_2',
    'chain_2',
    'residue_num_2',
    'mut_aa_2'
]

In [42]:
# Split into train/validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create CatBoost pools
train_pool = Pool(X_train, y_train, cat_features=categorical_features)
val_pool = Pool(X_val, y_val, cat_features=categorical_features)

In [31]:
# Define hyperparameter grid
param_grid = {
    'learning_rate': [0.03, 0.05, 0.1],
    'depth': [4, 6, 8]
}

# Initialize best parameters tracking
best_params = None
best_f1 = 0
results = []


In [32]:
# Grid search with cross-validation
for lr in param_grid['learning_rate']:
    for depth in param_grid['depth']:
        print(f"\nTesting lr={lr}, depth={depth}")
        
        cv_f1_scores = []
        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
            # Split data
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            # Create CatBoost pools
            train_pool = Pool(X_train, y_train, cat_features=categorical_features)
            val_pool = Pool(X_val, y_val, cat_features=categorical_features)
            
            # Initialize model with current parameters
            model = CatBoostClassifier(
                iterations=1000,
                learning_rate=lr,
                depth=depth,
                loss_function='MultiClass',
                eval_metric='TotalF1',
                cat_features=categorical_features,
                early_stopping_rounds=50,
                verbose=0  # Set to 100 for detailed training logs
            )
            
            # Train
            model.fit(train_pool, eval_set=val_pool, use_best_model=True)
            
            # Evaluate
            y_pred = model.predict(X_val)
            fold_f1 = f1_score(y_val, y_pred, average='weighted')
            cv_f1_scores.append(fold_f1)
        
        # Calculate mean F1 across folds
        mean_f1 = np.mean(cv_f1_scores)
        std_f1 = np.std(cv_f1_scores)
        results.append((lr, depth, mean_f1, std_f1))
        
        print(f"lr={lr}, depth={depth} | Mean F1: {mean_f1:.4f} ± {std_f1:.4f}")
        
        # Update best parameters
        if mean_f1 > best_f1:
            best_f1 = mean_f1
            best_params = {'learning_rate': lr, 'depth': depth}


Testing lr=0.03, depth=4
lr=0.03, depth=4 | Mean F1: 0.7903 ± 0.0035

Testing lr=0.03, depth=6
lr=0.03, depth=6 | Mean F1: 0.7948 ± 0.0060

Testing lr=0.03, depth=8
lr=0.03, depth=8 | Mean F1: 0.7948 ± 0.0049

Testing lr=0.05, depth=4
lr=0.05, depth=4 | Mean F1: 0.7949 ± 0.0075

Testing lr=0.05, depth=6
lr=0.05, depth=6 | Mean F1: 0.7960 ± 0.0076

Testing lr=0.05, depth=8
lr=0.05, depth=8 | Mean F1: 0.7998 ± 0.0085

Testing lr=0.1, depth=4
lr=0.1, depth=4 | Mean F1: 0.7996 ± 0.0027

Testing lr=0.1, depth=6
lr=0.1, depth=6 | Mean F1: 0.7980 ± 0.0127

Testing lr=0.1, depth=8
lr=0.1, depth=8 | Mean F1: 0.8014 ± 0.0123


In [40]:
# Final evaluation
print(f"\nMean CV F1: {np.mean(cv_f1_scores):.4f} ± {np.std(cv_f1_scores):.4f}")
print("\n=== Grid Search Results ===")
for lr, depth, mean_f1, std_f1 in results:
    print(f"lr={lr}, depth={depth}: F1 = {mean_f1:.4f} ± {std_f1:.4f}")
print(f"\nBest parameters: {best_params} (F1 = {best_f1:.4f})")



Mean CV F1: 0.8020 ± 0.0114

=== Grid Search Results ===
lr=0.03, depth=4: F1 = 0.7903 ± 0.0035
lr=0.03, depth=6: F1 = 0.7948 ± 0.0060
lr=0.03, depth=8: F1 = 0.7948 ± 0.0049
lr=0.05, depth=4: F1 = 0.7949 ± 0.0075
lr=0.05, depth=6: F1 = 0.7960 ± 0.0076
lr=0.05, depth=8: F1 = 0.7998 ± 0.0085
lr=0.1, depth=4: F1 = 0.7996 ± 0.0027
lr=0.1, depth=6: F1 = 0.7980 ± 0.0127
lr=0.1, depth=8: F1 = 0.8014 ± 0.0123

Best parameters: {'learning_rate': 0.1, 'depth': 8} (F1 = 0.8014)


In [44]:
# Train final model with best parameters
final_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=best_params['learning_rate'],
    depth=best_params['depth'],
    loss_function='MultiClass',
    eval_metric='TotalF1',
    cat_features=categorical_features,
    verbose=100
)

# First split: 80% train (for grid search), 20% final test
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)


final_model.fit(
    Pool(X_temp, y_temp, cat_features=categorical_features),
    plot=True
)

# Evaluate on hold-out test set
test_pool = Pool(X_test, y_test, cat_features=categorical_features)
y_pred = final_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\n=== Final Test Evaluation ===")
print(f"Test F1 Score: {test_f1:.4f}")


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6663942	total: 41ms	remaining: 41s
100:	learn: 0.8321602	total: 9.21s	remaining: 1m 21s
200:	learn: 0.8825675	total: 19.5s	remaining: 1m 17s
300:	learn: 0.9180494	total: 29.5s	remaining: 1m 8s
400:	learn: 0.9441455	total: 38.9s	remaining: 58.1s
500:	learn: 0.9606743	total: 49.4s	remaining: 49.2s
600:	learn: 0.9728798	total: 1m	remaining: 40s
700:	learn: 0.9841490	total: 1m 11s	remaining: 30.7s
800:	learn: 0.9907103	total: 1m 22s	remaining: 20.5s
900:	learn: 0.9946149	total: 1m 33s	remaining: 10.3s
999:	learn: 0.9967731	total: 1m 44s	remaining: 0us

=== Final Test Evaluation ===
Test F1 Score: 0.7942


In [45]:
# Feature importance
feature_importances = final_model.get_feature_importance()
for feature, importance in zip(X.columns, feature_importances):
    print(f"{feature}: {importance:.2f}")

Hold_out_type: 5.88
Protein 1: 13.74
Protein 2: 8.70
iMutation_Location(s)_1: 13.86
iMutation_Location(s)_2: 2.04
orig_aa_1: 17.18
chain_1: 7.52
residue_num_1: 15.26
mut_aa_1: 11.22
orig_aa_2: 1.00
chain_2: 1.11
residue_num_2: 0.64
mut_aa_2: 1.87


In [46]:
final_model.save_model('ddG_sign_model.cbm')
