In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib  # To save the model

# Read the feature table from disk into a DataFrame.
file_path = "data/featured_clinvar_result.csv"
df = pd.read_csv(file_path)

# The "label" column is the prediction target; every other column is a feature.
y = df["label"]
X = df.drop(columns=["label"])

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is seeded for repeatability
# and stratified on y so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report how many rows landed in each partition.
print(f"✅ Train-Test Split Done: {X_train.shape[0]} Train Samples, {X_test.shape[0]} Test Samples")


✅ Train-Test Split Done: 972 Train Samples, 243 Test Samples


In [3]:
# Define and train a Gradient Boosting classifier, timing the fit.
import time
gb_model = GradientBoostingClassifier(random_state=42)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
gb_model.fit(X_train, y_train)
gb_training_time = time.perf_counter() - start_time
# Legacy name (this is not a random forest); kept so any later cell that
# still reads rf_training_time continues to work.
rf_training_time = gb_training_time
print("✅ Gradient Boosting Model Trained Successfully!")
print(f"Gradient Boosting Training Time: {gb_training_time:.4f} seconds")

✅ Gradient Boosting Model Trained Successfully!
Gradient Boosting Training Time: 0.4257 seconds


In [4]:
# Evaluate the fitted Gradient Boosting model on the held-out test split.

# Time only the predict() call. perf_counter() is a monotonic, high-resolution
# clock, so it is the appropriate choice for measuring a short interval.
start_time = time.perf_counter()
gb_preds = gb_model.predict(X_test)
gb_inference_time = time.perf_counter() - start_time
# Legacy name (this is not a random forest); kept so any later cell that
# still reads rf_inference_time continues to work.
rf_inference_time = gb_inference_time

# Overall accuracy. The printed labels name the model that is actually
# evaluated here (Gradient Boosting), not "Random Forest".
accuracy = accuracy_score(y_test, gb_preds)
print(f"✅ Gradient Boosting Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1
print("\n🔍 Classification Report:")
print(classification_report(y_test, gb_preds))

# Confusion matrix: rows are true classes, columns are predicted classes
print("\n🔍 Confusion Matrix:")
print(confusion_matrix(y_test, gb_preds))
print(f"Gradient Boosting Inference Time: {gb_inference_time:.4f} seconds")

✅ Random Forest Accuracy: 0.9712

🔍 Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       126
           1       0.97      0.97      0.97       117

    accuracy                           0.97       243
   macro avg       0.97      0.97      0.97       243
weighted avg       0.97      0.97      0.97       243


🔍 Confusion Matrix:
[[123   3]
 [  4 113]]
Random Forest Inference Time: 0.0074 seconds


In [None]:
# Fit a fresh default-parameter Gradient Boosting model (fit() returns the
# estimator itself, so construction and training can be chained) ...
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# ... and regenerate the test-set predictions from it.
gb_preds = gb_model.predict(X_test)