In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib  # To save the model

# Read the feature table from disk into a DataFrame.
file_path = "data/featured_clinvar_result.csv"
df = pd.read_csv(file_path)

# The "label" column is the prediction target; every remaining column
# is used as a model input.
y = df["label"]
X = df.drop("label", axis=1)


In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"✅ Train-Test Split Done: {X_train.shape[0]} Train Samples, {X_test.shape[0]} Test Samples")


✅ Train-Test Split Done: 972 Train Samples, 243 Test Samples


In [3]:
import time

# Decision tree with default hyperparameters; the fixed seed keeps the
# fitted tree reproducible between runs.
dt_model = DecisionTreeClassifier(random_state=42)

# Time the fit with perf_counter(): it is monotonic and high-resolution,
# whereas time.time() follows the wall clock and can jump or be too coarse
# for intervals this short.
start_time = time.perf_counter()
dt_model.fit(X_train, y_train)
dt_training_time = time.perf_counter() - start_time
print("✅ Decision Tree Model Trained Successfully!")
print(f"Decision tree Training Time: {dt_training_time:.4f} seconds")

# Time prediction over the whole held-out test set.
start_time = time.perf_counter()
dt_preds = dt_model.predict(X_test)
dt_inference_time = time.perf_counter() - start_time
print(f"Decision Tree Inference Time: {dt_inference_time:.4f} seconds")

✅ Decision Tree Model Trained Successfully!
Decision tree Training Time: 0.0178 seconds
Decision Tree Inference Time: 0.0021 seconds


In [5]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, dt_preds)
print(f"✅ Decision Tree Accuracy: {accuracy:.4f}")

# Per-class precision, recall, F1 and support as a formatted text table.
dt_report = classification_report(y_test, dt_preds)
print("\n🔍 Classification Report:")
print(dt_report)

# Counts of true vs. predicted labels (rows = true class, columns = predicted).
dt_confusion = confusion_matrix(y_test, dt_preds)
print("\n🔍 Confusion Matrix:")
print(dt_confusion)

✅ Decision Tree Accuracy: 0.9588

🔍 Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       126
           1       0.97      0.95      0.96       117

    accuracy                           0.96       243
   macro avg       0.96      0.96      0.96       243
weighted avg       0.96      0.96      0.96       243


🔍 Confusion Matrix:
[[122   4]
 [  6 111]]


In [6]:
from pathlib import Path

# Persist the trained Decision Tree model to disk.
model_path = "models/decision_tree.pkl"

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists; otherwise a fresh checkout fails with FileNotFoundError.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(dt_model, model_path)

print(f"✅ Model Saved at: {model_path}")


✅ Model Saved at: models/decision_tree.pkl
