In [2]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [3]:
# Make the user's Google Drive visible to the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import time
import warnings

import numpy as np
import pandas as pd
from google.colab import drive
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    classification_report, accuracy_score, precision_score,
    recall_score, f1_score, confusion_matrix
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Mount Google Drive. The call is kept here so this cell can be run on its
# own; Colab reports "Drive already mounted" if an earlier cell did it.
drive.mount('/content/drive')

# Load dataset. 'Chromosome' is forced to str via the explicit dtype mapping.
file_path = '/content/drive/MyDrive/Copy of final_dataset'
df = pd.read_csv(file_path, dtype={'Chromosome': str}, low_memory=False)

# Target and Features
target = 'Grouped_ClinicalSignificance'
X = df.drop(columns=[target])
y = df[target]

# Encode target: `le` is kept so that the integer labels can be mapped back to
# the original class names in the classification report below.
le = LabelEncoder()
y = le.fit_transform(y)

# Encode categorical features for LightGBM: every object/category column is
# cast to str and replaced by integer codes from a per-column LabelEncoder.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
X = X.copy()
for col in categorical_cols:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Train-test split: 80/20, stratified on the encoded target so both partitions
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Model setup
model = LGBMClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=7,
    class_weight='balanced',
    random_state=42,
    verbose=-1
)

# Warnings are silenced only while the model is fitted and used for
# prediction. The filter state is restored when the `with` block exits, so
# warnings raised anywhere else in the notebook remain visible.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Train and time the process. perf_counter() is a monotonic clock, so the
    # measured duration is not affected by system clock adjustments.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    end_time = time.perf_counter()

    # Predictions
    start_infer = time.perf_counter()
    y_pred = model.predict(X_test)
    end_infer = time.perf_counter()

inference_time = end_infer - start_infer

# Evaluation metrics (support-weighted averages across all classes).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
conf_matrix = confusion_matrix(y_test, y_pred)
# Every off-diagonal entry is a sample whose true class (row) was missed, i.e.
# a false negative for that class; summed over classes this equals the total
# number of misclassified samples: all entries minus the diagonal.
false_negatives = conf_matrix.sum() - np.trace(conf_matrix)

# Print results
print("✅ Accuracy:", f"{accuracy:.4f}")
print("🎯 Precision:", f"{precision:.4f}")
print("🔁 Recall:", f"{recall:.4f}")
print("📊 F1 Score:", f"{f1:.4f}")
print(f"❌ Total False Negatives: {false_negatives}")
print(f"⏱️ Computation Time: {end_time - start_time:.3f} seconds\n")
print(f"⚡ Inference Time: {inference_time:.4f} seconds")
# zero_division=0 matches the individual metric calls above: ill-defined
# per-class scores are reported as 0 without emitting a warning.
print("📋 Classification Report:\n", classification_report(
    y_test, y_pred, target_names=le.classes_, zero_division=0
))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Accuracy: 0.7988
🎯 Precision: 0.8252
🔁 Recall: 0.7988
📊 F1 Score: 0.8087
❌ Total False Negatives: 243577
⏱️ Computation Time: 134.240 seconds

⚡ Inference Time: 38.3096 seconds
📋 Classification Report:
                                              precision    recall  f1-score   support

                                     Benign       0.46      0.66      0.54     97674
Conflicting classification of pathogenicity       1.00      1.00      1.00     53770
                              Likely benign       0.83      0.78      0.80    329237
                          Likely pathogenic       0.35      0.46      0.39     47932
                                 Pathogenic       0.61      0.76      0.68     58924
                                Risk allele       0.16      1.00      0.28         9
                     Uncertain significance       0.92      0.84      