In [1]:
pip install numpy pandas scikit-learn tensorflow xgboost matplotlib



In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm

In [3]:
# Load the preprocessed dataset (adjust the path if needed).
df = pd.read_csv("/content/preprocessed_dataset.csv")

# Target: the PotentialFraud column.
y = df["PotentialFraud"]

# Features: every remaining column except the three per-physician fraud
# probability columns.
# NOTE(review): presumably dropped to avoid leaking the label -- confirm.
X = df.drop(
    columns=[
        'PotentialFraud',
        'AttendingPhysician_fraud_prob',
        'OperatingPhysician_fraud_prob',
        'OtherPhysician_fraud_prob',
    ]
)

In [4]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# partitions keep the same class proportions, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [12]:
# Level-0 learners for the stack, as (name, estimator) pairs.
base_models = [
    (
        "rf",
        RandomForestClassifier(random_state=42, n_estimators=100),
    ),
    (
        "xgb",
        XGBClassifier(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            random_state=42,
        ),
    ),
]


In [13]:
# Final estimator of the stack; it is trained on the base models' predictions.
meta_model = LogisticRegression(
    random_state=42,
    max_iter=1000,
)


In [14]:
class StackingWithProgress(StackingClassifier):
    """StackingClassifier that prints a short banner before training starts.

    The cross-validated fitting of the base estimators is handled inside
    scikit-learn, so this subclass cannot report real per-fold progress. It
    only announces the start of training and the configured ``cv`` setting,
    then delegates to the parent ``fit``.
    """

    def fit(self, X, y, **fit_params):
        """Announce training, then fit the stacking ensemble.

        Parameters
        ----------
        X, y
            Training features and labels, forwarded unchanged to the parent
            ``fit``.
        **fit_params
            Extra keyword arguments, forwarded unchanged to the parent ``fit``.

        Returns
        -------
        The value returned by the parent ``fit``.
        """
        print("\nTraining Stacking Ensemble...\n")

        # `cv` is not necessarily an int: it may be None (the default), a
        # splitter object, an iterable of splits, or "prefit". Describe it
        # instead of iterating over range(cv), which raised TypeError for every
        # non-integer setting and only drew a bar that finished before any
        # fitting had started.
        cv = self.cv
        if cv is None:
            cv_description = "scikit-learn's default cross-validation"
        elif isinstance(cv, int):
            cv_description = f"{cv}-fold cross-validation"
        else:
            cv_description = f"cross-validation strategy {cv!r}"
        print(f"Fitting base estimators using {cv_description}.")

        return super().fit(X, y, **fit_params)

In [15]:
# Assemble the stacking ensemble from the base learners and the meta-learner.
stack_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3,                # 3 cross-validation folds
    passthrough=False,   # final estimator sees base-model predictions only
    n_jobs=-1,           # use all available workers
)

In [16]:
from joblib import parallel_backend

# Train the ensemble with joblib's thread-based backend active, so the
# n_jobs=-1 parallelism requested on the model runs in threads.
with parallel_backend(backend="threading"):
    stack_model.fit(X_train_scaled, y_train)


In [11]:
# Score the fitted ensemble on the held-out (scaled) test split.
y_pred = stack_model.predict(X_test_scaled)

# Per-class precision/recall/F1 followed by overall accuracy.
print("\nEnsemble Model Performance:\n")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")


Ensemble Model Performance:

              precision    recall  f1-score   support

           0       0.75      0.84      0.79     69083
           1       0.68      0.55      0.61     42560

    accuracy                           0.73    111643
   macro avg       0.72      0.69      0.70    111643
weighted avg       0.72      0.73      0.72    111643

Accuracy: 0.7297904929104377
