In [1]:
import pandas as pd

In [2]:
# Load the already-cleaned health-claims table from the data directory next to this notebook's folder.
df_cleaned = pd.read_csv(filepath_or_buffer='../data/health_claims_cleaned.csv')

In [3]:
# Preview the first five rows of the cleaned DataFrame as a sanity check.
df_cleaned.head(n=5)

Unnamed: 0,Patient_Age,Patient_State,Provider_Specialty,Diagnosis_Code,Procedure_Code,Number_of_Previous_Claims_Patient,Number_of_Previous_Claims_Provider,Is_Fraudulent,Claim_Service_Difference,Cost_Per_Procedure,...,Service_Type_Inpatient,Service_Type_Laboratory,Service_Type_Outpatient,Service_Type_Pharmacy,Risk_Category_Low Risk,Patient_Age_Group_Child,Patient_Age_Group_MiddleAged,Patient_Age_Group_Senior,Provider_Patient_Distance_Category_Low,Provider_Patient_Distance_Category_Medium
0,-0.65054,1.261974,1.65961,0.151646,0.14061,-0.01728,0.901833,0,0.0364,-0.430402,...,2.261636,-0.446332,-0.446733,-0.451053,0.72244,-0.51179,-0.597731,-0.635397,-0.582942,-0.990887
1,-0.194227,-0.635083,1.65961,2.472967,0.826761,-0.01728,-0.608247,1,-1.635309,1.582017,...,-0.442158,-0.446332,2.238474,-0.451053,0.72244,-0.51179,-0.597731,-0.635397,1.715437,-0.990887
2,1.707077,0.904174,-0.610372,-0.427573,-2.627671,-0.01728,-1.212278,0,-0.115574,2.102428,...,-0.442158,-0.446332,-0.446733,2.217035,0.72244,-0.51179,-0.597731,1.573819,-0.582942,1.009197
3,0.148008,1.141892,-0.567247,-0.51165,0.135826,-0.01728,-1.212278,1,-1.521329,-0.598905,...,-0.442158,-0.446332,-0.446733,2.217035,0.72244,-0.51179,1.672992,-0.635397,-0.582942,1.009197
4,-0.802644,0.904174,-0.567247,-0.445633,0.135826,-0.01728,0.750825,0,-0.343534,0.393473,...,-0.442158,-0.446332,-0.446733,-0.451053,0.72244,-0.51179,-0.597731,-0.635397,-0.582942,-0.990887


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from joblib import Parallel, delayed

# Prepare features and target.
# The target is dropped strictly so that a missing/misspelled column raises KeyError.
X = df_cleaned.drop(columns=['Is_Fraudulent'])
# 'Unnamed: 0' is the name pandas assigns to a row index that was written to CSV
# without a header. A row counter is not a real feature and would only add noise
# (or leak row ordering) to the models, so drop it when it is present.
# NOTE(review): the head() preview of df_cleaned appears to list such a column —
# confirm; errors='ignore' makes this a no-op if it does not exist.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')
y = df_cleaned['Is_Fraudulent']

# Split data: 80% train, 20% test (stratify to maintain target distribution)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)

# Names of the float64/int64 feature columns (not used further in this cell)
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Candidate estimators, each paired with a deliberately small search grid to keep tuning fast.
# Grid keys use the 'model__' prefix because the estimator is tuned as the 'model' step
# of a Pipeline.
models_params = dict(
    LogisticRegression=dict(
        model=LogisticRegression(max_iter=1000, random_state=42),
        params={
            'model__C': [0.1, 1, 10],
            'model__penalty': ['l2'],
        },
    ),
    RandomForest=dict(
        model=RandomForestClassifier(random_state=42, n_jobs=-1),
        params={
            'model__n_estimators': [100, 200],
            'model__max_depth': [10, 20],
        },
    ),
    GradientBoosting=dict(
        model=GradientBoostingClassifier(random_state=42),
        params={
            'model__n_estimators': [100, 200],
            'model__learning_rate': [0.01, 0.1],
        },
    ),
    XGBoost=dict(
        model=XGBClassifier(
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=42,
            n_jobs=-1,
        ),
        params={
            'model__n_estimators': [100, 200],
            'model__learning_rate': [0.01, 0.1],
        },
    ),
)

# Three shuffled, stratified folds (a small fold count chosen for speed); seeded for repeatability.
cv = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=42,
)

def train_model(model_name, mp):
    """Grid-search one candidate model and score the winner on the held-out test split.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``cv``.

    Parameters
    ----------
    model_name : str
        Label used in progress messages and in the returned record.
    mp : dict
        Entry with a ``'model'`` estimator and a ``'params'`` grid whose keys
        address the ``'model'`` pipeline step.

    Returns
    -------
    dict
        ``Model``, ``Best_Params``, ``ROC_AUC`` plus ``Precision``, ``Recall`` and
        ``F1_Score`` taken from the classification report entry for class ``'1'``.
    """
    print(f"Running {model_name}...")

    # StandardScaler is applied to every column of X before the estimator;
    # the search is scored by ROC AUC over the shared CV splitter.
    search = GridSearchCV(
        Pipeline([
            ('scaler', StandardScaler()),
            ('model', mp['model']),
        ]),
        param_grid=mp['params'],
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=0,
    )
    search.fit(X_train, y_train)

    # Pipeline refit with the best parameter combination
    tuned = search.best_estimator_

    # Hard labels for the report, column-1 probabilities for ROC AUC
    test_labels = tuned.predict(X_test)
    test_scores = tuned.predict_proba(X_test)[:, 1]

    class_one = classification_report(y_test, test_labels, output_dict=True)['1']
    roc_auc = roc_auc_score(y_test, test_scores)

    print(f"{model_name} - ROC AUC: {roc_auc:.4f}")

    return {
        'Model': model_name,
        'Best_Params': search.best_params_,
        'ROC_AUC': roc_auc,
        'Precision': class_one['precision'],
        'Recall': class_one['recall'],
        'F1_Score': class_one['f1-score'],
    }

# Dispatch one train_model task per entry in models_params across joblib workers.
# NOTE(review): train_model itself runs GridSearchCV with n_jobs=-1 — confirm the
# nested parallelism does not oversubscribe the machine.
model_tasks = (
    delayed(train_model)(name, spec)
    for name, spec in models_params.items()
)
results = Parallel(n_jobs=-1)(model_tasks)

# Tabulate the per-model records, best test-set ROC AUC first
results_df = pd.DataFrame(results).sort_values(by='ROC_AUC', ascending=False)
print("\nModel Performance Summary:")
print(results_df)



Model Performance Summary:
                Model                                        Best_Params  \
2    GradientBoosting  {'model__learning_rate': 0.1, 'model__n_estima...   
3             XGBoost  {'model__learning_rate': 0.1, 'model__n_estima...   
1        RandomForest  {'model__max_depth': 20, 'model__n_estimators'...   
0  LogisticRegression           {'model__C': 10, 'model__penalty': 'l2'}   

    ROC_AUC  Precision    Recall  F1_Score  
2  0.868757   0.943534  0.516966  0.667956  
3  0.868373   0.925606  0.533932  0.677215  
1  0.860923   0.933333  0.516966  0.665382  
0  0.808378   0.715200  0.446108  0.549478  
