In [29]:
import numpy as np
import pandas as pd
import joblib
import time
import os
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import sklearn.metrics as metrics
import optuna
from datetime import datetime
from warnings import filterwarnings
# NOTE(review): this silences every warning category for the whole kernel
# session, so deprecation and data-conversion warnings raised by later cells
# are never shown.
filterwarnings("ignore")
# Show every column of a frame, but cap displayed rows at 20.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 20)

### Training Data

In [2]:
# Load the processed training table (path is relative to the notebook's
# working directory).
train = pd.read_csv("../data/processed/training_data.csv")

In [3]:
# Preview the first rows to check the available columns.
train.head()

Unnamed: 0,Provider,PotentialFraud,count_unique_beneficiary,count_unique_claims,count_dead_beneficiary,count_unique_states,count_unique_counties,mean_hospital_stay_days,max_hospital_stay_days,total_top_diagnosis_codes,mean_total_diagnosis,mean_total_procedures,mean_number_of_physicians,mean_difference_stay_vs_claim,patients_under_top_attending_physician,patients_under_top_operating_physician,mean_claim_amount,total_claim_amount,std_claim_amount,mean_age,count_of_males,count_of_females,count_of_black_people,count_of_white_people,count_of_hispanic_people,count_of_other_people,count_alzheimer,count_heartfailure,count_kidneydisease,count_cancer,count_obstrpulmonary,count_depression,count_diabetes,count_ischemicheart,count_osteoporasis,count_rheumatoidarthritis,count_stroke,count_renal_disease,mean_annual_inpatient_reimbursement,mean_annual_inpatient_deductible,mean_annual_outpatient_reimbursement,mean_annual_outpatient_deductible,count_inpatient,count_outpatient,claims_per_bene
0,PRV51001,0,24,25,0,1,7,1.2,15.0,3,3.2,0.12,1.6,0.0,0,0,4185.6,104640,10796.091144,77.88,9,16,4,21,0,0,15,19,17,5,10,9,21,23,6,8,6,8,17606.0,897.12,2615.2,463.92,5,20,1.041667
1,PRV51003,1,117,132,1,3,23,2.893939,28.0,26,5.25,0.363636,1.530303,0.0,0,0,4588.409091,605670,7309.794729,69.083333,54,78,24,107,1,0,56,80,64,10,41,54,100,112,33,38,12,29,7568.181818,931.424242,2678.181818,737.121212,62,70,1.128205
2,PRV51004,0,138,149,1,9,28,0.0,0.0,7,2.583893,0.0,1.604027,0.0,0,0,350.134228,52170,689.963754,71.261745,46,103,24,120,0,5,64,88,50,16,41,63,105,108,49,46,17,23,4351.879195,434.95302,2194.899329,622.751678,0,149,1.07971
3,PRV51005,1,495,1165,4,4,26,0.0,0.0,82,2.588841,0.0,1.599142,0.0,0,0,241.124464,280910,491.556392,69.52618,511,654,262,893,0,10,426,680,507,165,295,485,799,895,344,331,124,259,3623.991416,379.162232,2109.733906,636.328755,0,1165,2.353535
4,PRV51007,0,58,72,1,2,6,0.263889,8.0,4,2.986111,0.013889,1.527778,0.0,0,0,468.194444,33710,1433.769116,68.305556,34,38,14,58,0,0,26,40,22,12,16,29,49,51,21,22,12,11,3050.0,445.0,1729.722222,469.722222,3,69,1.241379


In [4]:
# Columns routed to StandardScaler by the preprocessor.
standard_scaling_column = [
    'mean_age',
    'mean_claim_amount',
    'total_claim_amount',
    'std_claim_amount',
    'mean_hospital_stay_days',
    'max_hospital_stay_days',
    'mean_annual_inpatient_reimbursement',
    'mean_annual_inpatient_deductible',
    'mean_annual_outpatient_reimbursement',
    'mean_annual_outpatient_deductible',
]

In [5]:
# Columns routed to MinMaxScaler by the preprocessor.
min_max_cols = [
    # claim / beneficiary volume
    'count_unique_beneficiary',
    'count_unique_claims',
    'claims_per_bene',
    # geography
    'count_unique_states',
    'count_unique_counties',
    # per-claim diagnosis / procedure / physician statistics
    'total_top_diagnosis_codes',
    'mean_total_diagnosis',
    'mean_total_procedures',
    'mean_number_of_physicians',
    'mean_difference_stay_vs_claim',
    # claim type
    'count_inpatient',
    'count_outpatient',
    # demographics
    'count_of_males',
    'count_of_females',
    'count_of_black_people',
    'count_of_white_people',
    'count_of_hispanic_people',
    'count_of_other_people',
]

In [6]:
# Feature matrix: everything except the provider identifier and the label.
X = train.drop(columns=['Provider', 'PotentialFraud'])

In [7]:
# Target kept as a one-column DataFrame (double brackets); the later
# scale_pos_weight cell indexes its class counts by 'PotentialFraud'.
y = train[['PotentialFraud']]

In [8]:
# Stratified hold-out split: 33% of the rows go to the test split, with a
# fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [9]:
# Column-wise scaling: each (transformer, columns) pair below is applied to its
# own column list; columns in neither list are passed through unchanged.
scaler_assignments = [
    (StandardScaler(), standard_scaling_column),
    (MinMaxScaler(), min_max_cols),
]
preprocessor = make_column_transformer(*scaler_assignments, remainder='passthrough')

## Random Forest

In [11]:
# Baseline Random Forest: class-balanced weights, fixed seed, and n_jobs=-1
# for parallel fitting.
rf = RandomForestClassifier(n_jobs=-1, random_state=42, class_weight='balanced')

In [12]:
# Chain the shared preprocessor with the Random Forest.
pipe_rf = make_pipeline(preprocessor, rf)

In [13]:
# Fit the baseline Random Forest pipeline and report how long fitting took.
# perf_counter() is a monotonic clock intended for measuring durations;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start_train = time.perf_counter()
# y_train is a one-column DataFrame; np.ravel() hands the forest a 1-D target
# (scikit-learn otherwise warns about a column-vector y, and that warning is
# hidden by the global filterwarnings("ignore")).
pipe_rf.fit(X_train, np.ravel(y_train))
train_time = time.perf_counter() - start_train
print(f"⏱ Training time: {train_time:.2f} seconds")

⏱ Training time: 0.36 seconds


In [15]:
# Score both splits with the second predict_proba column and compare ROC-AUC
# on the training and test data.
y_pred_train_rf = pipe_rf.predict_proba(X_train)[:, 1]
y_pred_test_rf = pipe_rf.predict_proba(X_test)[:, 1]

train_auc_rf = metrics.roc_auc_score(y_train, y_pred_train_rf)
test_auc_rf = metrics.roc_auc_score(y_test, y_pred_test_rf)
print(f"Training roc-auc score: {train_auc_rf}")
print(f"Testing roc-auc score: {test_auc_rf}")

Training roc-auc score: 1.0
Testing roc-auc score: 0.9491128921896788


In [16]:
# Per-class precision / recall / F1 on the test split, using the labels
# returned by predict().
print(metrics.classification_report(y_test, pipe_rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97      1619
           1       0.75      0.55      0.63       167

    accuracy                           0.94      1786
   macro avg       0.85      0.77      0.80      1786
weighted avg       0.94      0.94      0.94      1786



The model is overfitted (training ROC-AUC of 1.0 versus 0.949 on the test split) and captures only 55% of the fraudulent providers (recall for class 1).

## XGBoost

In [17]:
# Class-imbalance weight for XGBoost: count of label-0 rows divided by count
# of label-1 rows in the training split, rounded to the nearest whole number.
train_labels = y_train['PotentialFraud']
n_label_0 = (train_labels == 0).sum()
n_label_1 = (train_labels == 1).sum()
scale_pos_weight = np.round(n_label_0 / n_label_1)

In [18]:
# Baseline XGBoost with library-default tree settings; only the objective,
# metric, seed, parallelism and class weight are set explicitly.
xgb = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    eval_metric='auc',
    objective='binary:logistic',
)

In [20]:
# Chain the shared preprocessor with the baseline XGBoost model.
pipe_xgb = make_pipeline(preprocessor, xgb)

In [21]:
# Fit the baseline XGBoost pipeline and report how long fitting took.
# perf_counter() is a monotonic clock intended for measuring durations;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start_train = time.perf_counter()
pipe_xgb.fit(X_train, y_train)
train_time = time.perf_counter() - start_train
print(f"⏱ Training time: {train_time:.2f} seconds")

⏱ Training time: 1.70 seconds


In [22]:
# Score both splits with the second predict_proba column and compare ROC-AUC
# on the training and test data.
y_pred_train_xgb = pipe_xgb.predict_proba(X_train)[:, 1]
y_pred_test_xgb = pipe_xgb.predict_proba(X_test)[:, 1]

train_auc_xgb = metrics.roc_auc_score(y_train, y_pred_train_xgb)
test_auc_xgb = metrics.roc_auc_score(y_test, y_pred_test_xgb)
print(f"Training roc-auc score: {train_auc_xgb}")
print(f"Testing roc-auc score: {test_auc_xgb}")

Training roc-auc score: 1.0
Testing roc-auc score: 0.9506200693116545


In [23]:
# Per-class precision / recall / F1 on the test split, using the labels
# returned by predict().
print(metrics.classification_report(y_test, pipe_xgb.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1619
           1       0.69      0.68      0.68       167

    accuracy                           0.94      1786
   macro avg       0.83      0.83      0.83      1786
weighted avg       0.94      0.94      0.94      1786



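The model is still overfitted (training ROC-AUC of 1.0 versus 0.951 on the test split), but it captures more fraudulent providers than the Random Forest (recall of 0.68 versus 0.55), so we will fine-tune XGBoost.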

## Fine tuning

In [26]:
def objective(trial):
    """Optuna objective: mean cross-validated ROC-AUC of an XGBoost pipeline.

    Draws one hyper-parameter configuration from ``trial``, wraps an
    ``XGBClassifier`` built from it in a pipeline with the shared
    ``preprocessor``, and scores that pipeline on ``X_train`` / ``y_train``
    with 5-fold stratified cross-validation.

    Parameters
    ----------
    trial : optuna trial object
        Used only through ``suggest_int`` / ``suggest_float``.

    Returns
    -------
    float
        Mean of the five per-fold ``roc_auc`` scores.
    """
    # Tuned settings. The suggest_* calls are kept in this order so the
    # sequence of draws requested from the sampler is unchanged.
    tuned = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 600),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        "subsample": trial.suggest_float("subsample", 0.5, 0.9),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 0.9),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5),
    }
    # Settings held constant across trials.
    fixed = {
        "random_state": 42,
        "n_jobs": -1,
        "scale_pos_weight": scale_pos_weight,
        "objective": "binary:logistic",
        "eval_metric": "auc",
    }

    candidate = make_pipeline(preprocessor, XGBClassifier(**tuned, **fixed))
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_val_score(
        candidate, X_train, y_train, cv=folds, scoring='roc_auc'
    )
    return fold_scores.mean()

In [27]:
# In-memory Optuna study that maximises the objective's mean CV ROC-AUC.
# The sampler is seeded so that a Restart & Run All proposes the same sequence
# of trials; without a seed the default sampler draws differently on every run
# and the "best" parameters are not reproducible.
study = optuna.create_study(
    direction='maximize',  # Maximize AUC
    study_name='xgboost_fraud_detection',
    sampler=optuna.samplers.TPESampler(seed=42),
)

[I 2025-12-29 21:13:19,087] A new study created in memory with name: xgboost_fraud_detection


In [30]:
# Run 100 trials of the objective with a progress bar; each trial performs a
# 5-fold cross-validation, so this is the slow cell of the notebook.
study.optimize(objective, n_trials=100, show_progress_bar=True)

  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-12-29 21:13:46,772] Trial 1 finished with value: 0.9422322980851853 and parameters: {'n_estimators': 376, 'learning_rate': 0.07448633498465948, 'max_depth': 6, 'subsample': 0.7638955606563205, 'colsample_bytree': 0.7900829799946718, 'min_child_weight': 9, 'gamma': 0.24095129886647548, 'reg_alpha': 1.671623137790793, 'reg_lambda': 4.371861095710674}. Best is trial 1 with value: 0.9422322980851853.
[I 2025-12-29 21:13:48,686] Trial 2 finished with value: 0.9458260002699369 and parameters: {'n_estimators': 127, 'learning_rate': 0.021191236723614205, 'max_depth': 7, 'subsample': 0.6952233781189163, 'colsample_bytree': 0.7132649845811445, 'min_child_weight': 2, 'gamma': 0.6674373012846787, 'reg_alpha': 3.3261514924782287, 'reg_lambda': 0.40017976653055287}. Best is trial 2 with value: 0.9458260002699369.
[I 2025-12-29 21:13:52,820] Trial 3 finished with value: 0.9458909454874433 and parameters: {'n_estimators': 473, 'learning_rate': 0.018504907444419814, 'max_depth': 7, 'subsample':

In [36]:
# Final model configuration: hyper-parameters of the best Optuna trial plus the
# same fixed settings that `objective` used while tuning. The original refit
# switched eval_metric to "logloss" and omitted the objective; both are now
# stated exactly as in the tuning runs so the refit model is declared the same
# way as the model that was scored. The fit cell passes no eval_set, so the
# metric setting is not expected to change the fitted trees.
best_params = {
    **study.best_params,
    "random_state": 42,
    "n_jobs": -1,
    "scale_pos_weight": scale_pos_weight,
    "objective": "binary:logistic",
    "eval_metric": "auc",
}

# NOTE: `xgb` and `pipe_xgb` are rebound here; from this cell on they refer to
# the tuned model, not the baseline fitted earlier.
xgb = XGBClassifier(**best_params)
pipe_xgb = make_pipeline(preprocessor, xgb)

In [37]:
# Fit the tuned XGBoost pipeline and report how long fitting took.
# perf_counter() is a monotonic clock intended for measuring durations;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start_train = time.perf_counter()
pipe_xgb.fit(X_train, y_train)
train_time = time.perf_counter() - start_train
print(f"⏱ Training time: {train_time:.2f} seconds")

⏱ Training time: 0.28 seconds


In [38]:
# Score both splits with the tuned pipeline (second predict_proba column) and
# compare ROC-AUC on the training and test data.
y_pred_train_xgb = pipe_xgb.predict_proba(X_train)[:, 1]
y_pred_test_xgb = pipe_xgb.predict_proba(X_test)[:, 1]

train_auc_xgb = metrics.roc_auc_score(y_train, y_pred_train_xgb)
test_auc_xgb = metrics.roc_auc_score(y_test, y_pred_test_xgb)
print(f"Training roc-auc score: {train_auc_xgb}")
print(f"Testing roc-auc score: {test_auc_xgb}")

Training roc-auc score: 0.9796886715785975
Testing roc-auc score: 0.9568743920435842


In [39]:
# Per-class precision / recall / F1 of the tuned pipeline on the test split,
# using the labels returned by predict().
print(metrics.classification_report(y_test, pipe_xgb.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.91      0.95      1619
           1       0.50      0.86      0.63       167

    accuracy                           0.91      1786
   macro avg       0.74      0.89      0.79      1786
weighted avg       0.94      0.91      0.92      1786



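The tuned model overfits much less (training ROC-AUC of 0.980 versus 0.957 on the test split) and captures about 86% of the fraudulent providers, at the cost of lower precision for the fraud class (0.50).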

### Testing Pipeline

In [40]:
# Raw files for the unlabeled test set: beneficiary attributes, inpatient and
# outpatient claims, and the list of providers to score.
test_beneficiary = pd.read_csv("../data/Test_Beneficiarydata-1542969243754.csv")
test_inpatient = pd.read_csv("../data/Test_Inpatientdata-1542969243754.csv")
test_outpatient = pd.read_csv("../data/Test_Outpatientdata-1542969243754.csv")
test_providers = pd.read_csv("../data/Test-1542969243754.csv")

In [41]:
# Lookup tables read from the processed-data folder; later cells use them to
# flag test claims whose admit-diagnosis code or physician appears in them.
# NOTE(review): presumably these were produced from the training data - confirm.
top_diagnosis_code = pd.read_csv('../data/processed/top_diagnosis_code.csv')
top_attending_physician = pd.read_csv('../data/processed/top_attending_physician.csv')
top_operating_physician = pd.read_csv('../data/processed/top_operating_physician.csv')

In [42]:
# Recode the chronic-condition flags so that 2 becomes 0 and 1 stays 1.
# Already-recoded values are left untouched, so this step is safe to re-run.
chronic_mask = test_beneficiary.columns.str.contains('ChronicCond_')
test_beneficiary.loc[:, chronic_mask] = (
    test_beneficiary.filter(like='ChronicCond_').replace({1: 1, 2: 0})
)

# One-hot encode Gender and Race with a fixed category list, so every dummy
# column the provider aggregation reads (Gender_Male, Gender_Female,
# Race_Black, Race_White, Race_Hispanic, Race_Other) exists even when a
# category is absent from this file.
gender_labels = test_beneficiary['Gender'].map({1: 'Male', 2: 'Female'}).astype(
    pd.CategoricalDtype(['Female', 'Male'])
)
race_labels = test_beneficiary['Race'].map(
    {1: 'White', 2: 'Black', 3: 'Other', 4: 'Unknown', 5: 'Hispanic'}
).astype(pd.CategoricalDtype(['Black', 'Hispanic', 'Other', 'Unknown', 'White']))

# Dummy columns left by an earlier run of this cell are dropped first;
# otherwise re-running would append duplicate Gender_* / Race_* columns.
# Assumes the raw file has no columns of its own starting with "Gender_" or
# "Race_" - TODO confirm.
stale_dummy_cols = test_beneficiary.filter(regex=r'^(Gender|Race)_').columns
test_beneficiary = pd.concat(
    [
        test_beneficiary.drop(columns=stale_dummy_cols),
        pd.get_dummies(gender_labels, prefix='Gender'),
        pd.get_dummies(race_labels, prefix='Race'),
    ],
    axis=1,
)

# Map the raw string codes ('0' / 'Y') to 0 / 1. Skipped once the column is
# numeric: mapping it a second time would match no key and silently turn the
# whole column into NaN.
if not pd.api.types.is_numeric_dtype(test_beneficiary['RenalDiseaseIndicator']):
    test_beneficiary['RenalDiseaseIndicator'] = (
        test_beneficiary['RenalDiseaseIndicator'].map({'0': 0, 'Y': 1})
    )

In [43]:
# Tag every row of the inpatient claims table so the claim type is still
# known after the two claim tables are stacked.
test_inpatient['Is_Inpatient'] = True
test_inpatient.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DischargeDt,DiagnosisGroupCode,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6,Is_Inpatient
0,BENE11014,CLM67387,2009-09-09,2009-09-16,PRV57070,9000,PHY317786,PHY427017,,2009-09-09,5789,1068.0,2009-09-16,332,5780,5533,496,V420,40390,2851,5990,570,41071.0,4280.0,4443.0,5849.0,,,,,True
1,BENE11017,CLM31237,2008-12-25,2009-01-08,PRV54750,14000,PHY314656,PHY426644,,2008-12-25,5939,1068.0,2009-01-08,661,1889,41071,5990,5601,4588,5845,4549,29570,34831.0,,5551.0,,,,,,True
2,BENE11026,CLM78930,2009-12-09,2009-12-13,PRV53758,2000,PHY349495,,,2009-12-09,4019,1068.0,2009-12-13,241,4010,78791,60000,41401,V1254,4372,78650,7813,4254.0,,,,,,,,True
3,BENE11031,CLM56810,2009-06-23,2009-07-06,PRV55825,16000,PHY429538,PHY371893,,2009-06-23,8208,1068.0,2009-07-06,564,8208,4168,920,5990,40391,2859,4254,41400,5849.0,41401.0,8152.0,3320.0,,,,,True
4,BENE11085,CLM34625,2009-01-20,2009-01-31,PRV52338,19000,PHY397161,,,2009-01-20,4279,1068.0,2009-01-31,880,29654,V142,78702,30503,V140,V4582,V6109,7242,,,,,,,,,True


In [44]:
# Tag every row of the outpatient claims table so the claim type is still
# known after the two claim tables are stacked.
test_outpatient['Is_Outpatient'] = True
test_outpatient.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6,DeductibleAmtPaid,ClmAdmitDiagnosisCode,Is_Outpatient
0,BENE11001,CLM392397,2009-06-02,2009-06-02,PRV55962,30,PHY347633,,PHY347633,V5832,V5861,2724.0,3182.0,V5869,42731.0,,,,,,,,,,,0,,True
1,BENE11001,CLM430760,2009-06-23,2009-06-23,PRV56112,30,PHY381777,,PHY381777,9594,E9174,4019.0,,,,,,,,,,,,,,0,,True
2,BENE11007,CLM233081,2009-03-07,2009-03-07,PRV56979,200,PHY425311,,PHY425311,7248,,,,,,,,,,,,,,,,0,,True
3,BENE11007,CLM496381,2009-07-29,2009-07-29,PRV56573,10,PHY393253,PHY347995,,58889,2449,,,,,,,,,,,,,,,0,5939.0,True
4,BENE11007,CLM521391,2009-08-12,2009-08-12,PRV56573,10,PHY417685,,PHY382041,V666,,,,,,,,,,,,,,,,0,,True


In [45]:
# Stack inpatient and outpatient claims into one claim-level table. Columns
# present in only one source (e.g. Is_Inpatient / Is_Outpatient) are NaN for
# rows from the other. ignore_index=True gives the result a fresh 0..n-1
# index; without it both sources keep their own row labels, so index labels
# repeat and any label-based selection or alignment on this frame is ambiguous.
test_patient_data = pd.concat([test_inpatient, test_outpatient], ignore_index=True)

In [46]:
# Parse the four date columns into datetimes so they can be subtracted.
for date_col in ('ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt'):
    test_patient_data[date_col] = pd.to_datetime(test_patient_data[date_col])

In [47]:
# True where the claim starts before the admission date / ends after the
# discharge date. Rows with a missing date on either side compare as False.
days_start_minus_admission = (
    test_patient_data['ClaimStartDt'] - test_patient_data['AdmissionDt']
).dt.days
days_end_minus_discharge = (
    test_patient_data['ClaimEndDt'] - test_patient_data['DischargeDt']
).dt.days
test_patient_data['billing_before_admission'] = days_start_minus_admission < 0
test_patient_data['billing_after_discharge'] = days_end_minus_discharge > 0

In [48]:
# 1 when the claim's admit-diagnosis code appears in the top-code lookup table,
# else 0 (missing codes give 0).
# NOTE(review): isin() compares raw values, so the two columns must have been
# parsed to the same type (string vs. numeric) - confirm.
admit_code_is_top = test_patient_data['ClmAdmitDiagnosisCode'].isin(
    top_diagnosis_code['ClmAdmitDiagnosisCode']
)
test_patient_data['top_diagnosis_code'] = admit_code_is_top.astype(int)

In [49]:
# Length of stay in days, counting both the admission and the discharge day.
stay_span = test_patient_data['DischargeDt'] - test_patient_data['AdmissionDt']
test_patient_data['hospital_stay_days'] = stay_span.dt.days + 1

# Number of non-missing diagnosis / procedure code columns on each claim.
for count_col, code_prefix in (
    ('num_diagnoses', 'ClmDiagnosisCode_'),
    ('num_procedures', 'ClmProcedureCode_'),
):
    test_patient_data[count_col] = (
        test_patient_data.filter(like=code_prefix).notna().sum(axis=1)
    )

In [50]:
# Claim length in days, counting both the start and the end day.
claim_span = test_patient_data['ClaimEndDt'] - test_patient_data['ClaimStartDt']
test_patient_data['claim_duration_days'] = claim_span.dt.days + 1

# Number of non-missing columns whose name contains "Physician" on each claim.
physician_cols = test_patient_data.filter(like='Physician')
test_patient_data['num_of_physicians'] = physician_cols.notna().sum(axis=1)

# Binary columns: 1 when the claim's physician appears in the lookup table.
attending_in_top = test_patient_data['AttendingPhysician'].isin(
    top_attending_physician['AttendingPhysician']
)
operating_in_top = test_patient_data['OperatingPhysician'].isin(
    top_operating_physician['OperatingPhysician']
)
test_patient_data['is_top20_attending'] = attending_in_top.astype(int)
test_patient_data['is_top20_operating'] = operating_in_top.astype(int)

# Gap between length of stay and claim length (NaN where there is no stay).
test_patient_data['stay_vs_claim_diff'] = (
    test_patient_data['hospital_stay_days'] - test_patient_data['claim_duration_days']
)

In [51]:
# Attach beneficiary attributes to every claim; the left join keeps claims
# whose BeneID has no match in the beneficiary table.
test_patient_data = test_patient_data.merge(test_beneficiary, how='left', on='BeneID')

# Age at claim start in whole years (days / 365.25, truncated by the int cast).
test_patient_data['DOB'] = pd.to_datetime(test_patient_data['DOB'])
days_since_birth = (
    pd.to_datetime(test_patient_data['ClaimStartDt']) - test_patient_data['DOB']
).dt.days
test_patient_data['Age_at_Claim'] = (days_since_birth / 365.25).astype(int)

In [52]:
# True where a date of death (DOD) is recorded for the claim's beneficiary.
test_patient_data['Is_Dead'] = test_patient_data['DOD'].notna()

In [53]:
# Replace the gaps left by stacking the two claim tables: stay-based numbers
# default to 0, and the claim-type flag missing from the other source
# defaults to False.
fill_values = {
    'hospital_stay_days': 0,
    'stay_vs_claim_diff': 0,
    'Is_Inpatient': False,
    'Is_Outpatient': False,
}
for col_name, fill_value in fill_values.items():
    test_patient_data[col_name] = test_patient_data[col_name].fillna(fill_value)

In [54]:
# Provider-level feature table: output column -> (claim-level column, reducer).
# Entries are listed in the same order as the columns of the resulting frame.
# Every reducer runs over claim rows, so the "sum" features count claims
# carrying a flag (e.g. claims by male beneficiaries), and count_unique_claims
# uses "count" (non-null ClaimID rows) rather than a distinct count.
provider_feature_spec = {
    # volume and geography
    "count_unique_beneficiary": ("BeneID", "nunique"),
    "count_unique_claims": ("ClaimID", "count"),
    "count_dead_beneficiary": ("Is_Dead", "sum"),
    "count_unique_states": ("State", "nunique"),
    "count_unique_counties": ("County", "nunique"),
    # stays, diagnoses, procedures, physicians
    "mean_hospital_stay_days": ("hospital_stay_days", "mean"),
    "max_hospital_stay_days": ("hospital_stay_days", "max"),
    "total_top_diagnosis_codes": ("top_diagnosis_code", "sum"),
    "mean_total_diagnosis": ("num_diagnoses", "mean"),
    "mean_total_procedures": ("num_procedures", "mean"),
    "mean_number_of_physicians": ("num_of_physicians", "mean"),
    "mean_difference_stay_vs_claim": ("stay_vs_claim_diff", "mean"),
    "patients_under_top_attending_physician": ("is_top20_attending", "sum"),
    "patients_under_top_operating_physician": ("is_top20_operating", "sum"),
    # reimbursed claim amounts
    "mean_claim_amount": ("InscClaimAmtReimbursed", "mean"),
    "total_claim_amount": ("InscClaimAmtReimbursed", "sum"),
    "std_claim_amount": ("InscClaimAmtReimbursed", "std"),
    # demographics
    "mean_age": ("Age_at_Claim", "mean"),
    "count_of_males": ("Gender_Male", "sum"),
    "count_of_females": ("Gender_Female", "sum"),
    "count_of_black_people": ("Race_Black", "sum"),
    "count_of_white_people": ("Race_White", "sum"),
    "count_of_hispanic_people": ("Race_Hispanic", "sum"),
    "count_of_other_people": ("Race_Other", "sum"),
    # chronic conditions
    "count_alzheimer": ("ChronicCond_Alzheimer", "sum"),
    "count_heartfailure": ("ChronicCond_Heartfailure", "sum"),
    "count_kidneydisease": ("ChronicCond_KidneyDisease", "sum"),
    "count_cancer": ("ChronicCond_Cancer", "sum"),
    "count_obstrpulmonary": ("ChronicCond_ObstrPulmonary", "sum"),
    "count_depression": ("ChronicCond_Depression", "sum"),
    "count_diabetes": ("ChronicCond_Diabetes", "sum"),
    "count_ischemicheart": ("ChronicCond_IschemicHeart", "sum"),
    "count_osteoporasis": ("ChronicCond_Osteoporasis", "sum"),
    "count_rheumatoidarthritis": ("ChronicCond_rheumatoidarthritis", "sum"),
    "count_stroke": ("ChronicCond_stroke", "sum"),
    "count_renal_disease": ("RenalDiseaseIndicator", "sum"),
    # annual reimbursement / deductible amounts
    "mean_annual_inpatient_reimbursement": ("IPAnnualReimbursementAmt", "mean"),
    "mean_annual_inpatient_deductible": ("IPAnnualDeductibleAmt", "mean"),
    "mean_annual_outpatient_reimbursement": ("OPAnnualReimbursementAmt", "mean"),
    "mean_annual_outpatient_deductible": ("OPAnnualDeductibleAmt", "mean"),
    # claim type
    "count_inpatient": ("Is_Inpatient", "sum"),
    "count_outpatient": ("Is_Outpatient", "sum"),
}

claims_by_provider = test_patient_data.groupby(['Provider'])
test_df = claims_by_provider.agg(**provider_feature_spec).reset_index()

In [55]:
# Average number of claims per distinct beneficiary for each provider.
test_df['claims_per_bene'] = test_df['count_unique_claims'].div(
    test_df['count_unique_beneficiary']
)

In [56]:
# Left-join onto the provider list so every provider in test_providers is kept,
# even one with no aggregated claims (its feature columns would be NaN).
test_df = test_providers.merge(test_df, on="Provider", how="left")

In [57]:
# Preview the provider-level test features; the columns should line up with
# the training table apart from the missing PotentialFraud label.
test_df.head()

Unnamed: 0,Provider,count_unique_beneficiary,count_unique_claims,count_dead_beneficiary,count_unique_states,count_unique_counties,mean_hospital_stay_days,max_hospital_stay_days,total_top_diagnosis_codes,mean_total_diagnosis,mean_total_procedures,mean_number_of_physicians,mean_difference_stay_vs_claim,patients_under_top_attending_physician,patients_under_top_operating_physician,mean_claim_amount,total_claim_amount,std_claim_amount,mean_age,count_of_males,count_of_females,count_of_black_people,count_of_white_people,count_of_hispanic_people,count_of_other_people,count_alzheimer,count_heartfailure,count_kidneydisease,count_cancer,count_obstrpulmonary,count_depression,count_diabetes,count_ischemicheart,count_osteoporasis,count_rheumatoidarthritis,count_stroke,count_renal_disease,mean_annual_inpatient_reimbursement,mean_annual_inpatient_deductible,mean_annual_outpatient_reimbursement,mean_annual_outpatient_deductible,count_inpatient,count_outpatient,claims_per_bene
0,PRV51002,169,205,3,4,39,0.0,0.0,12,2.663415,0.0,1.521951,0.0,0,0,262.390244,53790,609.021752,71.360976,71,134,30,169,4,2,79,108,72,25,64,91,147,151,61,54,19,32,5180.926829,548.253659,2345.073171,677.95122,0,205,1.213018
1,PRV51006,81,102,0,2,15,0.0,0.0,7,2.823529,0.0,1.607843,0.0,0,0,301.176471,30720,520.83576,73.823529,43,59,26,76,0,0,35,69,50,15,37,53,72,78,26,16,8,10,3767.54902,479.647059,2401.666667,605.882353,0,102,1.259259
2,PRV51009,30,39,0,1,5,0.205128,4.0,3,3.076923,0.076923,1.538462,0.0,0,0,698.205128,27230,1934.691424,68.615385,16,23,6,33,0,0,8,17,19,1,15,13,29,31,20,4,3,12,3004.102564,246.461538,2441.025641,646.410256,2,37,1.3
3,PRV51010,25,38,0,1,4,0.763158,9.0,4,3.105263,0.078947,1.5,0.0,0,0,1699.473684,64580,4480.813118,74.342105,23,15,1,37,0,0,21,23,19,7,13,18,28,28,14,10,5,5,5268.421053,534.0,1775.789474,663.947368,6,32,1.52
4,PRV51018,146,190,2,2,32,0.0,0.0,10,2.694737,0.0,1.536842,0.0,0,0,324.315789,61620,673.767191,72.405263,66,124,45,145,0,0,73,109,93,25,61,86,146,135,57,44,13,41,4738.947368,534.0,2812.947368,826.105263,0,190,1.30137


In [58]:
# Missing-value count per feature, largest first.
missing_per_feature = (
    test_df.isna()
    .sum()
    .rename_axis("feature")
    .reset_index(name="missing_values")
)
missing_per_feature.sort_values("missing_values", ascending=False)

Unnamed: 0,feature,missing_values
17,std_claim_amount,49
0,Provider,0
2,count_unique_claims,0
1,count_unique_beneficiary,0
4,count_unique_states,0
...,...,...
39,mean_annual_outpatient_reimbursement,0
40,mean_annual_outpatient_deductible,0
41,count_inpatient,0
42,count_outpatient,0


In [59]:
# std_claim_amount is the only column with missing values in the summary
# above; fill those gaps with 0.
# NOTE(review): presumably these are providers with a single claim, where the
# sample standard deviation is undefined - confirm.
test_df['std_claim_amount'] = test_df['std_claim_amount'].fillna(0)

In [60]:
# Set the provider identifiers aside and remove them from the feature frame.
# The guard makes the cell re-runnable: the original in-place drop raised
# KeyError on a second execution because 'Provider' was already gone.
if 'Provider' in test_df.columns:
    provider_id = test_df[['Provider']].copy()
    test_df = test_df.drop(columns=['Provider'])

In [69]:
# Score the unlabeled providers (second predict_proba column) and show the
# share of providers on each side of a 0.50 cut-off.
y_test_df = pipe_xgb.predict_proba(test_df)[:, 1]
flags_at_050 = pd.Series((y_test_df >= 0.50).astype(int), name=0)
flags_at_050.value_counts(normalize=True)

0
0    0.81966
1    0.18034
Name: proportion, dtype: float64

In [70]:
# Same split with a lower cut-off of 0.20, which flags more providers.
flags_at_020 = pd.Series((y_test_df >= 0.20).astype(int), name=0)
flags_at_020.value_counts(normalize=True)

0
0    0.743533
1    0.256467
Name: proportion, dtype: float64