In [8]:
import numpy as np
import pandas as pd
# from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
# from joblib import dump
import pickle


In [10]:
def tuned_gbm(X, y, param_grid, model_path='bestgbm_model.pkl', n_iter=5):
    """Tune, train, export and evaluate an XGBoost binary classifier.

    The data is split 80/20 into a training and a hold-out set, the features
    are standardised (scaler fitted on the training split only), and a
    randomized search with 5-fold stratified cross-validation, scored by
    ROC AUC, selects the hyperparameters. The estimator refit by the search
    on the whole training split is pickled to ``model_path`` and evaluated
    on the hold-out split.

    Args:
      X: Feature matrix (NumPy array or DataFrame), one row per sample.
      y: Binary labels aligned with X. The positive class must be encoded
        as 1, because precision and recall are computed against ``== 1``.
      param_grid: Mapping of XGBClassifier parameter names to lists of
        candidate values, handed to RandomizedSearchCV as
        ``param_distributions``. Typically contains:
            n_estimators: The number of boosted trees.
            learning_rate: The boosting learning rate (``eta``).
            max_depth: The maximum depth of each tree.
          optional:
            min_child_weight: The minimum sum of instance weights required
              in a child node.
      model_path: File the fitted model is pickled to.
      n_iter: Number of parameter settings sampled by the randomized search.

    Returns:
      A dict with the float entries 'accuracy', 'precision', 'recall' and
      'f1_score', computed on the hold-out split. A metric whose denominator
      is zero (e.g. precision when nothing is predicted positive) is
      reported as 0.0 instead of NaN.
    """

    # Helper function computing the evaluation metrics for the positive class (1)
    def metrics(y_pred, y_test):
        # Plain positional arrays: avoids pandas index alignment when the
        # labels arrive as a Series with a shuffled index.
        y_pred = np.asarray(y_pred)
        y_test = np.asarray(y_test)

        predicted_pos = y_pred == 1
        actual_pos = y_test == 1

        accuracy = float(np.mean(y_pred == y_test)) if y_test.size else 0.0
        # Precision: share of predicted positives that are truly positive.
        precision = float(np.mean(actual_pos[predicted_pos])) if predicted_pos.any() else 0.0
        # Recall: share of true positives that were predicted positive.
        recall = float(np.mean(predicted_pos[actual_pos])) if actual_pos.any() else 0.0
        # Guard the harmonic mean against 0 / 0.
        if precision + recall > 0:
            f1_score = 2 * precision * recall / (precision + recall)
        else:
            f1_score = 0.0

        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score
        }

    # Split the data into training and hold-out sets (the split is not stratified)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Scale the features; statistics come from the training split only
    # NOTE(review): the fitted scaler is not persisted with the model, so
    # inputs to the unpickled model must be standardised the same way.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Randomized hyperparameter search over an XGBoost classifier. The
    # splitter itself is passed as `cv` (rather than a one-shot generator
    # from `skf.split`), which yields the same folds.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1001)
    random_search = RandomizedSearchCV(
        xgb.XGBClassifier(), param_distributions=param_grid, n_iter=n_iter,
        scoring='roc_auc', n_jobs=4, cv=skf, verbose=3, random_state=42)
    random_search.fit(X_train, y_train)

    # With the default refit=True the search has already retrained the best
    # configuration on the full training split, so no second fit is needed.
    best_gbm = random_search.best_estimator_

    # Make predictions on the hold-out set
    y_pred = best_gbm.predict(X_test)

    # Export the model to a file
    with open(model_path, 'wb') as f:
        pickle.dump(best_gbm, f)
    print(f"Model saved as {model_path}")

    return metrics(y_pred, y_test)


In [17]:
# Read the prepared training table and discard the Provider / ClaimID
# columns (presumably identifiers rather than features).
df = pd.read_csv('final_training_set.csv').drop(columns=["Provider", "ClaimID"])

# Disabled alternatives kept for reference: label encoding and a wider
# column drop.
#df['PotentialFraud'] = df['PotentialFraud'].replace({'Yes': 1, 'No': 0})
#df = df.drop(columns=["Provider","ClaimID","BeneID", "diagnosis_descriptor", "procedures_descriptor",
#'ClaimStartDt','ClaimEndDt','AdmissionDt','DischargeDt','DOB','DOD'], axis=1)

# Separate the target from the features. `pop` removes the label column
# from `df` in place, and `X` is the very same object as `df`.
y = df.pop('PotentialFraud')
X = df

#print(y)

# # Handling Date Time
# date_format = "%m/%d/%Y"
# problems = set()
# # Convert date columns to datetime objects
# # handle the erroneous y-m-d patterns (e.g. 2009-08-01) by converting them to m/d/y format
# ymd_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
# try:
#     df['ClaimStartDt'] = pd.to_datetime(df['ClaimStartDt'], format=date_format)
#     # df['ClaimEndDt'] = pd.to_datetime(df['ClaimEndDt'], format=date_format)
#     # df['AdmissionDt'] = pd.to_datetime(df['ClaimStartDt'], format=date_format)
#     # df['DischargeDt'] = pd.to_datetime(df['ClaimEndDt'], format=date_format)
#     # df['DOB'] = pd.to_datetime(df['ClaimEndDt'], format=date_format)
#     # # df['DOD'] = pd.to_datetime(df['ClaimEndDt'], format=date_format)
# except:
#     print(df['ClaimStartDt'])

# # Calculate the number of days elapsed
# df['Claim_days_elapsed'] = (df['ClaimEndDt'] - df['ClaimStartDt']).dt.days
# df['Admission_days_elapsed'] = (df['DischargeDt'] - df['AdmissionDt']).dt.days

# # Claimer's age
# now = pd.Timestamp('now')
# df['age'] = (now - df['dob']).astype('<m8[Y]')

# # Claimer is Alive or Not boolean
# df['DOD'] = df['DOD'].fillna(0)
# df.loc[df['DOD'].notnull(), 'DOD'] = 1


# # drop redundant columns
# df = df.drop(columns=['ClaimStartDt', 'ClaimEndDt',
#              'AdmissionDt', 'DischargeDt'])

# # Convert categorical features to numerical features
# df = pd.get_dummies(
#     df, columns=['Provider'])

# print(df)

In [18]:


# Candidate hyperparameters sampled by the randomized search in tuned_gbm.
# Single-element lists pin a parameter to one fixed value.
param_grid = {'objective': ['binary:logistic'],
              'learning_rate': [0.04, 0.05, 0.06],  # so called `eta` value
              'max_depth': [4, 5, 6],
              'min_child_weight': [11],
              # `verbosity` replaces the old `silent` flag; 0 means silent.
              'verbosity': [0],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [10, 100, 1000],
              'missing': [np.nan],
              'seed': [42]}

# Main Algorithm: tune, train, export and evaluate the model
metrics_ret = tuned_gbm(X, y, param_grid)


Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [59]:
# Show the metrics dict returned by the tuned_gbm call.
print(metrics_ret)

{'accuracy': 0.8045735066237919, 'precision': 0.769321425765393, 'recall': 0.693051787188864, 'f1_score': 0.7291976963558734}
