In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, precision_recall_curve, auc
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import shap

In [2]:
base_path = r"C:/Users/TUKARAM/OneDrive/Desktop/Machine Learning Case Study/Tukaram_Submission/Tukaram_Submission/DataSet"
train_path = os.path.join(base_path, "Training Data")
test_path = os.path.join(base_path, "Unseen Data")

inpatient_train = pd.read_csv(os.path.join(train_path, 'Inpatientdata-1542865627584.csv'))
outpatient_train = pd.read_csv(os.path.join(train_path, 'Outpatientdata-1542865627584.csv'))
beneficiary_train = pd.read_csv(os.path.join(train_path, 'Beneficiarydata-1542865627584.csv'))
labels = pd.read_csv(os.path.join(train_path, 'Train_1542865627584.csv'))

In [3]:
inpatient_train['is_inpatient'] = 1
outpatient_train['is_inpatient'] = 0
claims_train = pd.concat([inpatient_train, outpatient_train], ignore_index=True)
claims_train = claims_train.merge(beneficiary_train, on='BeneID', how='left')
claims_train = claims_train.merge(labels, on='Provider', how='left')


In [4]:
claims_train['AdmissionDt'] = pd.to_datetime(claims_train['AdmissionDt'], errors='coerce')
claims_train['DischargeDt'] = pd.to_datetime(claims_train['DischargeDt'], errors='coerce')
claims_train['StayDuration'] = (claims_train['DischargeDt'] - claims_train['AdmissionDt']).dt.days

provider_claim_counts = claims_train.groupby('Provider').size().reset_index(name='NumClaims')
claims_train = claims_train.merge(provider_claim_counts, on='Provider', how='left')

# Drop missing or irrelevant
features = ['InscClaimAmtReimbursed', 'DeductibleAmtPaid', 'is_inpatient', 'StayDuration', 'NumClaims']
X = claims_train[features].fillna(0)
y = claims_train['PotentialFraud'].map({'Yes': 1, 'No': 0})

print("Fraud ratio:")
print(y.value_counts(normalize=True))

Fraud ratio:
PotentialFraud
0    0.618789
1    0.381211
Name: proportion, dtype: float64


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [6]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)


In [7]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [100],   
    'max_depth': [5, 10]    
}

# Use StratifiedKFold to ensure class balance and reduce to 3 folds
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Initialize model with class_weight
model = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)

# GridSearchCV with fewer parameters
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1', cv=cv, verbose=1, n_jobs=-1)

# Fit model
grid.fit(X_res, y_res)

# Print results
print("Best parameters:", grid.best_params_)


Fitting 3 folds for each of 2 candidates, totalling 6 fits
Best parameters: {'max_depth': 10, 'n_estimators': 100}


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, classification_report,
                             roc_auc_score, precision_recall_curve, auc)

# 1. Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(class_weight='balanced', random_state=42))
])

# 2. Fit the model
pipeline.fit(X_train, y_train)

# 3. Make predictions
preds = pipeline.predict(X_test)
probs = pipeline.predict_proba(X_test)[:, 1]

# 4. Evaluate
print("Confusion Matrix:\n", confusion_matrix(y_test, preds))
print("\nClassification Report:\n", classification_report(y_test, preds))
print("ROC AUC:", roc_auc_score(y_test, probs))

# Precision-Recall AUC
precision, recall, _ = precision_recall_curve(y_test, probs)
print("PR AUC:", auc(recall, precision))


Confusion Matrix:
 [[64553  4530]
 [ 5135 37425]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93     69083
           1       0.89      0.88      0.89     42560

    accuracy                           0.91    111643
   macro avg       0.91      0.91      0.91    111643
weighted avg       0.91      0.91      0.91    111643

ROC AUC: 0.9715997652967624
PR AUC: 0.9650504195996831


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier



pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(**grid.best_params_, class_weight='balanced', random_state=42))
])
pipeline.fit(X_res, y_res)


In [10]:
import joblib
joblib.dump(pipeline, "fraud_detection_model.pkl")

['fraud_detection_model.pkl']

In [11]:
import pandas as pd
import os

# Define the base path
base_path = r"C:/Users/TUKARAM/OneDrive/Desktop/Machine Learning Case Study/Tukaram_Submission/Tukaram_Submission/DataSet"
test_path = os.path.join(base_path, "Unseen Data")

# Load the unseen/test data files
inpatient_test = pd.read_csv(os.path.join(test_path, 'Inpatientdata-1542865627584.csv'))
outpatient_test = pd.read_csv(os.path.join(test_path, 'Outpatientdata-1542865627584.csv'))
beneficiary_test = pd.read_csv(os.path.join(test_path, 'Beneficiarydata-1542865627584.csv'))


In [12]:
inpatient_test['is_inpatient'] = 1
outpatient_test['is_inpatient'] = 0

claims_test = pd.concat([inpatient_test, outpatient_test], ignore_index=True)

claims_test = claims_test.merge(beneficiary_test, on='BeneID', how='left')

# Process datetime columns
claims_test['AdmissionDt'] = pd.to_datetime(claims_test['AdmissionDt'], errors='coerce')
claims_test['DischargeDt'] = pd.to_datetime(claims_test['DischargeDt'], errors='coerce')
claims_test['StayDuration'] = (claims_test['DischargeDt'] - claims_test['AdmissionDt']).dt.days

# Add claim count per provider
provider_claim_counts = claims_test.groupby('Provider').size().reset_index(name='NumClaims')
claims_test = claims_test.merge(provider_claim_counts, on='Provider', how='left')

# Extract features used in training
features = ['InscClaimAmtReimbursed', 'DeductibleAmtPaid', 'is_inpatient', 'StayDuration', 'NumClaims']
X_unseen = claims_test[features].fillna(0)


In [13]:
import joblib

# Load trained model
model = joblib.load("fraud_detection_model.pkl")

# Predict
probs = model.predict_proba(X_unseen)[:, 1]
preds = model.predict(X_unseen)

# Create and save the submission file
submission = pd.DataFrame({
    "Probability": probs,
    "Predicted_Class": preds
})
submission.to_csv("Tukaram_Submission.csv", index=False)

print("Tukaram_Submission.csv has been created successfully.")


Tukaram_Submission.csv has been created successfully.
