In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [7]:
try:
    claims = pd.read_csv("training dataset/Train-1542865627584.csv")
    inpatient = pd.read_csv("training dataset/Train_Inpatientdata-1542865627584.csv")
    outpatient = pd.read_csv("training dataset/Train_Outpatientdata-1542865627584.csv", engine='python')
    beneficiary = pd.read_csv("training dataset/Train_Beneficiarydata-1542865627584.csv")
except Exception as e:
    print(f"Error loading data: {e}")
    exit()

In [8]:

inpatient['Type'] = 'Inpatient'
outpatient['Type'] = 'Outpatient'
claims_data = pd.concat([inpatient, outpatient])

# Merge claims with beneficiary and fraud labels
merged = claims_data.merge(beneficiary, on="BeneID", how="left")
merged = merged.merge(claims, on="Provider", how="left")


merged = merged.drop(['ClaimID', 'BeneID', 'ClaimStartDt', 'ClaimEndDt',
                      'AdmissionDt', 'DOB', 'DOD', 'AttendingPhysician',
                      'OperatingPhysician', 'OtherPhysician'], axis=1, errors='ignore')

merged.fillna(0, inplace=True)

In [9]:
# Encode categorical columns
le = LabelEncoder()
for col in merged.select_dtypes(include='object').columns:
    merged[col] = le.fit_transform(merged[col].astype(str))

merged['PotentialFraud'] = merged['PotentialFraud'].astype(str)


merged['PotentialFraud'].fillna('No', inplace=True) 

print("Unique values in 'PotentialFraud' after fillna:", merged['PotentialFraud'].unique())

y = merged["PotentialFraud"].map({'Yes': 1, 'No': 0, '1': 1, '0': 0})


print("Unique values in y after mapping:", y.unique())


unmapped_fraud = merged[y.isna()]['PotentialFraud'].unique()
print("Unmapped values in 'PotentialFraud':", unmapped_fraud)


X = merged.drop("PotentialFraud", axis=1)


Unique values in 'PotentialFraud' after fillna: ['1' '0']
Unique values in y after mapping: [1 0]
Unmapped values in 'PotentialFraud': []


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))


submission = pd.DataFrame({
    "Predicted_Probability": model.predict_proba(X_test)[:,1],
    "Predicted_Class": model.predict(X_test)
})
submission.to_csv("Your_Full_Name_Submission.csv", index=False)
print("Submission file saved successfully.")

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.93      0.89     68983
           1       0.87      0.73      0.80     42660

    accuracy                           0.86    111643
   macro avg       0.86      0.83      0.84    111643
weighted avg       0.86      0.86      0.85    111643

Accuracy Score: 0.856282973406304
Submission file saved successfully.
