In [3]:
import pandas as pd

# Load the dataset from a path relative to the working directory
# (place the CSV in the same folder as this notebook). A machine-specific
# absolute path would break the notebook for anyone else who runs it.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like
# a saved index in the CSV — consider index_col=0 if it is not needed downstream.
df = pd.read_csv("enhanced_health_insurance_claims.csv")

# View first few rows
df.head()

Unnamed: 0,ClaimID,PatientID,ProviderID,ClaimAmount,ClaimDate,DiagnosisCode,ProcedureCode,PatientAge,PatientGender,ProviderSpecialty,ClaimStatus,PatientIncome,PatientMaritalStatus,PatientEmploymentStatus,ProviderLocation,ClaimType,ClaimSubmissionMethod
0,10944daf-f7d5-4e1d-8216-72ffa609fe41,8552381d-7960-4f64-b190-b20b8ada00a1,4a4cb19c-4863-41cf-84b0-c2b21aace988,3807.95,2024-06-07,yy006,hd662,16,M,Cardiology,Pending,90279.43,Married,Retired,Jameshaven,Routine,Paper
1,fcbebb25-fc24-4c0f-a966-749edcf83fb1,327f43ad-e3bd-4473-a9ed-46483a0a156f,422e02dd-c1fd-43dd-8af4-0c3523f997b1,9512.07,2023-05-30,tD052,mH831,27,M,Pediatrics,Approved,130448.02,Single,Student,Beltrantown,Routine,Online
2,9e9983e7-9ea7-45f5-84d8-ce49ccd8a4a1,6f3acdf7-73aa-4afa-9c2e-b25b27bdb5b0,f7733b3f-0980-47b5-a7a0-ee390869355b,7346.74,2022-09-27,zx832,dg637,40,F,Cardiology,Pending,82417.54,Divorced,Employed,West Charlesport,Emergency,Online
3,a06273ed-44bb-452b-bbad-8618de080494,5d58e183-701e-406c-a8c6-5b73cac5e912,f7a04581-de96-44ee-b773-8adac02baa59,6026.72,2023-06-25,kr421,kG326,65,M,Neurology,Pending,68516.96,Widowed,Student,West Aprilhaven,Routine,Phone
4,f702a717-254b-4cff-a0c7-8395db2f6616,8a8ebdf6-3af0-4f14-82f3-37b937c3d270,b80b9e77-97f0-47d7-b561-19f9658a7bdf,1644.58,2023-07-24,LZ261,cx805,24,M,General Practice,Pending,84122.17,Married,Student,Lake Michele,Inpatient,Phone


In [5]:
# Convert ClaimDate to datetime for validation. Unparseable values become NaT
# (errors='coerce') instead of raising; they are flagged as errors below.
df['ClaimDate'] = pd.to_datetime(df['ClaimDate'], errors='coerce')

# Create error flags.
# Comparisons against NaN/NaT evaluate to False, so a missing or unparseable
# value would silently pass a pure range check. Each numeric/date flag therefore
# also tests isna() so that missing data is reported as an error.
df['Error_ClaimAmount'] = df['ClaimAmount'].isna() | (df['ClaimAmount'] <= 0)
df['Error_PatientAge'] = (
    df['PatientAge'].isna() | (df['PatientAge'] < 0) | (df['PatientAge'] > 100)
)
# isin() is already False for missing values, so NaN is flagged by the negation.
df['Error_PatientGender'] = ~df['PatientGender'].isin(['M', 'F'])
df['Error_ClaimStatus'] = ~df['ClaimStatus'].isin(['Approved', 'Denied', 'Pending'])
df['Error_PatientIncome'] = df['PatientIncome'].isna() | (df['PatientIncome'] <= 0)
# Valid claim dates lie between 2010-01-01 and the moment this cell runs;
# NaT (missing or coerced-invalid date) is an error as well.
df['Error_ClaimDate'] = (
    df['ClaimDate'].isna()
    | (df['ClaimDate'] < pd.Timestamp('2010-01-01'))
    | (df['ClaimDate'] > pd.Timestamp.today())
)

# Flag rows with any error. Only the 'Error_' prefixed columns are collected, so
# 'HasError' itself is never included when this cell is re-run.
error_columns = [col for col in df.columns if col.startswith('Error_')]
df['HasError'] = df[error_columns].any(axis=1)

# Show some rows with errors
df[df['HasError']].head(10)

Unnamed: 0,ClaimID,PatientID,ProviderID,ClaimAmount,ClaimDate,DiagnosisCode,ProcedureCode,PatientAge,PatientGender,ProviderSpecialty,...,ProviderLocation,ClaimType,ClaimSubmissionMethod,Error_ClaimAmount,Error_PatientAge,Error_PatientGender,Error_ClaimStatus,Error_PatientIncome,Error_ClaimDate,HasError


In [7]:
# Count how many rows tripped each validation flag (True sums as 1).
df.loc[:, error_columns].sum()

Error_ClaimAmount      0
Error_PatientAge       0
Error_PatientGender    0
Error_ClaimStatus      0
Error_PatientIncome    0
Error_ClaimDate        0
dtype: int64

In [9]:
# Write the flagged frame to the current working directory without the
# DataFrame index, then confirm the file name that was written.
output_file = "claims_with_error_flags.csv"
df.to_csv(output_file, index=False)
print(f"✅ File saved as {output_file}")

✅ File saved as claims_with_error_flags.csv
