In [5]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Build a synthetic healthcare-cost dataset, save it to CSV and preview it.
#
# Seed BOTH random sources for reproducibility: the categorical/numeric
# columns are drawn from NumPy's global generator, while the Date column is
# drawn from the stdlib `random` module, which has its own independent state.
np.random.seed(42)
random.seed(42)

# Define the number of records
num_records = 1000

# Define possible values
departments = ["Cardiology", "Orthopedics", "Pediatrics", "Oncology", "Emergency", "Neurology"]
procedures = ["MRI", "Surgery", "X-Ray", "Blood Test", "CT Scan", "Physical Therapy"]
hospitals = ["Hospital A", "Hospital B", "Hospital C", "Hospital D"]
outcomes = ["Recovered", "Readmitted", "Ongoing", "Transferred"]
locations = ["New York", "California", "Texas", "Florida", "Illinois"]
insurance_types = ["Private", "Public", "Uninsured"]

# Capture "now" once so every record is dated relative to the same instant
# (calling datetime.now() per record could straddle midnight mid-run).
reference_date = datetime.now()

# Generate synthetic data.
# NOTE: the order of the np.random calls below determines the generated
# values for a given seed; keep it stable to keep the dataset stable.
data = {
    "Patient_ID": [f"P{i}" for i in range(1, num_records + 1)],
    "Department": np.random.choice(departments, num_records),
    "Procedure": np.random.choice(procedures, num_records),
    "Medication_Cost": np.round(np.random.uniform(50, 500, num_records)),  # Random medication costs between $50 and $500
    "Procedure_Cost": np.round(np.random.uniform(200, 5000, num_records)),  # Random procedure costs between $200 and $5000
    "Hospital_Stay_Duration": np.random.randint(1, 15, num_records),  # Random hospital stay duration between 1 and 14 days
    "Insurance_Type": np.random.choice(insurance_types, num_records),
    "Outcome": np.random.choice(outcomes, num_records),
    "Location": np.random.choice(locations, num_records),
    # Random date within the 365 days preceding reference_date (1..365 days ago)
    "Date": [
        (reference_date - timedelta(days=random.randint(1, 365))).strftime("%Y-%m-%d")
        for _ in range(num_records)
    ],
}

# Create a DataFrame
df = pd.DataFrame(data)

# Calculate Total Cost (Medication Cost + Procedure Cost + Hospital Stay Cost)
# Assume hospital stay cost is $500 per day
df["Total_Cost"] = df["Medication_Cost"] + df["Procedure_Cost"] + (df["Hospital_Stay_Duration"] * 500)

# Add a column for Patient Age (18 to 79 inclusive; randint's upper bound is exclusive)
df["Patient_Age"] = np.random.randint(18, 80, num_records)

# Add a column for Severity Level (random severity level for each patient)
df["Severity_Level"] = np.random.choice(["Low", "Medium", "High"], num_records)

# Save the dataset to a CSV file
df.to_csv("custom_healthcare_cost_data.csv", index=False)
print("Custom healthcare cost dataset saved to custom_healthcare_cost_data.csv")

# Display the first few rows of the dataset
print(df.head())

Custom healthcare cost dataset saved to custom_healthcare_cost_data.csv
  Patient_ID  Department         Procedure  Medication_Cost  Procedure_Cost  \
0         P1    Oncology  Physical Therapy            198.0          1624.0   
1         P2   Emergency           CT Scan            262.0          3648.0   
2         P3  Pediatrics  Physical Therapy            385.0          4779.0   
3         P4   Emergency  Physical Therapy            442.0          3663.0   
4         P5   Emergency        Blood Test            245.0          4342.0   

   Hospital_Stay_Duration Insurance_Type      Outcome    Location        Date  \
0                      10         Public      Ongoing    New York  2024-08-11   
1                       5      Uninsured  Transferred       Texas  2024-09-18   
2                      13      Uninsured  Transferred  California  2024-07-14   
3                      14        Private      Ongoing     Florida  2024-05-21   
4                       7      Uninsured    Reco