In [6]:
!pip install faker


Collecting faker
  Downloading faker-37.4.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.0-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.9 MB ? eta -:--:--
   ---------- ----------------------------- 0.5/1.9 MB 1.7 MB/s eta 0:00:01
   ---------------- ----------------------- 0.8/1.9 MB 1.8 MB/s eta 0:00:01
   -------------------------- ------------- 1.3/1.9 MB 1.9 MB/s eta 0:00:01
   ------------------------------------- -- 1.8/1.9 MB 2.1 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 1.9 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.4.0


In [29]:
import pandas as pd
import random
from faker import Faker
import numpy as np
from datetime import timedelta

In [12]:
# Setup
# Seed every random source the notebook draws from so a fresh
# "Restart Kernel -> Run All" regenerates the same values.
SEED = 42

fake = Faker()
random.seed(SEED)
np.random.seed(SEED)
# Faker draws from its own shared RNG, so seeding `random` / `np.random`
# alone leaves fake.name() and fake.date_between() different on every run.
# Note: relative ranges such as '-2y' / 'today' still depend on the run date.
Faker.seed(SEED)

# Create the patients CSV

In [14]:
# Constants
NUM_PATIENTS = 10000
genders = ["Male", "Female", "Other"]
regions = ["North America", "South America", "Europe", "Asia", "Africa", "Australia"]
chronic_conditions_list = [
    "Diabetes", "Hypertension", "COPD", "Heart Disease", 
    "Asthma", "Kidney Disease", "Cancer", "Obesity", 
    "Depression", "Arthritis", "None"
]

# "None" is only a placeholder for patients without any chronic condition,
# so it must never be sampled together with real conditions. The filtered
# list is loop-invariant and is therefore built once, up front.
real_conditions = [c for c in chronic_conditions_list if c != "None"]

# Generate data: one row per patient, each with 0-3 distinct chronic conditions
data = []
for pid in range(1, NUM_PATIENTS + 1):
    gender = random.choice(genders)
    age = random.randint(18, 90)
    region = random.choice(regions)
    # Number of conditions, weighted towards 1-2. int() hands random.sample a
    # plain Python int instead of a numpy.int64.
    count = int(np.random.choice([0, 1, 2, 3], p=[0.2, 0.4, 0.3, 0.1]))
    conditions = random.sample(real_conditions, count) if count else ["None"]
    # Multiple conditions are stored as a single comma-separated string
    chronic = ",".join(conditions)
    data.append([pid, gender, age, region, chronic])

# Save to CSV (written once; the original cell repeated this step twice)
df = pd.DataFrame(data, columns=["patient_id", "gender", "age", "region", "chronic_conditions"])
df.to_csv("patients.csv", index=False)
print("✅ patients.csv generated successfully!")


# Create the doctors CSV

In [22]:
# Constants
NUM_DOCTORS = 500
departments = [
    "Cardiology",
    "Neurology",
    "Orthopedics",
    "Oncology",
    "Pediatrics",
    "General Medicine",
    "Endocrinology",
    "Gastroenterology",
    "Urology",
    "Psychiatry",
]

# One [id, name, department, years of experience] record per doctor.
# IDs run 1..NUM_DOCTORS; experience is drawn uniformly from 1-40 years.
doctors = [
    [doc_id, fake.name(), random.choice(departments), random.randint(1, 40)]
    for doc_id in range(1, NUM_DOCTORS + 1)
]

# Assemble the table and write it out without the pandas index column
doctor_columns = ["doctor_id", "name", "department", "experience_years"]
df_doctors = pd.DataFrame(doctors, columns=doctor_columns)
df_doctors.to_csv("doctors.csv", index=False)

print("✅ doctors.csv has been created successfully.")

✅ doctors.csv has been created successfully.


# Create the admissions CSV

In [46]:
# Constants
NUM_ADMISSIONS = 25000
MAX_STAY_DAYS = 30                    # Longest simulated hospital stay, in days
patient_ids = list(range(1, 10001))   # Assuming 10,000 patients
doctor_ids = list(range(1, 501))      # Assuming 500 doctors
# NOTE(review): the department is drawn independently of the assigned doctor,
# and this list differs from the one used for doctors.csv — confirm intended.
departments = ["Cardiology", "Neurology", "Oncology", "Orthopedics", "Pediatrics", "Dermatology"]

# Generate admissions data: one row per admission
admissions = []

for admission_id in range(1, NUM_ADMISSIONS + 1):
    patient_id = random.choice(patient_ids)
    doctor_id = random.choice(doctor_ids)
    # Admissions end 30 days before today, so adding at most MAX_STAY_DAYS
    # can never produce a discharge date in the future.
    admission_date = fake.date_between(start_date='-2y', end_date='-30d')
    # Bound the length of stay. Drawing the discharge anywhere between the
    # admission and today allowed stays of up to roughly two years.
    stay_days = random.randint(0, MAX_STAY_DAYS)
    discharge_date = admission_date + timedelta(days=stay_days)
    department = random.choice(departments)
    readmitted = random.choices([0, 1], weights=[85, 15])[0]  # Use 0/1 instead of True/False

    admissions.append([
        admission_id,
        patient_id,
        doctor_id,
        admission_date,
        discharge_date,
        department,
        readmitted
    ])

# Create DataFrame and export to CSV
df_admissions = pd.DataFrame(admissions, columns=[
    "admission_id",
    "patient_id",
    "doctor_id",
    "admission_date",
    "discharge_date",
    "department",
    "readmitted_within_30_days"
])

df_admissions.to_csv("admissions.csv", index=False)
print("✅ admissions.csv regenerated successfully.")

✅ admissions.csv regenerated successfully.


# Create the diagnoses CSV

In [41]:
# Constants
NUM_DIAGNOSES = 50000
admission_ids = list(range(1, 25001))  # IDs 1..25,000, one per admission

# Pool of (ICD-10 code, description) pairs that diagnoses are drawn from
icd10_codes = [
    ("E11.9", "Type 2 diabetes mellitus"),
    ("I10", "Essential hypertension"),
    ("J45.909", "Unspecified asthma"),
    ("C50.919", "Malignant neoplasm of breast"),
    ("M54.5", "Low back pain"),
    ("F32.9", "Major depressive disorder"),
    ("N18.9", "Chronic kidney disease"),
    ("R10.9", "Unspecified abdominal pain"),
    ("J44.9", "Chronic obstructive pulmonary disease"),
    ("E66.9", "Obesity, unspecified"),
    ("G40.909", "Epilepsy, unspecified"),
]

# One row per diagnosis: a random admission paired with a random ICD-10 entry.
# The admission is drawn before the code, and the (code, description) pair is
# unpacked into its own two columns.
diagnoses = [
    [diag_id, random.choice(admission_ids), *random.choice(icd10_codes)]
    for diag_id in range(1, NUM_DIAGNOSES + 1)
]

# Assemble the table and write it out without the pandas index column
diagnosis_columns = ["diagnosis_id", "admission_id", "diagnosis_code", "diagnosis_description"]
df_diagnoses = pd.DataFrame(diagnoses, columns=diagnosis_columns)
df_diagnoses.to_csv("diagnoses.csv", index=False)

print("✅ diagnoses.csv created successfully.")


✅ diagnoses.csv created successfully.


# Create the labs CSV

In [44]:
# Constants
NUM_LABS = 100000
admission_ids = list(range(1, 25001))  # IDs 1..25,000, one per admission

# (low, high) bounds that each test's value is drawn uniformly from
value_ranges = {
    "Blood Glucose": (70, 200),
    "Hemoglobin": (11, 18),
    "Cholesterol": (100, 300),
    "Creatinine": (0.6, 2.0),
    "Blood Pressure": (90, 180),
    "WBC Count": (4000, 11000),
    "RBC Count": (3.5, 6.0),
    "Platelet Count": (150000, 400000),
    "Electrolytes": (130, 150),
    "Thyroid Panel": (0.4, 4.5),
}

# Available test names, in the same order as the ranges above
lab_tests = list(value_ranges)

# Generate data: one row per lab result
labs = []
for lab_id in range(1, NUM_LABS + 1):
    admission_id = random.choice(admission_ids)
    test = random.choice(lab_tests)
    low, high = value_ranges[test]
    value = round(random.uniform(low, high), 2)  # keep two decimal places
    # The test date is drawn from the last two years, independently of the admission
    test_date = fake.date_between(start_date='-2y', end_date='today')
    labs.append([lab_id, admission_id, test, value, test_date])

# Assemble the table and write it out without the pandas index column
lab_columns = ["lab_id", "admission_id", "test_name", "test_value", "test_date"]
df_labs = pd.DataFrame(labs, columns=lab_columns)
df_labs.to_csv("labs.csv", index=False)

print("✅ labs.csv generated successfully.")

✅ labs.csv generated successfully.
