In [2]:
import pandas as pd
import numpy as np
import random
!pip install faker
from faker import Faker





In [3]:
fake = Faker()

np.random.seed(42)
random.seed(42)

num_records = 10000

In [4]:
medication = [
    'Atorvastatin', 'Levothyroxine', 'Lisinopril', 'Metformin', 'Amlodipine',
    'Metoprolol', 'Omeprazole', 'Simvastatin', 'Losartan', 'Albuterol',
    'Gabapentin', 'Hydrochlorothiazide', 'Sertraline', 'Furosemide', 'Fluticasone',
    'Amoxicillin', 'Prednisone', 'Montelukast', 'Pantoprazole', 'Escitalopram'
    ]

In [6]:
def age_distribution():
    age = int(np.random.normal(50, 20))
    return max(0, min(100, age))

def gender_distribution():
    return np.random.choice(['Male', 'Female', 'Other'], p=[0.47, 0.51, 0.02])

def race_distribution():
    return np.random.choice(
        ['White', 'Black or African American', 'Asian', 'Hispanic or Latino', 'Other'],
        p=[0.6, 0.13, 0.06, 0.18, 0.03]
    )

In [7]:
def length_of_stay():
    return int(np.random.exponential(3))

def icd10_code():
    common_diagnoses = ['E11.9', 'I10', 'J06.9', 'M54.5', 'N39.0', 'R10.9']
    return np.random.choice(common_diagnoses)

def cpt_code():
    common_procedures = ['99213', '99214', '20610', '93000', '71020', '73630']
    return np.random.choice(common_procedures)

def insurance_distribution():
    return np.random.choice(['Private', 'Medicare', 'Medicaid', 'Uninsured'], p=[0.4, 0.3, 0.2, 0.1])

In [8]:
def is_readmitted_distribution():
    return np.random.choice([0, 1], p=[0.8, 0.2])

def total_cost_distribution():
    cost = np.random.normal(8000, 3000)
    return max(500, min(15000, cost))

def patient_satisfaction_score_distribution():
    return np.random.choice(['1','2', '3', '4', '5'], p=[0.053,0.08, 0.107, 0.39, 0.37])

def surgery_performed_distribution():
    return np.random.choice(['Yes', 'No'], p=[0.38, 0.62])

def surgery_level_distribution(surgery_performed):
    if surgery_performed == 'No':
        return 'None'
    else:
        return np.random.choice(['Minimal','Mild', 'Moderate', 'Major', 'Critical'], p=[0.37,0.29, 0.21, 0.09, 0.04])

def insurance_coverage_percentage_distribution():
    coverage = np.random.normal(70, 15)
    return max(0, min(100, coverage))

In [9]:
def generate_vital_signs():
    bp_systolic = int(np.random.normal(120, 15))
    bp_diastolic = int(np.random.normal(80, 10))
    heart_rate = int(np.random.normal(70, 10))
    temperature = round(np.random.normal(98.6, 0.7), 1)
    return {
        'Blood Pressure': f"{bp_systolic}/{bp_diastolic}",
        'Heart Rate': heart_rate,
        'Temperature': temperature
    }

In [10]:
def generate_lab_results():
    glucose = int(np.random.normal(100, 20))
    cholesterol = int(np.random.normal(200, 40))
    return {
        'Blood Glucose': glucose,
        'Cholesterol': cholesterol
    }

In [11]:
def hospital_department():
    return np.random.choice(['ER', 'Cardiology', 'Oncology', 'Pediatrics', 'Orthopedics'], p=[0.31, 0.21, 0.08, 0.2, 0.2])

def source_system():
    return np.random.choice(['System A', 'System B', 'System C'], p=[0.4, 0.3, 0.3])

In [12]:
# Generating data

data = {
    'Patient ID': [fake.unique.uuid4() for _ in range(num_records)],
    'Age': [age_distribution() for _ in range(num_records)],
    'Gender': [gender_distribution() for _ in range(num_records)],
    'Race': [race_distribution() for _ in range(num_records)],
    'Primary Diagnosis': [icd10_code() for _ in range(num_records)],
    'Secondary Diagnosis 1': [icd10_code() if np.random.rand() > 0.7 else None for _ in range(num_records)],
    'Secondary Diagnosis 2': [icd10_code() if np.random.rand() > 0.85 else None for _ in range(num_records)],
    'Medication': [np.random.choice(medication) for _ in range(num_records)],
    'Procedure Code': [cpt_code() for _ in range(num_records)],
    'Visit Date': [fake.date_between(start_date='-2y', end_date='today') for _ in range(num_records)],
    'Discharge Date': [fake.date_between(start_date='-2y', end_date='today') if np.random.rand() > 0.5 else None for _ in range(num_records)],
    'Length of Stay': [length_of_stay() for _ in range(num_records)],
    'Insurance Type': [insurance_distribution() for _ in range(num_records)],
    'Physician ID': [fake.unique.uuid4() for _ in range(num_records)],
    'Hospital Department': [hospital_department() for _ in range(num_records)],
    'Source System': [source_system() for _ in range(num_records)],
    'Readmission': [is_readmitted_distribution() for _ in range(num_records)],
    'Total Cost': [total_cost_distribution() for _ in range(num_records)],
    'Patient Satisfaction Score': [patient_satisfaction_score_distribution() for _ in range(num_records)],
    'Surgery Performed': [surgery_performed_distribution() for _ in range(num_records)],
    'Surgery Level': [surgery_level_distribution(surgery_performed_distribution()) for _ in range(num_records)],
    'Insurance Coverage Percentage': [insurance_coverage_percentage_distribution() for _ in range(num_records)]
}


In [14]:
data['Age'] = [
    age_distribution() if dept != 'Pediatrics' else (age_distribution() if np.random.rand() > 0.03 else np.random.randint(18, 100))
    for dept in data['Hospital Department']
]

In [15]:
df_refine = pd.DataFrame(data)

df_refine['Blood Pressure'] = [generate_vital_signs()['Blood Pressure'] for _ in range(num_records)]
df_refine['Heart Rate'] = [generate_vital_signs()['Heart Rate'] for _ in range(num_records)]
df_refine['Temperature'] = [generate_vital_signs()['Temperature'] for _ in range(num_records)]
df_refine['Blood Glucose'] = [generate_lab_results()['Blood Glucose'] for _ in range(num_records)]
df_refine['Cholesterol'] = [generate_lab_results()['Cholesterol'] for _ in range(num_records)]

In [17]:
df_refine.to_csv('emr_data.csv', index=False)

print("DataSet got generated")

DataSet got generated


In [18]:
df = pd.read_csv('emr_data.csv')
df.head()


Unnamed: 0,Patient ID,Age,Gender,Race,Primary Diagnosis,Secondary Diagnosis 1,Secondary Diagnosis 2,Medication,Procedure Code,Visit Date,...,Total Cost,Patient Satisfaction Score,Surgery Performed,Surgery Level,Insurance Coverage Percentage,Blood Pressure,Heart Rate,Temperature,Blood Glucose,Cholesterol
0,353d7764-2364-46e3-95af-dc55f8655820,87,Male,White,M54.5,R10.9,,Fluticasone,71020,2024-10-13,...,6709.010957,5,Yes,,40.222393,103/71,91,98.0,106,222
1,666c4fd7-1f2e-40c0-b9c2-3126d7e2dc91,63,Male,White,M54.5,,,Metformin,93000,2024-05-12,...,5245.663399,4,No,Major,56.67662,120/79,67,99.6,134,158
2,d005e886-8324-4914-8a06-ed382c058c6e,61,Male,Other,E11.9,,,Metformin,99214,2024-08-06,...,4176.932553,2,Yes,Minimal,68.859316,127/77,79,98.5,88,192
3,d0c95952-61f6-4969-982c-f6030240335b,63,Male,Asian,J06.9,E11.9,,Amlodipine,73630,2024-06-03,...,5074.307413,5,No,Mild,100.0,93/70,62,98.4,92,253
4,257fc98c-5aea-47ef-ab93-3d80d2234e2f,50,Female,White,R10.9,,,Fluticasone,93000,2024-11-19,...,5143.343855,4,Yes,,51.776883,125/56,48,98.3,105,203


In [19]:
from google.colab import files
files.download('emr_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>