<a href="https://colab.research.google.com/github/ShanBore/Patient-Healthcare-Analysis/blob/main/EMR_data_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np
import random
!pip install faker
from faker import Faker

# Initialize Faker for generating synthetic data.
fake = Faker()

# Setting seed for reproducibility.
np.random.seed(42)
random.seed(42)

# Number of records.
num_records = 10000



In [14]:
# List of realistic medication names.
medication = [
    'Atorvastatin', 'Levothyroxine', 'Lisinopril', 'Metformin', 'Amlodipine',
    'Metoprolol', 'Omeprazole', 'Simvastatin', 'Losartan', 'Albuterol',
    'Gabapentin', 'Hydrochlorothiazide', 'Sertraline', 'Furosemide', 'Fluticasone',
    'Amoxicillin', 'Prednisone', 'Montelukast', 'Pantoprazole', 'Escitalopram'
    ]

In [15]:
# Redefine realistic data generation functions.
def age_distribution():
    age = int(np.random.normal(50, 20))
    return max(0, min(100, age))

def gender_distribution():
    return np.random.choice(['Male', 'Female'], p=[0.49, 0.51])

def race_distribution():
    return np.random.choice(
        ['White', 'Black or African American', 'Asian', 'Hispanic or Latino', 'Other'],
        p=[0.6, 0.13, 0.06, 0.18, 0.03]
    )

In [16]:
def length_of_stay():
    return int(np.random.exponential(3))

def icd10_code():
    common_diagnoses = ['E11.9', 'I10', 'J06.9', 'M54.5', 'N39.0', 'R10.9']
    return np.random.choice(common_diagnoses)

def cpt_code():
    common_procedures = ['99213', '99214', '20610', '93000', '71020', '73630']
    return np.random.choice(common_procedures)

def insurance_distribution():
    return np.random.choice(['Private', 'Medicare', 'Medicaid', 'Uninsured'], p=[0.4, 0.3, 0.2, 0.1])

In [17]:
def generate_vital_signs():
    bp_systolic = int(np.random.normal(120, 15))
    bp_diastolic = int(np.random.normal(80, 10))
    heart_rate = int(np.random.normal(70, 10))
    temperature = round(np.random.normal(98.6, 0.7), 1)
    return {
        'Blood Pressure': f"{bp_systolic}/{bp_diastolic}",
        'Heart Rate': heart_rate,
        'Temperature': temperature
    }

In [18]:
def generate_lab_results():
    glucose = int(np.random.normal(100, 20))
    cholesterol = int(np.random.normal(200, 40))
    return {
        'Blood Glucose': glucose,
        'Cholesterol': cholesterol
    }

In [19]:
def hospital_department():
    return np.random.choice(['ER', 'Cardiology', 'Oncology', 'Pediatrics', 'Orthopedics'], p=[0.3, 0.2, 0.1, 0.2, 0.2])

def source_system():
    return np.random.choice(['System A', 'System B', 'System C'], p=[0.4, 0.3, 0.3])

In [20]:
# Generate refined synthetic EMR data.
refined_data = {
    'Patient ID': [fake.unique.uuid4() for _ in range(num_records)],
    'Age': [age_distribution() for _ in range(num_records)],
    'Gender': [gender_distribution() for _ in range(num_records)],
    'Race': [race_distribution() for _ in range(num_records)],
    'Primary Diagnosis': [icd10_code() for _ in range(num_records)],
    'Secondary Diagnosis 1': [icd10_code() if np.random.rand() > 0.7 else None for _ in range(num_records)],
    'Secondary Diagnosis 2': [icd10_code() if np.random.rand() > 0.85 else None for _ in range(num_records)],
    'Medication': [np.random.choice(medication) for _ in range(num_records)],
    'Procedure Code': [cpt_code() for _ in range(num_records)],
    'Visit Date': [fake.date_between(start_date='-2y', end_date='today') for _ in range(num_records)],
    'Discharge Date': [fake.date_between(start_date='-2y', end_date='today') if np.random.rand() > 0.5 else None for _ in range(num_records)],
    'Length of Stay': [length_of_stay() for _ in range(num_records)],
    'Insurance Type': [insurance_distribution() for _ in range(num_records)],
    'Physician ID': [fake.unique.uuid4() for _ in range(num_records)],
    'Hospital Department': [hospital_department() for _ in range(num_records)],
    'Source System': [source_system() for _ in range(num_records)]
}

In [21]:
# Convert refined data to DataFrame.
df_refined = pd.DataFrame(refined_data)

# Extract vital signs and lab results into separate columns.
df_refined['Blood Pressure'] = [generate_vital_signs()['Blood Pressure'] for _ in range(num_records)]
df_refined['Heart Rate'] = [generate_vital_signs()['Heart Rate'] for _ in range(num_records)]
df_refined['Temperature'] = [generate_vital_signs()['Temperature'] for _ in range(num_records)]
df_refined['Blood Glucose'] = [generate_lab_results()['Blood Glucose'] for _ in range(num_records)]
df_refined['Cholesterol'] = [generate_lab_results()['Cholesterol'] for _ in range(num_records)]

In [22]:
# Save the DataFrame to CSV.
df_refined.to_csv('emr_data.csv', index=False)

print("Dataset generated and saved as 'emr_data.csv'")

Dataset generated and saved as 'emr_data.csv'


In [25]:
df = pd.read_csv('emr_data.csv')
df.head()

Unnamed: 0,Patient ID,Age,Gender,Race,Primary Diagnosis,Secondary Diagnosis 1,Secondary Diagnosis 2,Medication,Procedure Code,Visit Date,...,Length of Stay,Insurance Type,Physician ID,Hospital Department,Source System,Blood Pressure,Heart Rate,Temperature,Blood Glucose,Cholesterol
0,c44413a4-bb69-452f-b5ad-b49660c9a01a,59,Male,White,M54.5,R10.9,,Fluticasone,71020,2023-11-18,...,0,Medicare,9e5ddddb-903d-4149-84cb-4a7ede9c5273,Pediatrics,System A,135/77,71,99.8,84,209
1,4e0d8dfa-b388-4edc-95ac-ac84d7e96814,47,Male,White,M54.5,,,Metformin,93000,2023-02-06,...,9,Medicaid,9b71f95e-598d-46bb-8136-551075db6661,Cardiology,System B,112/87,71,98.2,95,129
2,6fa38999-4bb5-4772-acb1-fc7c7da59568,62,Male,Other,E11.9,,,Metformin,99214,2024-01-07,...,0,Private,2d253ba8-983e-4265-adf6-4cbf9ad0f6b0,Cardiology,System C,78/75,57,97.9,127,164
3,a707ffa3-3889-4020-9fe1-55d9a3f502e8,80,Male,Asian,J06.9,E11.9,,Amlodipine,73630,2024-07-01,...,1,Medicare,b7fdece5-d6a0-4b1d-b3bb-8890cb60e8ea,ER,System A,129/88,71,99.3,111,210
4,d2477595-55d1-4694-8c5e-50c94a816c27,45,Female,White,R10.9,,,Fluticasone,93000,2023-09-27,...,0,Private,0b0531d4-f5f3-4cdd-831e-a986dbd76fd8,Cardiology,System C,91/92,73,98.7,71,207
