In [12]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the draws below repeat from run to run.
np.random.seed(42)

first_names = [
    'John', 'Jane', 'Robert', 'Emily', 'Michael',
    'Sarah', 'David', 'Laura', 'Chris', 'Jessica',
]
last_names = [
    'Doe', 'Smith', 'Brown', 'Davis', 'Wilson',
    'Johnson', 'Martinez', 'Taylor', 'Anderson', 'Thomas',
]

# Pool of every "First Last" pairing (10 x 10 = 100 names), then 1000 draws
# with replacement, so the same full name can occur many times.
names = np.array([' '.join((first, last)) for first in first_names for last in last_names])
names = np.random.choice(names, size=1000, replace=True)

# Patients Table
# NOTE: the random columns are drawn in dict order (Age, Gender, Diagnosis,
# BloodType); reordering them would change the seeded output.
genders = ['Male', 'Female', 'Other']
diagnoses = ['Flu', 'Diabetes', 'Hypertension', 'Asthma', 'Fracture']
blood_types = ['A+', 'A-', 'B+', 'B-', 'AB+', 'AB-', 'O+', 'O-']

patients_df = pd.DataFrame({
    'PatientID': np.arange(1, 1001),
    'Name': names,
    # Ages 1-99, stored as float (a float column can hold the NaN gaps added below).
    'Age': np.random.randint(1, 100, size=1000).astype(float),
    'Gender': np.random.choice(genders, size=1000),
    'Diagnosis': np.random.choice(diagnoses, size=1000),
    'BloodType': np.random.choice(blood_types, size=1000),
    # One admission per consecutive calendar day, formatted as YYYY-MM-DD text.
    'AdmissionDate': pd.date_range(start='2022-01-01', periods=1000).strftime('%Y-%m-%d'),
})

# Discharge happens 1-20 days after admission; computed before any dates are blanked.
admitted_on = pd.to_datetime(patients_df['AdmissionDate'])
stay_length = pd.to_timedelta(np.random.randint(1, 21, size=1000), unit='D')
patients_df['DischargeDate'] = (admitted_on + stay_length).dt.strftime('%Y-%m-%d')

# Missing Values
# Text columns: 30 distinct rows per column are overwritten with an empty string.
for col in ('Name', 'Diagnosis', 'AdmissionDate', 'DischargeDate'):
    blank_rows = np.random.choice(patients_df.index, 30, replace=False)
    patients_df.loc[blank_rows, col] = ""

# Numeric column: 25 distinct rows lose their Age (NaN).
missing_age_rows = np.random.choice(patients_df.index, 25, replace=False)
patients_df.loc[missing_age_rows, 'Age'] = np.nan

# Typing mistakes for user error
def introduce_typos(data, error_rate=0.05):
    """Return a copy of ``data`` with one-character typos in a random sample of entries.

    ``int(len(data) * error_rate)`` distinct positions are sampled with
    NumPy's global RNG. At each sampled position, a string longer than three
    characters has one randomly chosen character replaced by a random ASCII
    letter. Shorter strings and non-string values (e.g. NaN) are left as they
    are, and the replacement letter may happen to equal the original one, so
    the number of visibly changed entries can be lower than the sample size.

    Args:
        data: Indexable sequence of values providing ``.copy()`` (NumPy array,
            list, pandas Series). It is not modified.
        error_rate: Fraction of positions to sample for corruption.

    Returns:
        A copy of ``data`` (same container type) with the typos applied.
    """
    alphabet = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # Work on a copy: the input may be a read-only view (``Series.values``
    # under pandas Copy-on-Write) and callers should not see their own data
    # change behind their back.
    data = data.copy()

    sample_size = int(len(data) * error_rate)
    for i in np.random.choice(len(data), sample_size, replace=False):
        value = data[i]
        if isinstance(value, str) and len(value) > 3:
            pos = np.random.randint(0, len(value))
            typo_char = np.random.choice(alphabet)
            data[i] = value[:pos] + typo_char + value[pos + 1:]
    return data

# Corrupt a sample of the free-text columns, Name first and then Diagnosis
# (the order matters for the seeded random stream).
for text_col in ('Name', 'Diagnosis'):
    patients_df[text_col] = introduce_typos(patients_df[text_col].values)

# Duplicate rows
# 70 distinct rows taken from the first 930 positions are appended again
# under fresh PatientIDs (1001 onwards), so they match an existing row in
# every column except the ID.
dup_positions = np.random.choice(930, 70, replace=False)
duplicate_patients = patients_df.iloc[dup_positions].copy()
duplicate_patients['PatientID'] = 1001 + np.arange(len(duplicate_patients))
patients_df = pd.concat([patients_df, duplicate_patients], ignore_index=True)

# Hospitals Table
# 200 hospital rows drawn from four possible names; every row sharing a name
# also shares one randomly chosen founding year.
hospital_names = np.random.choice(['City Hospital', 'Green Valley Clinic', 'Sunrise Medical Center', 'Blue Cross Hospital'], size=200)

# Sort the distinct names before drawing years: a set of strings iterates in
# hash order, which changes between interpreter runs, so without sorting the
# name -> year mapping would not be reproducible even with a fixed seed.
unique_hospitals = sorted(set(hospital_names))
established_year_dict = {name: np.random.randint(1900, 2022) for name in unique_hospitals}

hospitals_df = pd.DataFrame({
    'HospitalID': np.arange(1, 201),
    'HospitalName': hospital_names,
    'City': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami'], size=200),
    # Stored as float; a float column can hold the NaN gaps injected below.
    'Capacity': np.random.randint(50, 500, size=200).astype(float),
    'Specialization': np.random.choice(['Cardiology', 'Neurology', 'Orthopedics', 'Pediatrics', 'Oncology'], size=200),
    'NumOfDoctors': np.random.randint(10, 200, size=200).astype(int),
    'EstablishedYear': [established_year_dict[name] for name in hospital_names]
})

# Missing values
# Text columns: 10 distinct rows per column are overwritten with an empty string.
for col in ['HospitalName', 'City']:
    hospitals_df.loc[np.random.choice(hospitals_df.index, 10, replace=False), col] = ""

# Numeric column: 5 distinct rows lose their Capacity (NaN).
hospitals_df.loc[np.random.choice(hospitals_df.index, 5, replace=False), 'Capacity'] = np.nan

# Typing mistakes for user error
# Only the hospital name column receives simulated typos.
hospital_name_values = hospitals_df['HospitalName'].values
hospitals_df['HospitalName'] = introduce_typos(hospital_name_values)

# Duplicate rows
# 20 distinct rows are appended again under fresh HospitalIDs (201 onwards),
# so they match an existing row in every column except the ID.
dup_positions = np.random.choice(200, 20, replace=False)
duplicate_hospitals = hospitals_df.iloc[dup_positions].copy()
duplicate_hospitals['HospitalID'] = 201 + np.arange(len(duplicate_hospitals))
hospitals_df = pd.concat([hospitals_df, duplicate_hospitals], ignore_index=True)

# Appointments Table
# 300 appointments, one per consecutive calendar day starting 2020-01-01,
# each pointing at a random patient and a random hospital.
appointments_df = pd.DataFrame({
    'AppointmentID': np.arange(1, 301),
    # randint's upper bound is exclusive: 1001 covers PatientID 1-1000.
    'PatientID': np.random.randint(1, 1001, size=300),
    # Likewise 201 covers HospitalID 1-200, the IDs assigned when the
    # hospitals table is first built (previously only 1-99 could be drawn).
    'HospitalID': np.random.randint(1, 201, size=300),
    'Date': pd.date_range(start='2020-01-01', periods=300).strftime('%Y-%m-%d'),
    'Status': np.random.choice(['Scheduled', 'Completed', 'Cancelled'], size=300),
    # Stored as float; a float column can hold the NaN gaps injected below.
    'BillAmount': np.random.randint(50, 5000, size=300).astype(float),
    'PatientComplaint': np.random.choice(['Fever', 'Headache', 'Cough', 'Back Pain', 'Fatigue', 'Nausea', 'Dizziness', 'Shortness of Breath'], size=300),
    'DurationMinutes': np.random.randint(10, 120, size=300).astype(int)
})

# Missing values
# Text columns: 15 distinct rows per column are overwritten with an empty string.
for col in ['Date', 'PatientComplaint']:
    appointments_df.loc[np.random.choice(appointments_df.index, 15, replace=False), col] = ""

# Numeric column: 10 distinct rows lose their BillAmount (NaN).
appointments_df.loc[np.random.choice(appointments_df.index, 10, replace=False), 'BillAmount'] = np.nan

# Duplicate rows
# 30 distinct rows are appended again under fresh AppointmentIDs (301 onwards),
# so they match an existing row in every column except the ID.
duplicate_appointments = appointments_df.iloc[np.random.choice(300, 30, replace=False)].copy()
duplicate_appointments['AppointmentID'] = np.arange(301, 301 + len(duplicate_appointments))
appointments_df = pd.concat([appointments_df, duplicate_appointments], ignore_index=True)

# Outputting CSV
# Each table goes to its own file in the current working directory, without
# the DataFrame index column.
for frame, csv_name in (
    (patients_df, 'Patients.csv'),
    (hospitals_df, 'Hospitals.csv'),
    (appointments_df, 'Appointments.csv'),
):
    frame.to_csv(csv_name, index=False)

# Report the names of the files that were written.
print("Patients.csv, Hospitals.csv, Appointments.csv")


Patients.csv, Hospitals.csv, Appointments.csv
