In [3]:
%pip install Faker



Collecting faker
  Downloading Faker-25.2.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-25.2.0-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
    --------------------------------------- 0.0/1.8 MB 445.2 kB/s eta 0:00:04
    --------------------------------------- 0.0/1.8 MB 281.8 kB/s eta 0:00:07
   -- ------------------------------------- 0.1/1.8 MB 479.1 kB/s eta 0:00:04
   -- ------------------------------------- 0.1/1.8 MB 547.6 kB/s eta 0:00:04
   -- ------------------------------------- 0.1/1.8 MB 481.4 kB/s eta 0:00:04
   --- ------------------------------------ 0.2/1.8 MB 484.3 kB/s eta 0:00:04
   ---- ----------------------------------- 0.2/1.8 MB 513.4 kB/s eta 0:00:04
   ------ --------------------------------- 0.3/1.8 MB 682.7 kB/s eta 0:00:03
   ------ --------------------------------- 0.3/1.8 MB 705.4 kB/s eta 0:00:03
   ---------- -----------------

In [4]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

fake = Faker()

# Number of samples
n_samples = 57592

# Generate random data
data = []

for _ in range(n_samples):
    dob = fake.date_of_birth(minimum_age=18, maximum_age=80).strftime("%Y-%m-%d")
    dob_date = datetime.strptime(dob, "%Y-%m-%d")
    
    date_confirmed_hiv_positive = fake.date_between(start_date=dob_date + timedelta(days=6570), end_date='today').strftime("%Y-%m-%d")  # 6570 days = 18 years
    date_confirmed_hiv_positive_date = datetime.strptime(date_confirmed_hiv_positive, "%Y-%m-%d")

    start_date_current_art = fake.date_between(start_date=date_confirmed_hiv_positive_date, end_date='today').strftime("%Y-%m-%d")
    start_date_current_art_date = datetime.strptime(start_date_current_art, "%Y-%m-%d")

    date_first_visit = fake.date_between(start_date=dob_date + timedelta(days=6570), end_date='today').strftime("%Y-%m-%d")
    date_first_visit_date = datetime.strptime(date_first_visit, "%Y-%m-%d")

    date_most_recent_visit = fake.date_between(start_date=date_first_visit_date, end_date='today').strftime("%Y-%m-%d")

    date_of_exit = fake.date_between(start_date=date_first_visit_date, end_date='today').strftime("%Y-%m-%d")

    data.append([
        fake.unique.uuid4(),  # Patient ID
        dob,
        random.choice(['Male', 'Female']),
        date_confirmed_hiv_positive,
        random.choice(['Yes', 'No']),
        random.choice(['Regimen A', 'Regimen B', 'Regimen C']),
        start_date_current_art,
        date_first_visit,
        date_most_recent_visit,
        random.randint(200, 1500),  # CD4 count at first visit
        random.randint(20, 200000),  # Viral load at first visit
        random.randint(200, 1500),  # CD4 count at most recent visit
        random.randint(20, 200000),  # Viral load at most recent visit
        (date_first_visit_date - dob_date).days // 365,  # Age at first visit
        random.choice(['Employed', 'Unemployed', 'Student', 'Retired', 'Other']),
        random.choice(['Primary education', 'Secondary education', 'Tertiary education', 'Other']),
        random.choice(['Low', 'Medium', 'High', 'Prefer not to say']),
        random.choice(['Single', 'Married', 'Divorced', 'Widowed', 'Other']),
        random.choice(['None', 'Alcohol', 'Tobacco', 'Illicit drugs', 'Other']),
        random.choice(['None', 'Diabetes', 'Hypertension', 'Tuberculosis', 'Hepatitis B/C', 'Cardiovascular diseases', 'Other']),
        random.choice(['None', 'Fever', 'Weight loss', 'Night sweats', 'Cough', 'Other']),
        random.choice(['Healthy', 'Average', 'Poor']),
        random.choice(['Regular', 'Occasional', 'None']),
        random.choice(['Always', 'Often', 'Sometimes', 'Rarely', 'Never']),
        random.randint(0, 30),  # Missed doses in the last month
        random.choice(['Yes', 'No']),
        (datetime.strptime(date_most_recent_visit, "%Y-%m-%d") - date_first_visit_date).days,  # Duration of followups in days
        date_of_exit,
        random.choice(['Transferred out', 'Died', 'Lost to followups', 'Other'])
    ])

In [5]:
# Create DataFrame
data = pd.DataFrame(data, columns=[
    'Patient ID', 'Date of birth', 'Gender', 'Date confirmed HIV positive',
    'Previous ART exposure', 'Current ART regimen', 'Start date of current ART',
    'Date of first visit', 'Date of most recent visit', 'CD4 count at first visit',
    'Viral load at first visit', 'CD4 count at most recent visit', 'Viral load at most recent visit',
    'Age at first visit', 'Employment status', 'Education level', 'Income level', 'Marital status',
    'Substance use history', 'Comorbidities', 'Reported symptoms', 'Dietary habits', 'Physical activity',
    'Adherence to ART', 'Missed doses in the last month', 'Adverse event', 'Duration of followups',
    'Date of exit from the study', 'Reason for exit'
])

In [6]:
data.head()

Unnamed: 0,Patient ID,Date of birth,Gender,Date confirmed HIV positive,Previous ART exposure,Current ART regimen,Start date of current ART,Date of first visit,Date of most recent visit,CD4 count at first visit,...,Comorbidities,Reported symptoms,Dietary habits,Physical activity,Adherence to ART,Missed doses in the last month,Adverse event,Duration of followups,Date of exit from the study,Reason for exit
0,bacdb318-38ed-4bf1-ac9a-104183b7faa9,1955-10-06,Male,2020-04-12,No,Regimen C,2020-12-17,2004-07-06,2019-06-19,372,...,Diabetes,Fever,Average,Regular,Rarely,30,Yes,5461,2022-08-04,Other
1,95e84dde-ab5b-468f-afa3-94d4687aa71d,1994-07-01,Female,2017-10-10,Yes,Regimen A,2019-04-16,2018-05-08,2020-03-31,1430,...,Hypertension,Other,Average,,Sometimes,16,No,693,2020-02-19,Died
2,6a0f9de6-50bd-41f3-8681-72dd7b657a61,1980-01-23,Male,2008-02-09,No,Regimen C,2020-10-26,2022-05-11,2023-03-02,1315,...,Hypertension,,Healthy,Regular,Always,7,Yes,295,2022-11-26,Lost to followups
3,356b799b-cb8a-4f4c-a250-441b76273150,1961-12-07,Male,2012-01-24,Yes,Regimen A,2017-08-29,2009-06-23,2021-08-04,848,...,Diabetes,,Poor,Regular,Often,24,No,4425,2022-08-04,Transferred out
4,5e9c4ec5-e7ed-4ef7-b944-a27a18ad4dff,2005-01-09,Male,2023-03-22,No,Regimen C,2024-05-17,2024-03-31,2024-04-30,1031,...,Diabetes,Other,Average,,Never,8,Yes,30,2024-04-22,Transferred out


In [7]:
#Convert to csv file
data.to_csv('Synthetic_data.csv', index=False)