In [1]:
# Author: Dr James K Ruffle
# Email: j.ruffle@ucl.ac.uk

import pandas as pd
import numpy as np
from random_username.generate import generate_username
import random
import glob
import datetime
import names
from faker import Faker
from tqdm import tqdm
import os
import seaborn as sns

import warnings #suppress buggy pandas warning
warnings.filterwarnings("ignore")

# simulate patient data

In [2]:
sample_size = 10
timepoints = 50

max_points_per_epoch = 1150

In [3]:
def generate_random_date(start_year, end_year):
    start_date = datetime.date(start_year, 1, 1)
    end_date = datetime.date(end_year, 12, 31)

    days_between = (end_date - start_date).days
    random_days = random.randint(0, days_between)
    
    random_date = start_date + datetime.timedelta(days=random_days)
    return random_date

In [4]:
ethnic_groups = ['Asian or Asian British','Black, Black British, Caribbean or African',
                 'Mixed or multiple ethnic groups','White','Not disclosed']

In [5]:
df = pd.DataFrame(generate_username(sample_size),columns=['username'])

df['Trial name']='MindfulAI'

#constitutional data
df['First name']=''
df['Last name']=''
df['DOB']=''
df['Address']=''
df['email']=''
df['Latitude']=0
df['Longitude']=0
df['Postcode']=''
df['Telephone number']=''
df['Sex']=''
df['Gender']=''
df['Ethnicity']=''
df['Weight']=''
df['Height']=''
df['Employment status']=''
df['Smoking status']=''
df['ETOH units per week']=''
df['Recreational drug use']=''

#Healthcare acess data
df['GP']=''
df['GP address']=''
df['GP postcode']=''
df['GP latitude']=''
df['GP longitude']=''
df['GP telephone number']=''
df['Hospital Consultant']=''
df['Hospital address']=''
df['Hospital postcode']=''
df['Hospital latitude']=0
df['Hospital longitude']=0
df['Hospital telephone number']=''

#Medical data
df['Diagnosis']=''
df['Duration of diagnosis']=0
df['Family History']=''
df['Under Community Mental Health Team']=''
df['Has had inpatient treatment']=''
df['Has had therapy']=''
df['On anxiolytic']=''
df['On anti-depressant']=''
df['On lithium']=''
df['On valproate']=''
df['Polypharmacy']=''
df['Physical disability']=''
df['Carers']=''

#start of gamification
df['Enrollment points']=1000

for i, row in tqdm(df.iterrows()):
    total_points=0
    total_attendance_points=0
    total_log_points=0
    total_check_in_points=0
    total_asked_question_points=0
    total_answered_question_points=0
    total_spoke_to_bot_points=0
    
    df.loc[i,'DOB']=np.random.choice([generate_random_date(1970, 2005),'Not disclosed'], p=[0.9,0.1])
    df.loc[i,'Sex']=np.random.choice(['Male','Female','Not disclosed'],p=[0.45,0.45,0.1])
    df.loc[i,'Gender']=np.random.choice(['Male','Female','Non-binary','Not disclosed'],p=[0.35,0.35,0.2,0.1])
    
    if df.loc[i,'Gender']=='Male':
        df.loc[i,'First name']=names.get_first_name(gender='male')
    elif df.loc[i,'Gender']=='Female':
        df.loc[i,'First name']=names.get_first_name(gender='female')
    else:
        df.loc[i,'First name']=names.get_first_name()
    df.loc[i,'Last name']=names.get_last_name()
    
    df.loc[i,'email']=df.loc[i,'First name']+'_'+df.loc[i,'Last name']+'@mindfulai.com'
    
    df.loc[i,'Ethnicity']=np.random.choice(ethnic_groups)
    df.loc[i,'Weight']=np.random.choice([np.round(80*random.uniform(0.5, 2),1),'Not disclosed'], p=[0.9,0.1])
    df.loc[i,'Height']=np.random.choice([np.round(random.uniform(100, 200),1),'Not disclosed'], p=[0.9,0.1])
    df.loc[i,'Employment status']=np.random.choice(['Employed','Unemployed','Not disclosed'], p=[0.45,0.45,0.1])
    
    df.loc[i,'Smoking status']=np.random.choice(['Current smoker','Ex-smoker','Never smoked'])
    df.loc[i,'ETOH units per week']=int(random.uniform(0, 35))
    df.loc[i,'Recreational drug use']=np.random.choice(['Yes','No'])
    
    fake = Faker('en-UK')
    df.loc[i,'Address']=fake.address()
    df.loc[i,'Postcode']=fake.postcode()
    df.loc[i,'Telephone number']=fake.phone_number()
    
    country='US'
    while country != 'GB':
        result = fake.location_on_land()
        country = result[3]
    df.loc[i,'Latitude']=result[0]
    df.loc[i,'Longitude']=result[1]
    
    fake = Faker('en-UK')
    df.loc[i,'GP']='Dr '+names.get_full_name()
    df.loc[i,'GP address']=fake.address()
    df.loc[i,'GP postcode']=fake.postcode()
    
    country='US'
    while country != 'GB':
        result = fake.location_on_land()
        country = result[3]
    df.loc[i,'GP Latitude']=result[0]
    df.loc[i,'GP longitude']=result[1]
    df.loc[i,'GP telephone number']=fake.phone_number()
    
    fake = Faker('en-UK')
    df.loc[i,'Hospital Consultant']='Dr '+names.get_full_name()
    df.loc[i,'Hospital address']=fake.address()
    df.loc[i,'Hospital postcode']=fake.postcode()
    
    country='US'
    while country != 'GB':
        result = fake.location_on_land()
        country = result[3]
        
    df.loc[i,'Hospital latitude']=result[0]
    df.loc[i,'Hospital longitude']=result[0]
    df.loc[i,'Hospital telephone number']=fake.phone_number()
    
    df.loc[i,'Diagnosis']=np.random.choice(['Major depressive disorder','Bipolar depression','Generalized Anxiety Disorder'])
    if df.loc[i,'DOB']!='Not disclosed':
        df.loc[i,'Duration of diagnosis']=(datetime.date.today().year-df.loc[i,'DOB'].year)*random.uniform(0, 0.5)
    else:
        df.loc[i,'Duration of diagnosis']=random.uniform(0, 10)
    df.loc[i,'Family History']=np.random.choice(['Positive','Negative'])
    df.loc[i,'Under Community Mental Health Team']=np.random.choice(['Yes','No'])
    df.loc[i,'Has had inpatient treatment']=np.random.choice(['Yes','No'])
    df.loc[i,'Has had therapy']=np.random.choice(['Yes','No'])

    df.loc[i,'On anxiolytic']=np.random.choice(['Yes','No'])
    df.loc[i,'On anti-depressant']=np.random.choice(['Yes','No'])
    df.loc[i,'On lithium']=np.random.choice(['Yes','No'])
    df.loc[i,'On valproate']=np.random.choice(['Yes','No'])
    df.loc[i,'Polypharmacy']=np.random.choice(['Yes','No'])
    df.loc[i,'Physical disability']=np.random.choice(['Yes','No'])
    df.loc[i,'Carers']=np.random.choice(['Yes','No'])
    
    for timepoint in range(timepoints):
        #prime an initial value with probability as shown
        if timepoint==0:
            df.loc[i,'Attended last appointment points: timepoint '+str(timepoint)]=np.random.choice([0,1000], p=[0.5,0.5])
            df.loc[i,'Logged symptoms today points: timepoint '+str(timepoint)]=np.random.choice([0,50], p=[0.5,0.5])
            df.loc[i,'Checked in today points: timepoint '+str(timepoint)]=np.random.choice([0,10], p=[0.5,0.5])
            df.loc[i,'Asked a question points: timepoint '+str(timepoint)]=np.random.choice([0,20], p=[0.8,0.2])
            df.loc[i,'Answered a question points: timepoint '+str(timepoint)]=np.random.choice([0,50], p=[0.8,0.2])
            df.loc[i,'Spoke to trialbot points: timepoint '+str(timepoint)]=np.random.choice([0,20], p=[0.5,0.5])
            df.loc[i,'Epoch points: timepoint '+str(timepoint)]=df.loc[i,'Spoke to trialbot points: timepoint '+str(timepoint)]+df.loc[i,'Answered a question points: timepoint '+str(timepoint)]+df.loc[i,'Asked a question points: timepoint '+str(timepoint)]+df.loc[i,'Checked in today points: timepoint '+str(timepoint)]+df.loc[i,'Logged symptoms today points: timepoint '+str(timepoint)]+df.loc[i,'Attended last appointment points: timepoint '+str(timepoint)]
            
            total_attendance_points+=df.loc[i,'Attended last appointment points: timepoint '+str(timepoint)]
            total_log_points+=df.loc[i,'Logged symptoms today points: timepoint '+str(timepoint)]
            total_check_in_points+=df.loc[i,'Checked in today points: timepoint '+str(timepoint)]
            total_asked_question_points+=df.loc[i,'Asked a question points: timepoint '+str(timepoint)]
            total_answered_question_points+=df.loc[i,'Answered a question points: timepoint '+str(timepoint)]
            total_spoke_to_bot_points+=df.loc[i,'Spoke to trialbot points: timepoint '+str(timepoint)]
            total_points+=df.loc[i,'Epoch points: timepoint '+str(timepoint)]
            
            df.loc[i,'Cumulative points: timepoint '+str(timepoint)]=total_points
            df.loc[i,'Cumulative attendance points: timepoint '+str(timepoint)]=total_attendance_points
            df.loc[i,'Cumulative log points: timepoint '+str(timepoint)]=total_log_points
            df.loc[i,'Cumulative check in points: timepoint '+str(timepoint)]=total_check_in_points
            df.loc[i,'Cumulative answered question points: timepoint '+str(timepoint)]=total_asked_question_points
            df.loc[i,'Cumulative spoke to bot points: timepoint '+str(timepoint)]=total_spoke_to_bot_points

            #simulate randomly anxiety as a function of depression, and vice versa
            order = np.random.choice(['Anxiety','Depression'])
            if order == 'Anxiety':
                df.loc[i,'HADS-Anxiety: timepoint '+str(timepoint)]=random.randint(12, 21)
                df.loc[i,'HADS-Depression: timepoint '+str(timepoint)]=int(df.loc[i,'HADS-Anxiety: timepoint '+str(timepoint)]*random.uniform(0.2, 1))
            if order == 'Depression':
                df.loc[i,'HADS-Depression: timepoint '+str(timepoint)]=random.randint(12, 21)
                df.loc[i,'HADS-Anxiety: timepoint '+str(timepoint)]=int(df.loc[i,'HADS-Depression: timepoint '+str(timepoint)]*random.uniform(0.2, 1))
            
        #work to the hypothesis that prior values have SOME temporal relationship    
        if timepoint>0:      
            if df.loc[i,'Logged symptoms today points: timepoint '+str(timepoint-1)]==0:
#                 p0,p1 = 0.7,0.3
                p0=random.uniform(0.6, 1)
                p1=1-p0
            else:
#                 p0,p1 = 0.4,0.6    
                p1=random.uniform(0.6, 1)
                p0=1-p1
            df.loc[i,'Logged symptoms today points: timepoint '+str(timepoint)]=np.random.choice([0,50], p=[p0,p1])
            
            if df.loc[i,'Checked in today points: timepoint '+str(timepoint-1)]==0:
#                 p0,p1 = 0.6,0.4
                p0=random.uniform(0.6, 1)
                p1=1-p0
            else:
                p0,p1 = 0.4,0.6
                p1=random.uniform(0.6, 1)
                p0=1-p1
            df.loc[i,'Checked in today points: timepoint '+str(timepoint)]=np.random.choice([0,10], p=[p0,p1])
            
            if df.loc[i,'Asked a question points: timepoint '+str(timepoint-1)]==0:
#                 p0,p1 = 0.8,0.2
                p0=random.uniform(0.8, 1)
                p1=1-p0
            else:
#                 p0,p1 = 0.4,0.6
                p1=random.uniform(0.6, 1)
                p0=1-p1
            df.loc[i,'Asked a question points: timepoint '+str(timepoint)]=np.random.choice([0,20], p=[p0,p1])
            
            if df.loc[i,'Answered a question points: timepoint '+str(timepoint-1)]==0:
#                 p0,p1 = 0.8,0.2
                p0=random.uniform(0.8, 1)
                p1=1-p0
            else:
#                 p0,p1 = 0.4,0.6
                p1=random.uniform(0.6, 1)
                p0=1-p1
            df.loc[i,'Answered a question points: timepoint '+str(timepoint)]=np.random.choice([0,50], p=[p0,p1])
            
            if df.loc[i,'Spoke to trialbot points: timepoint '+str(timepoint-1)]==0:
#                 p0,p1 = 0.6,0.4
                p0=random.uniform(0.6, 1)
                p1=1-p0
            else:
#                 p0,p1 = 0.4,0.6
                p1=random.uniform(0.6, 1)
                p0=1-p1
            df.loc[i,'Spoke to trialbot points: timepoint '+str(timepoint)]=np.random.choice([0,20], p=[p0,p1])

            
            engagement = df.loc[i,'Epoch points: timepoint '+str(timepoint-1)]/max_points_per_epoch
            
            #multi-level engagement probabalistic attendence from app engagement
            if df.loc[i,'Attended last appointment points: timepoint '+str(timepoint-1)]>0 and engagement>0.9:
                p1=random.uniform(0.7, 1)
                p0=1-p1
            if df.loc[i,'Attended last appointment points: timepoint '+str(timepoint-1)]>0 and engagement>0.8:
                p1=random.uniform(0.6, 1)
                p0=1-p1
            if df.loc[i,'Attended last appointment points: timepoint '+str(timepoint-1)]>0 and engagement>0.7:
                p1=random.uniform(0.5, 1)
                p0=1-p1
            if df.loc[i,'Attended last appointment points: timepoint '+str(timepoint-1)]>0 and engagement>0.6:
                p1=random.uniform(0.4, 1)
                p0=1-p1
            if df.loc[i,'Attended last appointment points: timepoint '+str(timepoint-1)]>0 and engagement>0.5:
                p1=random.uniform(0.3, 1)
                p0=1-p1
            else:
                p0=random.uniform(0.75, 1)
                p1=1-p0

            df.loc[i,'Attended last appointment points: timepoint '+str(timepoint)]=np.random.choice([0,1000], p=[p0,p1])
        
            total_attendance_points+=df.loc[i,'Attended last appointment points: timepoint '+str(timepoint)]
            total_log_points+=df.loc[i,'Logged symptoms today points: timepoint '+str(timepoint)]
            total_check_in_points+=df.loc[i,'Checked in today points: timepoint '+str(timepoint)]
            total_asked_question_points+=df.loc[i,'Asked a question points: timepoint '+str(timepoint)]
            total_answered_question_points+=df.loc[i,'Answered a question points: timepoint '+str(timepoint)]
            total_spoke_to_bot_points+=df.loc[i,'Spoke to trialbot points: timepoint '+str(timepoint)]
            
            df.loc[i,'Cumulative points: timepoint '+str(timepoint)]=total_points
            df.loc[i,'Cumulative attendance points: timepoint '+str(timepoint)]=total_attendance_points
            df.loc[i,'Cumulative log points: timepoint '+str(timepoint)]=total_log_points
            df.loc[i,'Cumulative check in points: timepoint '+str(timepoint)]=total_check_in_points
            df.loc[i,'Cumulative answered question points: timepoint '+str(timepoint)]=total_asked_question_points
            df.loc[i,'Cumulative spoke to bot points: timepoint '+str(timepoint)]=total_spoke_to_bot_points
            df.loc[i,'Epoch points: timepoint '+str(timepoint)]=df.loc[i,'Spoke to trialbot points: timepoint '+str(timepoint)]+df.loc[i,'Answered a question points: timepoint '+str(timepoint)]+df.loc[i,'Asked a question points: timepoint '+str(timepoint)]+df.loc[i,'Checked in today points: timepoint '+str(timepoint)]+df.loc[i,'Logged symptoms today points: timepoint '+str(timepoint)]+df.loc[i,'Attended last appointment points: timepoint '+str(timepoint)]
            total_points+=df.loc[i,'Epoch points: timepoint '+str(timepoint)]
            
            df.loc[i,'HADS-Anxiety: timepoint '+str(timepoint)]=int((df.loc[i,'HADS-Anxiety: timepoint '+str(timepoint-1)]-engagement)*random.uniform(0.95, 1.2))
            df.loc[i,'HADS-Depression: timepoint '+str(timepoint)]=int((df.loc[i,'HADS-Depression: timepoint '+str(timepoint-1)]-engagement)*random.uniform(0.95, 1.2))
    
            if df.loc[i,'HADS-Anxiety: timepoint '+str(timepoint)]>21:
                df.loc[i,'HADS-Anxiety: timepoint '+str(timepoint)]=21
            if df.loc[i,'HADS-Depression: timepoint '+str(timepoint)]>21:
                df.loc[i,'HADS-Depression: timepoint '+str(timepoint)]=21
            if df.loc[i,'HADS-Anxiety: timepoint '+str(timepoint)]<0:
                df.loc[i,'HADS-Anxiety: timepoint '+str(timepoint)]=0
            if df.loc[i,'HADS-Depression: timepoint '+str(timepoint)]<0:
                df.loc[i,'HADS-Depression: timepoint '+str(timepoint)]=0
                
        #simulate personal health rating as a function of perfect health MINUS anxiety and depression, plus a metric of app engagement, multipled a random variable of life event
        engagement = df.loc[i,'Epoch points: timepoint '+str(timepoint)]/max_points_per_epoch
        enagement_weight=15*engagement
        df.loc[i,'Personal health rating: timepoint '+str(timepoint)]=int((100-(df.loc[i,'HADS-Depression: timepoint '+str(timepoint)]+df.loc[i,'HADS-Anxiety: timepoint '+str(timepoint)])+enagement_weight)*random.uniform(0.7, 1.1))
        if df.loc[i,'Personal health rating: timepoint '+str(timepoint)]>100:
            df.loc[i,'Personal health rating: timepoint '+str(timepoint)]=100
    
    df.loc[i,'Total points']=total_points
    df.loc[i,'Total attendance points']=total_attendance_points
    df.loc[i,'Total log points']=total_log_points
    df.loc[i,'Total check in points']=total_check_in_points
    df.loc[i,'Total answered question points']=total_asked_question_points
    df.loc[i,'Total spoke to bot points']=total_spoke_to_bot_points
        
print(df.shape)
df.head()

10it [00:01,  9.22it/s]

(10, 853)





Unnamed: 0,username,Trial name,First name,Last name,DOB,Address,email,Latitude,Longitude,Postcode,...,Epoch points: timepoint 49,HADS-Anxiety: timepoint 49,HADS-Depression: timepoint 49,Personal health rating: timepoint 49,Total points,Total attendance points,Total log points,Total check in points,Total answered question points,Total spoke to bot points
0,decimalCod8,MindfulAI,Victor,Laumbach,2005-03-03,9 Carter extensions\nVanessahaven\nB1D 0YW,Victor_Laumbach@mindfulai.com,52.41667,0.75,BS5 0HP,...,110.0,21.0,1.0,76.0,21510.0,19000.0,600.0,240.0,520.0,700.0
1,peacefulOcelot9,MindfulAI,Michelle,Garcia,1977-06-07,06 Smart lakes\nLake Mohammad\nL8 8HG,Michelle_Garcia@mindfulai.com,51.39148,-0.29825,L55 1QJ,...,70.0,21.0,21.0,63.0,14410.0,11000.0,1450.0,260.0,180.0,620.0
2,wearyRat3,MindfulAI,Harriet,Schilling,1979-12-16,18 Connor wells\nJonesville\nTQ44 0YG,Harriet_Schilling@mindfulai.com,56.56317,-2.58736,IP1P 1NQ,...,80.0,19.0,17.0,53.0,16680.0,13000.0,900.0,280.0,540.0,860.0
3,eagerCrackers0,MindfulAI,Mary,Clark,1989-10-11,8 Ellis curve\nCraigville\nKT2 9FU,Mary_Clark@mindfulai.com,50.82882,-0.32247,SE5 9QJ,...,60.0,20.0,0.0,56.0,12940.0,11000.0,900.0,210.0,220.0,560.0
4,truthfulHawk9,MindfulAI,Jeffrey,Weber,1977-12-03,Studio 65\nJane lights\nKathleenberg\nM40 9PT,Jeffrey_Weber@mindfulai.com,53.78333,-1.06667,LA7H 6TF,...,10.0,19.0,0.0,74.0,21650.0,19000.0,1200.0,210.0,260.0,680.0


In [6]:
print("Example temporal health rating over time of users")
df_melt = pd.melt(df,id_vars='username')
df_melt = df_melt[df_melt['variable'].str.contains('Cumulative points')]

Example temporal health rating over time of users


In [7]:
df.to_csv('simulated_data.csv')

# now simulate researcher data

In [8]:
researcher_count = 15
# Name, Department (rightnow values: Clinician, IT, Developer), Email, Address, Phone

df_researcher = pd.DataFrame(generate_username(researcher_count),columns=['username'])

df_researcher['Trial name']='MindfullAI'

#constitutional data
df_researcher['First name']=''
df_researcher['Last name']=''
df_researcher['email']=''
df_researcher['Address']=''
df_researcher['Latitude']=0
df_researcher['Longitude']=0
df_researcher['Postcode']=''
df_researcher['Telephone number']=''
df_researcher['Department']=''

for i, row in tqdm(df_researcher.iterrows()):

    df_researcher.loc[i,'First name']=names.get_first_name()
    df_researcher.loc[i,'Last name']=names.get_last_name()
    df_researcher.loc[i,'email']=df_researcher.loc[i,'First name'].lower()+'_'+df_researcher.loc[i,'Last name'].lower()+'@mindfulai.com'
    
    fake = Faker('en-UK')
    df_researcher.loc[i,'Address']=fake.address()
    
    country='US'
    while country != 'GB':
        result = fake.location_on_land()
        country = result[3]
    
    df_researcher.loc[i,'Latitude']=result[0]
    df_researcher.loc[i,'Longitude']=result[1]
    df_researcher.loc[i,'Postcode']=fake.postcode()
    df_researcher.loc[i,'Telephone number']=fake.phone_number()
    
    df_researcher.loc[i,'Department']=np.random.choice(['Healthcare','Developer','Administrator'])
        
print(df_researcher.shape)

15it [00:00, 45.75it/s]

(15, 11)





In [9]:
df_researcher.to_csv('simulated_staff_data.csv')