Generate simulated data on participant characteristics, app usage, research satisfaction, and retention in order to build an ML model that identifies participants at risk of dropping out.

In [None]:
import numpy as np
import pandas as pd
import random
import datetime

In [106]:
## Seed Python's `random` module so that every draw made through it is repeatable.
## (Values derived from the wall clock, e.g. login datetimes, still depend on when the notebook is run.)
random.seed(a=405)

In [107]:
#App Usage Heuristics

# Function to calculate recency, the freshness of app activity
def calculate_recency(last_activity_date, current_date=None):
    """Return the number of calendar days since the last app activity.

    Args:
        last_activity_date: ``datetime.datetime`` (or ``datetime.date``) of the
            participant's most recent activity. Only the date part is used.
        current_date: Reference ``datetime.date`` (or ``datetime.datetime``) to
            measure from. Defaults to today's date, which is what the function
            always used before this parameter existed. Passing it explicitly
            makes the result independent of when the code is run.

    Returns:
        int: ``current_date - last_activity_date`` in whole days. The value is
        negative when the activity lies after the reference date.
    """
    if current_date is None:
        current_date = datetime.date.today()
    elif isinstance(current_date, datetime.datetime):
        # Compare calendar dates only, so the time of day never shifts the count.
        current_date = current_date.date()

    # datetime.datetime is a subclass of datetime.date, so it must be reduced
    # to a plain date before subtracting (mixing the two raises TypeError).
    if isinstance(last_activity_date, datetime.datetime):
        last_activity_date = last_activity_date.date()

    return (current_date - last_activity_date).days

# Function to calculate frequency (how many times the participant logged into the app within the last 30 days)
def calculate_frequency(login_datetimes, current_datetime=None, window_days=30):
    """Count the logins that fall inside a trailing time window.

    Args:
        login_datetimes: Iterable of ``datetime.datetime`` login timestamps.
            May be empty, in which case the result is 0.
        current_datetime: End of the window. Defaults to
            ``datetime.datetime.now()``, the previous hard-coded behaviour.
        window_days: Length of the window in days. Defaults to 30.

    Returns:
        int: Number of logins ``t`` with
        ``current_datetime - window_days <= t <= current_datetime``
        (both bounds inclusive; logins after ``current_datetime`` are ignored).
    """
    if current_datetime is None:
        current_datetime = datetime.datetime.now()

    window_start = current_datetime - datetime.timedelta(days=window_days)
    return sum(
        1 for login in login_datetimes
        if window_start <= login <= current_datetime
    )

# Function to calculate latency, average gap between app use
def calculate_latency(login_datetimes):
    """Return the mean gap, in days, between consecutive logins.

    Args:
        login_datetimes: Sequence of ``datetime.datetime`` login timestamps,
            in any order.

    Returns:
        The average time between one login and the next, expressed in
        (fractional) days, or 0 when there are fewer than two logins and
        therefore no gap to measure.
    """
    if len(login_datetimes) < 2:
        return 0

    # Sort defensively: gaps are only meaningful between chronologically
    # adjacent logins, and unsorted input would otherwise yield negative gaps.
    ordered = sorted(login_datetimes)

    # Divide by a one-day timedelta instead of reading `.days`: `.days` drops
    # the sub-day remainder, so a gap of 23h59m59.9s would be counted as 0 days.
    one_day = datetime.timedelta(days=1)
    gaps_in_days = [
        (later - earlier) / one_day
        for earlier, later in zip(ordered, ordered[1:])
    ]
    return sum(gaps_in_days) / len(gaps_in_days)

In [116]:
# Generate data
def generate_unique_id():
    """Return a random six-digit numeric ID between 100000 and 999999 inclusive.

    Each call is an independent draw, so two calls can return the same value;
    uniqueness across calls is not enforced here.
    """
    lowest_id = 100000
    highest_id = 999999
    # randrange excludes its upper bound, hence the +1 to keep 999999 reachable.
    return random.randrange(lowest_id, highest_id + 1)

# Weights that turn age, mental-health score and satisfaction into a login count.
# NOTE(review): these three names were used below but never defined anywhere in
# the notebook as saved, so running it in a fresh kernel raised NameError. The
# values here are reconstructed from the saved cell output (they reproduce the
# "Number of Logins" of every complete row still visible there) -- TODO confirm
# against the values the original author used.
age_weight = 0.8
mh_score_weight = 0.5
satisfaction_score_weight = 1.2

participants = []
# 30% 0's (i.e., not retained), 70% 1's (retained).
# NOTE(review): not referenced anywhere else in the notebook as saved; kept so
# that any other cell relying on the name keeps working.
retention_probabilities = [0] * 3 + [1] * 7

# Generate a list of unique participant IDs (sampling without replacement
# guarantees there are no duplicates).
participant_ids = random.sample(range(100000, 999999 + 1), 100)

# Take the wall-clock time once so that every simulated login is measured
# from the same instant. Calling now() per login made the timestamps drift by
# microseconds, which is enough to change whole-day differences between them.
reference_now = datetime.datetime.now()

for participant_id in participant_ids:
    # The order of the random draws is kept as-is so a given seed still
    # produces the same participants.
    age = random.randint(13, 18)
    sex = random.choice(['Male', 'Female'])
    mh_score = round(random.normalvariate(5, 2))
    participant_satisfaction_score = random.randint(1, 5)

    # Ensure the MH_score is between 1 and 10
    mh_score = max(1, min(10, mh_score))

    # Younger participants, higher MH scores and higher satisfaction all
    # increase the number of logins; the count can never be negative.
    num_logins = round(
        (
            (18 - age) * age_weight
            + mh_score * mh_score_weight
            + participant_satisfaction_score * satisfaction_score_weight
        )
        * 2
    )
    num_logins = max(0, num_logins)

    # Each login happened between 1 and 90 days before the reference time;
    # keep them in ascending (chronological) order.
    participant_logins = sorted(
        reference_now - datetime.timedelta(days=random.randint(1, 90))
        for _ in range(num_logins)
    )

    # With no logins at all, the reference time stands in for the last login,
    # which yields a recency of 0 days.
    last_login_datetime = participant_logins[-1] if participant_logins else reference_now
    recency = calculate_recency(last_login_datetime)
    frequency = calculate_frequency(participant_logins)
    latency = calculate_latency(participant_logins)

    # Responsiveness to notifications in hours: a linear function of the number
    # of logins (slope -0.2, intercept 6), floored at 0.1.
    responsiveness = max(-0.2 * num_logins + 6, 0.1)

    # Probability that the participant IS retained (outcome 1). It starts at
    # 0.7 and is lowered by each risk factor below.
    retention_prob = 0.7
    if mh_score > 5:
        retention_prob -= 0.15
    if participant_satisfaction_score < 3:
        retention_prob -= 0.1
    if recency > 30:
        retention_prob -= 0.05

    # Draw the retention outcome: 0 with probability 1 - retention_prob,
    # 1 with probability retention_prob.
    retention_after_10_years = random.choices(
        [0, 1], weights=[1 - retention_prob, retention_prob]
    )[0]

    participant = {
        'ID': participant_id,
        'Age': age,
        'Sex': sex,
        'MH_score': mh_score,
        'Participant_Satisfaction_Score': participant_satisfaction_score,
        'Num_Logins': num_logins,
        'Login_Datetimes': participant_logins,
        'Recency': recency,
        'Frequency': frequency,
        'Latency': latency,
        'Responsiveness': responsiveness,
        'Retention_After_10_Years': retention_after_10_years,
    }

    participants.append(participant)

In [117]:
# Print one summary line per simulated participant.
# The raw login timestamps (participant['Login_Datetimes']) are omitted from the output.
for participant in participants:
    summary_parts = [
        f"ID: {participant['ID']}",
        f"Age: {participant['Age']}",
        f"Sex: {participant['Sex']}",
        f"MH Score: {participant['MH_score']}",
        f"Participant Satisfaction Score: {participant['Participant_Satisfaction_Score']}",
        f"Number of Logins: {participant['Num_Logins']}",
        f"Recency: {participant['Recency']} days",
        f"Frequency: {participant['Frequency']} logins per month",
        f"Latency: {participant['Latency']} days",
        f"Responsiveness: {participant['Responsiveness']}",
        f"Retention After 10 Years: {participant['Retention_After_10_Years']}",
    ]
    print(", ".join(summary_parts))


ID: 838315, Age: 18, Sex: Male, MH Score: 7, Participant Satisfaction Score: 5, Number of Logins: 19, Recency: 6 days, Frequency: 6 logins per month, Latency: 4.444444444444445 days, Responsiveness: 2.1999999999999997, Retention After 10 Years: 0
ID: 739940, Age: 18, Sex: Female, MH Score: 6, Participant Satisfaction Score: 2, Number of Logins: 11, Recency: 14 days, Frequency: 2 logins per month, Latency: 7.5 days, Responsiveness: 3.8, Retention After 10 Years: 0
ID: 453576, Age: 15, Sex: Male, MH Score: 5, Participant Satisfaction Score: 5, Number of Logins: 22, Recency: 2 days, Frequency: 7 logins per month, Latency: 3.857142857142857 days, Responsiveness: 1.5999999999999996, Retention After 10 Years: 1
ID: 455124, Age: 16, Sex: Male, MH Score: 4, Participant Satisfaction Score: 1, Number of Logins: 10, Recency: 12 days, Frequency: 4 logins per month, Latency: 8.555555555555555 days, Responsiveness: 4.0, Retention After 10 Years: 0
ID: 685231, Age: 13, Sex: Male, MH Score: 3, Partici

In [120]:
# Persist the simulated participants as a CSV file.

# Destination of the export (relative to the current working directory).
output_file_path = "MH1_simulated_data_Iterative_Insights.csv"

# One row per participant dictionary, one column per dictionary key.
df = pd.DataFrame(participants)

# index=False keeps the DataFrame's row index out of the file.
df.to_csv(path_or_buf=output_file_path, index=False)