In [7]:
import pandas as pd
pd.set_option('mode.copy_on_write', True)
import numpy as np

Create simulated patient data

In [72]:
# Set random seed for reproducibility
np.random.seed(42)

# Initial number of patients
n_patients = 1000

In [75]:
# Creating initial pain scores from ethnographic data
real_pain_scores = np.array([8,8,5,2])
mean_pain_score = np.mean(real_pain_scores) 
std_dev_pain_score = np.std(real_pain_scores)
initial_pain_scores = np.random.normal(std_dev_pain_score, mean_pain_score, n_patients).round().astype(int)
initial_pain_scores = np.clip(initial_pain_scores, 1, 10)

In [76]:
# Generate random ages for patients between 30 and 70
ages = np.random.randint(30,70, size=n_patients) 

# Assign patients randomly to vr therapy or control group
vr_therapy = np.random.choice([0,1], size=n_patients)

In [78]:
# simulate standard of care variables 
prescription_pain_relief = np.random.choice(['yes', 'no'], size=n_patients)
cbt = np.random.choice(['yes', 'no'], size=n_patients)
support_group = np.random.choice(['yes', 'no'], size=n_patients)
physiotherapy = np.random.choice(['yes', 'no'], size=n_patients)
                                        
                                             

In [79]:
# create a dataframe for our simulated trial data
data = {
    'Age': ages,
    'Initial Pain Score': initial_pain_scores,
    'VR Therapy': vr_therapy,
    'Prescription Pain Relief': prescription_pain_relief,
    'CBT': cbt,
    'Support Group': support_group,
    'Physiotherapy': physiotherapy,
    'Group': ['VR Therapy' if i == 1 else 'Control' for i in vr_therapy],   
    'Diagnosis': ['Chronic Back Pain'] * n_patients
}

patient_df = pd.DataFrame(data)

# df.to_pickle('simulated_clinical_trial_data.pkl')

# Check the first few rowss
patient_df.head()

Unnamed: 0,Age,Initial Pain Score,VR Therapy,Prescription Pain Relief,CBT,Support Group,Physiotherapy,Group,Diagnosis
0,69,5,0,no,yes,no,no,Control,Chronic Back Pain
1,40,2,0,no,no,no,no,Control,Chronic Back Pain
2,32,6,0,yes,no,yes,yes,Control,Chronic Back Pain
3,35,10,1,no,no,yes,yes,VR Therapy,Chronic Back Pain
4,38,1,1,yes,yes,yes,yes,VR Therapy,Chronic Back Pain


In [80]:
vr_therapy_counts = patient_df['VR Therapy'].value_counts()
vr_therapy_counts

VR Therapy
0    518
1    482
Name: count, dtype: int64

### Estimating VR Effect Size ###
The ethnographic anonymous data provides four patients pain scores before and after trialing a 7 minute 'self care' immersive VR therapy. I'll compute the differences between the pain scores before and after the VR session for each patient, calculate the mean and standard deviation of these differences and use this to calculate Cohen's d (the mean difference divided by the standard deviation of the differences) to determine a reasonable estimate for effect size. 

In [29]:
from statsmodels.stats.power import TTestIndPower

In [33]:
# Determining effect size from ethnographic data
pain_before = np.array([8,8,5,2])
pain_after = np.array([4,1,5,1]) 

# Calculate the differences in scores
pain_differences = pain_before - pain_after

# Calculate the mean and standard deviation of the differences
mean_difference = np.mean(pain_differences)
std_dev_difference = np.std(pain_differences, ddof=1) # numpy advice to provide an unbiased estimator of the variance of the infinite population

# Calculate the effect size
effect_size = mean_difference / std_dev_difference

# Print results
print(f"Mean Difference: {mean_difference}")
print(f"Standard Deviation of Differences: {std_dev_difference}")
print(f"Effect Size: {effect_size}")

Mean Difference: 3.0
Standard Deviation of Differences: 3.1622776601683795
Effect Size: 0.9486832980505138


So this is a pretty large effect and based on a TINY sample (95% difference after VR therapy)... not sure this is realistic? Speak to Matthew

In [36]:
# Investigating sample size with an effect of 95% and a power of 80%
from statsmodels.stats.power import TTestIndPower

# Parameters for power analysis
effect_size = 0.95  # effect size based on ethnographic data
alpha = 0.05  # The common threshold for significance level
power = 0.80  # Desired power aka probability of correctly identifying an effect when there is one 80% of the time - also a medical standard to miss an effect 20% is OK

# Perform power analysis
analysis = TTestIndPower()
sample_size = analysis.solve_power(effect_size=effect_size, alpha=alpha, power=power, ratio=1.0)

# Print the required sample size
print(f"Required sample size per group: {int(np.ceil(sample_size))}")


Required sample size per group: 19


# Let's simulate a trial #

### First we need to set some parameters for our trial simulation, including, defining the following variables: ###
- Number of patients per group: this is pulled from our earlier power calculation 
- Initial pain scores (mean): derived from the mean pain score from ethnographic data
- Initial pain scores (std.dev): derived from the std_dev of pain scores from ethnographic data
- Number of simulations: how many times we want to run the trial 

### We also need to define and take account of (and can adjust) the expected pain reduction from each SOC care treatment ###
- CBT 
- Physio 
- Pain relief 
- Support group

In [67]:

# Setting parameters for trial simulation
n_patients_per_group = 19
mean_initial_pain = 5.75 # mean pain score from ethnographic data
std_dev_initial_pain = 3.20 # standard deviation of pain scores from ethnographic data
effect_size = 0.95 # effect size from ethnographic data
n_simulations = 1000

# Estimated average pain reduction for each SOC treatment
pain_reduction_CBT = 1.0
pain_reduction_physio = 0.8
pain_reduction_prescription = 1.5
pain_reduction_support_group = 0.5


In [68]:


# Function to simulate the trial
def simulate_trial(n_patients_per_group, mean_pain, std_dev, effect_size):
    # Generate initial pain scores
    initial_pain_scores = np.random.normal(mean_pain, std_dev, n_patients_per_group * 2).round().astype(int)
    initial_pain_scores = np.clip(initial_pain_scores, 1, 10)
    
    # Assign SOC treatments
    CBT = np.random.choice([0, 1], size=n_patients_per_group* 2)
    physiotherapy = np.random.choice([0, 1], size=n_patients_per_group * 2)
    prescription = np.random.choice([0, 1], size=n_patients_per_group * 2)
    support_group = np.random.choice([0, 1], size=n_patients_per_group * 2)
    
    # Calculate SOC effect
    SOC_effect = (CBT * pain_reduction_CBT +
                  physiotherapy * pain_reduction_physiotherapy +
                  prescription * pain_reduction_prescription +
                  support_group * pain_reduction_support_group)
    
    # Apply SOC effect to initial pain scores
    adjusted_pain_scores = initial_pain_scores - SOC_effect
    adjusted_pain_scores = np.clip(adjusted_pain_scores, 1, 10)
    
    # Split into control and experimental groups
    control_group = adjusted_pain_scores[:n_patients]
    experimental_group = adjusted_pain_scores[n_patients:]
    
    # Apply VR therapy effect to the experimental group
    experimental_group -= int(effect_size * std_dev)
    experimental_group = np.clip(experimental_group, 1, 10)
    
    return control_group, experimental_group



mean_control: the mean pain score for the control group in each simulation 
mean_experimental: the mean pain score for the experimental group in each simulation
mean_difference: the difference between the mean pain scores of experimental and control groups in each simulation

In [69]:
# Run multiple simulations
np.random.seed(42)

results = []
for i in range(n_simulations):
    control, experimental = simulate_trial(n_patients_per_group, mean_initial_pain, std_dev_initial_pain, effect_size)
    mean_control = np.mean(control)
    mean_experimental = np.mean(experimental)
    results.append((mean_control, mean_experimental, mean_experimental - mean_control))

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=['Mean_Control', 'Mean_Experimental', 'Mean_Difference'])

results_df

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,Mean_Control,Mean_Experimental,Mean_Difference
0,3.610526,,
1,3.715789,,
2,3.931579,,
3,4.602632,,
4,3.997368,,
...,...,...,...
995,4.171053,,
996,4.200000,,
997,4.186842,,
998,3.934211,,


In [70]:
# Calculate summary statistics
summary_stats = results_df.describe()
print(summary_stats)

       Mean_Control  Mean_Experimental  Mean_Difference
count   1000.000000                0.0              0.0
mean       4.101292                NaN              NaN
std        0.408984                NaN              NaN
min        2.842105                NaN              NaN
25%        3.820395                NaN              NaN
50%        4.114474                NaN              NaN
75%        4.378947                NaN              NaN
max        5.665789                NaN              NaN
