Designing the Notification System using the uploaded files: order_stages.csv and orders.csv. 
This code snippet loads order data, calculates courier wait times, visualizes their distribution, and computes correlations with other variables.

In [3]:
pip install scipy

Collecting scipyNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for scipy from https://files.pythonhosted.org/packages/3f/72/305686527c68f33f1dd3ebdd28f53340d372b2f9e44dccaf6f92e17739d3/scipy-1.14.0-cp312-cp312-win_amd64.whl.metadata
  Downloading scipy-1.14.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     ------------ ------------------------- 20.5/60.8 kB 165.2 kB/s eta 0:00:01
     ------------------------- ------------ 41.0/60.8 kB 281.8 kB/s eta 0:00:01
     -------------------------------------- 60.8/60.8 kB 323.9 kB/s eta 0:00:00
Downloading scipy-1.14.0-cp312-cp312-win_amd64.whl (44.5 MB)
   ---------------------------------------- 0.0/44.5 MB ? eta -:--:--
   ---------------------------------------- 0.2/44.5 MB 4.1 MB/s eta 0:00:11
   ---------------------------------------- 0.5/4


[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load the raw order and order-stage tables from the working directory.
orders = pd.read_csv('orders.csv')
order_stages = pd.read_csv('order_stages.csv')

# Parse the timestamp columns as UTC datetimes so they can be subtracted from each other.
orders['order_promised_delivery'] = pd.to_datetime(orders['order_promised_delivery'], utc=True)
orders['restaurant_finished_preparation'] = pd.to_datetime(orders['restaurant_finished_preparation'], utc=True)
order_stages['order_stage_start'] = pd.to_datetime(order_stages['order_stage_start'], utc=True)

# Inner-join the two tables on order_id; overlapping column names get the given suffixes.
# NOTE(review): if order_stages holds several rows per order, the wait time below is
# computed against the start of *every* stage, not only the courier's arrival at the
# restaurant. Presumably the frame should be filtered to that stage first -- confirm
# against the order_stages schema.
merged_data = pd.merge(orders, order_stages, on='order_id', suffixes=('_order', '_stage'))

# Key metrics: minutes between the stage start and the moment the food was ready.
# Positive values mean the stage started before the restaurant finished preparing.
wait_delta = merged_data['restaurant_finished_preparation'] - merged_data['order_stage_start']
merged_data['courier_wait_time'] = wait_delta.dt.total_seconds() / 60
merged_data['early_arrival'] = merged_data['courier_wait_time'] > 0

# A missing wait time compares as False in `> 0`, which would silently count those rows
# as "not early". Compute the share over rows with a known wait time only.
has_wait_time = merged_data['courier_wait_time'].notna()
early_arrival_pct = merged_data.loc[has_wait_time, 'early_arrival'].mean() * 100

# Print summary statistics
print("Summary Statistics:")
print(merged_data['courier_wait_time'].describe())
print(f"Percentage of early arrivals: {early_arrival_pct:.2f}%")

# Visualize the courier wait time distribution and save it to disk (the figure is
# closed afterwards, so nothing is rendered inline).
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(merged_data['courier_wait_time'].dropna(), bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Courier Wait Time')
ax.set_xlabel('Wait Time (minutes)')
ax.set_ylabel('Count')
fig.savefig('courier_wait_time_distribution.png')
plt.close(fig)

print("Key metrics identified:")
print("1. Courier wait time (minutes)")
print("2. Percentage of early arrivals")

# Pairwise (Pearson, the pandas default) correlation between wait time and the
# travel-time / distance columns.
correlation_columns = [
    'courier_wait_time',
    'estimated_travel_time_in_seconds_to_restaurant',
    'estimated_travel_time_in_seconds_to_eater',
    'distance_courier_to_restaurant_address',
    'distance_courier_to_customer_address',
]
correlation_matrix = merged_data[correlation_columns].corr()

print("Correlation matrix:")
print(correlation_matrix['courier_wait_time'].sort_values(ascending=False))

Summary Statistics:
count    384856.000000
mean         -4.284304
std          12.057329
min        -944.533333
25%         -11.333333
50%          -1.750000
75%           3.450000
max          95.116667
Name: courier_wait_time, dtype: float64
Percentage of early arrivals: 36.82%
Key metrics identified:
1. Courier wait time (minutes)
2. Percentage of early arrivals
Correlation matrix:
courier_wait_time                                 1.000000
distance_courier_to_customer_address              0.271098
estimated_travel_time_in_seconds_to_restaurant    0.115935
distance_courier_to_restaurant_address            0.078905
estimated_travel_time_in_seconds_to_eater        -0.116251
Name: courier_wait_time, dtype: float64


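Handling errors from the earlier A/B test code: the cell below regenerates the simulation as a standalone notebook, `fixed_AB_test.ipynb`.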

In [8]:
import nbformat
from nbformat.v4 import new_notebook, new_code_cell

# Build a standalone notebook ('fixed_AB_test.ipynb') containing a simulated A/B test.
# Each new_code_cell(...) below holds the source of one cell of the generated notebook;
# the generated code is only written to disk here, it is not executed.
nb = new_notebook()

# Cell 1: Imports and setup
cell1 = new_code_cell("""
import numpy as np
import pandas as pd
from scipy import stats

# Define A/B test strategy and simulation parameters
np.random.seed(42)

# Define test parameters
sample_size = 1000  # Arbitrary sample size for simulation
""")
nb.cells.append(cell1)

# Cell 2: Create test groups
# The frequency alias is spelled '5min': the single-letter 'T' alias is deprecated in
# recent pandas releases and emits a FutureWarning.
cell2 = new_code_cell("""
# Simulate courier arrival and food ready times for Group A (Control)
group_a = pd.DataFrame({
    'courier_arrival_time': pd.date_range(start='2023-01-01', periods=sample_size, freq='5min'),
    'food_ready_time': pd.date_range(start='2023-01-01', periods=sample_size, freq='5min') + pd.Timedelta(minutes=5)
})

# Simulate courier arrival and food ready times for Group B (Test)
group_b = pd.DataFrame({
    'courier_arrival_time': pd.date_range(start='2023-01-01', periods=sample_size, freq='5min'),
    'food_ready_time': pd.date_range(start='2023-01-01', periods=sample_size, freq='5min') + pd.Timedelta(minutes=4)
})

# Add random noise to simulate real-world variation
group_a['courier_arrival_time'] += pd.to_timedelta(np.random.normal(0, 2, sample_size), unit='m')
group_b['courier_arrival_time'] += pd.to_timedelta(np.random.normal(0, 2, sample_size), unit='m')
""")
nb.cells.append(cell2)

# Cell 3: Define metrics calculation function
cell3 = new_code_cell("""
def calculate_metrics(group_data):
    early_arrivals = (group_data['courier_arrival_time'] <= group_data['food_ready_time']).mean()
    avg_wait_time = (group_data['food_ready_time'] - group_data['courier_arrival_time']).mean().total_seconds() / 60
    return early_arrivals, avg_wait_time

early_arrivals_a, avg_wait_time_a = calculate_metrics(group_a)
early_arrivals_b, avg_wait_time_b = calculate_metrics(group_b)

print(f"Simulated results:")
print(f"Group A - Early arrivals: {early_arrivals_a:.2%}, Avg wait time: {avg_wait_time_a:.2f} minutes")
print(f"Group B - Early arrivals: {early_arrivals_b:.2%}, Avg wait time: {avg_wait_time_b:.2f} minutes")
""")
nb.cells.append(cell3)

# Cell 4: Perform statistical test
cell4 = new_code_cell("""
_, p_value = stats.ttest_ind(
    (group_a['courier_arrival_time'] <= group_a['food_ready_time']),
    (group_b['courier_arrival_time'] <= group_b['food_ready_time'])
)

print(f"p-value: {p_value:.4f}")
print(f"Statistically significant difference: {'Yes' if p_value < 0.05 else 'No'}")
""")
nb.cells.append(cell4)

# Cell 5: Calculate test duration (fixed)
# NOTE(review): the generated calculation is circular -- average_daily_orders is derived
# from sample_size / 30, so test_duration_days always evaluates to 30 regardless of the
# data. A real daily order volume is needed for a meaningful estimate.
cell5 = new_code_cell("""
# Determine the length of the A/B test
average_daily_orders = sample_size / 30  # Assuming we want to complete the test in a month
test_duration_days = int(np.ceil(sample_size / average_daily_orders))

print(f"Estimated test duration: {test_duration_days} days")
""")
nb.cells.append(cell5)

# Write the new notebook. The encoding is set explicitly so the file is UTF-8 on every
# platform instead of depending on the locale's default encoding.
with open('fixed_AB_test.ipynb', 'w', encoding='utf-8') as f:
    nbformat.write(nb, f)

print("New notebook 'fixed_AB_test.ipynb' has been created with the corrected A/B test code.")

New notebook 'fixed_AB_test.ipynb' has been created with the corrected A/B test code.


Now, define the A/B test strategy and simulation parameters. We'll simulate the A/B test results and evaluate the impact, determine the length of the A/B test, and prepare recommendations for the full rollout. Let's proceed with these steps.

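This code prepares the merged order data and simulates an A/B test comparing two notification strategies for couriers: it generates a notification time and a courier arrival time for each group, as the basis for calculating early arrival rates and average wait times.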

In [9]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Read both source tables and join them on order_id (inner join, pandas default).
orders = pd.read_csv('orders.csv')
order_stages = pd.read_csv('order_stages.csv')
merged_data = pd.merge(orders, order_stages, on='order_id')

# Parse every timestamp column as UTC, then drop the timezone so all values are
# naive datetimes that can be subtracted from each other.
timestamp_columns = ['order_promised_delivery', 'restaurant_finished_preparation', 'order_stage_start', 'updated_expected_delivery_time']
for col in timestamp_columns:
    parsed_utc = pd.to_datetime(merged_data[col], utc=True)
    merged_data[col] = parsed_utc.dt.tz_localize(None)

# Minutes from the stage start until the restaurant finished preparing.
# NOTE(review): this is evaluated for every stage row of an order, so it can be
# negative for stages that start after preparation finished -- presumably only one
# stage should be used as the reference; confirm against the order_stages schema.
prep_delta = merged_data['restaurant_finished_preparation'] - merged_data['order_stage_start']
merged_data['actual_prep_time'] = prep_delta.dt.total_seconds() / 60

print("Data preparation completed successfully.")
print("Sample of merged_data:")
merged_preview_columns = ['order_id', 'order_promised_delivery', 'restaurant_finished_preparation', 'order_stage_start', 'actual_prep_time']
print(merged_data[merged_preview_columns].head())

# --- A/B test simulation -------------------------------------------------------
# Seed the global NumPy generator so the simulated arrival delays are reproducible.
np.random.seed(42)
sample_size = len(merged_data)

# Group A (current system): the notification goes out when the food is ready.
group_a = merged_data.assign(
    notification_time=merged_data['restaurant_finished_preparation']
)

# Group B (new system): the notification goes out 5 minutes before the food is ready.
group_b = merged_data.assign(
    notification_time=merged_data['restaurant_finished_preparation'] - pd.Timedelta(minutes=5)
)

# Courier arrival = notification time + a normally distributed delay in minutes
# (mean 5, standard deviation 2). Group A's delays are drawn first, then group B's.
arrival_delay_a = pd.to_timedelta(np.random.normal(5, 2, sample_size), unit='m')
group_a['courier_arrival_time'] = group_a['notification_time'] + arrival_delay_a
arrival_delay_b = pd.to_timedelta(np.random.normal(5, 2, sample_size), unit='m')
group_b['courier_arrival_time'] = group_b['notification_time'] + arrival_delay_b

group_preview_columns = ['order_id', 'notification_time', 'courier_arrival_time']
print("A/B test groups created successfully.")
print("Sample of group_a:")
print(group_a[group_preview_columns].head())
print("Sample of group_b:")
print(group_b[group_preview_columns].head())

Data preparation completed successfully.
Sample of merged_data:
   order_id order_promised_delivery restaurant_finished_preparation  \
0         1     2021-10-15 03:31:11             2021-10-15 03:08:44   
1         1     2021-10-15 03:31:11             2021-10-15 03:08:44   
2         1     2021-10-15 03:31:11             2021-10-15 03:08:44   
3         1     2021-10-15 03:31:11             2021-10-15 03:08:44   
4         1     2021-10-15 03:31:11             2021-10-15 03:08:44   

    order_stage_start  actual_prep_time  
0 2021-10-15 03:04:02          4.700000  
1 2021-10-15 03:04:12          4.533333  
2 2021-10-15 03:08:51         -0.116667  
3 2021-10-15 03:08:54         -0.166667  
4 2021-10-15 03:38:01        -29.283333  
A/B test groups created successfully.
Sample of group_a:
   order_id   notification_time          courier_arrival_time
0         1 2021-10-15 03:08:44 2021-10-15 03:14:43.605698360
1         1 2021-10-15 03:08:44 2021-10-15 03:13:27.408283862
2         1 20