In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from preprocessing3 import Preprocessor
import os

# Why does time-average estimation perform well?
- Because the aggregate number of orders each day stays stable, that is to say, it has low variance. We therefore want to look at the distribution of variances across grid cells and find the boundary of the time-average method, i.e., the point at which the variance becomes too high and the method fails.

In [8]:
# Location of the cleaned order records.
data_path = '/home/go3/wch_code/jx/real_data/data/cleaned_data2.csv'

# The five consecutive days 2022-10-17 .. 2022-10-21 as datetime.date objects.
all_dates = [pd.to_datetime(f'2022-10-{day}').date() for day in range(17, 22)]

# Read the CSV and hand it straight to the project preprocessor.
df = Preprocessor.preprocess(pd.read_csv(data_path), all_dates)

# Two-hour time windows from 08:00 to 23:59:59, as (start, end) strings:
# ('08:00', '09:59:59'), ('10:00', '11:59:59'), ..., ('22:00', '23:59:59').
# Hourly alternative that was tried earlier:
#   [(f'{hour:02d}:00:00', f'{hour:02d}:59:59') for hour in range(8, 24)]
time_windows = [
    (f'{hour:02d}:00', f'{hour + 1:02d}:59:59')
    for hour in range(8, 24, 2)
]

# Grid sizes in kilometers: fine steps up to 1 km, coarser steps afterwards.
dist_per_grid_list = [
    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
    1.5, 2.0, 2.5, 3.0,
    4.0, 5.0,
]

# Process each grid size
# for dist_per_grid in dist_per_grid_list:

def run_single_combination(args):
    """Save a boxplot of per-cell order-count variances for one grid size.

    The module-level ``df`` is cut into grid cells with
    ``Preprocessor.cut_df``. For every window in the module-level
    ``time_windows`` the orders are counted per (``sell_index``, ``date``)
    pair and the sample variance of those counts is taken per ``sell_index``.
    Each window contributes one box (built from the variances that survive the
    upper outlier cut) plus a red diamond at the mean of all its variances.
    The figure is written to
    ``var_boxplot/variance_boxplot_dpg_<grid size>.png``.

    Args:
        args: The grid size, forwarded to ``Preprocessor.cut_df`` and printed
            with a "km" suffix.

    Returns:
        None. The results are the printed progress line and the saved figure.
    """
    dist_per_grid = args
    print(f"Processing grid size: {dist_per_grid} km")

    # Only the first element returned by cut_df is used here; timeperiod=60 is
    # passed through unchanged and its meaning is defined by Preprocessor.
    base_df, *_ = Preprocessor.cut_df(df, dist_per_grid, timeperiod=60)

    variance_data = {}  # window -> variances kept after the outlier cut
    mean_var = {}       # window -> mean of all variances (before the cut)
    for tw in time_windows:
        start_time = pd.to_datetime(tw[0]).time()
        end_time = pd.to_datetime(tw[1]).time()
        in_window = (base_df['time'] >= start_time) & (base_df['time'] <= end_time)
        tw_df = base_df[in_window]

        # NOTE(review): size() presumably yields a row only for (cell, day)
        # pairs that occur in tw_df (unless the keys are categorical), so days
        # on which a cell had no orders are not counted as zeros in the
        # variance below -- confirm this is intended.
        counts = tw_df.groupby(['sell_index', 'date']).size().reset_index(name='count')
        variances = counts.groupby('sell_index')['count'].var().reset_index(name='variance')

        mean_var[tw] = variances['variance'].mean()

        # Cells with a single counted day have NaN variance and are dropped.
        kept = variances['variance'].dropna().values
        if len(kept) > 0:
            q1, q3, q95 = np.percentile(kept, [25, 75, 95])
            # Upper cut only: the Tukey fence, capped at the 95th percentile.
            upper_bound = min(q3 + 1.5 * (q3 - q1), q95)
            kept = kept[kept <= upper_bound]
        variance_data[tw] = kept

    # Long-format frame for seaborn; explicit columns keep it usable when empty.
    labels = {tw: f"{tw[0]}-{tw[1]}" for tw in time_windows}
    plot_df = pd.DataFrame(
        [
            {'time_window': labels[tw], 'variance': var}
            for tw in time_windows
            for var in variance_data[tw]
        ],
        columns=['time_window', 'variance'],
    )

    # exist_ok avoids FileExistsError when several pool workers reach this
    # point at the same time.
    save_path = 'var_boxplot'
    os.makedirs(save_path, exist_ok=True)

    if plot_df.empty:
        print(f"No variances to plot for grid size {dist_per_grid} km; boxplot skipped")
        return

    lab_size = 15
    font_size = 15

    # Apply the theme before the figure exists so every figure, including the
    # first one drawn in a worker process, is styled the same way.
    sns.set(style='white', font_scale=1)
    fig, ax = plt.subplots(figsize=(12, 8))

    # Fix the category order so box positions match the enumerate() index used
    # for the mean markers even when a window contributed no variances.
    sns.boxplot(x='time_window', y='variance', data=plot_df,
                order=list(labels.values()), ax=ax)

    mean_label = 'Mean'
    for idx, tw in enumerate(time_windows):
        mean_val = mean_var[tw]
        if np.isnan(mean_val):
            continue
        ax.plot(idx, mean_val, 'rD', markersize=8, label=mean_label)
        mean_label = '_nolegend_'  # only the first marker gets a legend entry
    if mean_label != 'Mean':
        ax.legend(fontsize=font_size)

    ax.set_title(f'Variance of Order Counts Across Days for Grid Size {dist_per_grid} km',
                 fontsize=font_size)
    ax.tick_params(labelsize=lab_size)
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_xlabel("Time Window", fontsize=font_size)
    ax.set_ylabel("Variance", fontsize=font_size)
    sns.despine(ax=ax)  # remove the top and right spines
    ax.grid(axis='y')
    fig.tight_layout()
    fig.savefig(os.path.join(save_path, f'variance_boxplot_dpg_{dist_per_grid}.png'))
    plt.close(fig)

# print("Processing complete. Boxplot figures have been saved.")

# Run every grid size in its own worker process.
import multiprocessing

# Leave one processor free, but never ask for fewer than one worker:
# Pool(processes=0) raises ValueError on a single-core machine.
max_processors = max(1, multiprocessing.cpu_count() - 1)

# Each work item is a bare grid size (a scalar, not a tuple).
combinations = list(dist_per_grid_list)
args_list = list(combinations)

with multiprocessing.Pool(processes=max_processors) as pool:
    # pool.map returns one entry per grid size, in input order.
    results_list = pool.map(run_single_combination, args_list)

Processing grid size: 0.6 kmProcessing grid size: 0.2 kmProcessing grid size: 1.0 kmProcessing grid size: 0.8 kmProcessing grid size: 0.4 kmProcessing grid size: 0.5 kmProcessing grid size: 0.3 kmProcessing grid size: 0.1 kmProcessing grid size: 0.7 kmProcessing grid size: 0.9 kmProcessing grid size: 5.0 kmProcessing grid size: 1.5 kmProcessing grid size: 4.0 km
Processing grid size: 3.0 km

Processing grid size: 2.0 km




Processing grid size: 2.5 km









In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import multiprocessing
# from Preprocessor import Preprocessor  # Assuming this is defined elsewhere

# Location of the cleaned order records.
data_path = '/home/go3/wch_code/jx/real_data/data/cleaned_data2.csv'

# The five consecutive days 2022-10-17 .. 2022-10-21 as datetime.date objects.
all_dates = [pd.to_datetime(f'2022-10-{day}').date() for day in range(17, 22)]

# Read the CSV and hand it straight to the project preprocessor.
df = Preprocessor.preprocess(pd.read_csv(data_path), all_dates)

# Two-hour time windows from 08:00 to 23:59:59, as (start, end) strings:
# ('08:00', '09:59:59'), ('10:00', '11:59:59'), ..., ('22:00', '23:59:59').
time_windows = [
    (f'{hour:02d}:00', f'{hour + 1:02d}:59:59')
    for hour in range(8, 24, 2)
]

# Grid sizes in kilometers: 0.1 km steps up to 1 km, then a denser sweep
# between 1 km and 5 km.
dist_per_grid_list = [
    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
    1.3, 1.5, 1.8,
    2.0, 2.3, 2.5, 2.8,
    3.0, 3.3, 3.5, 3.8,
    4.0, 4.3, 4.5, 4.8,
    5.0,
]

def run_single_combination(dist_per_grid):
    """Plot and summarise per-cell order-count variances for one grid size.

    The module-level ``df`` is cut into grid cells with
    ``Preprocessor.cut_df``. For every window in the module-level
    ``time_windows`` the orders are counted per (``sell_index``, ``date``)
    pair and the sample variance of those counts is taken per ``sell_index``.
    A boxplot (one box per window, red diamond at the mean of all of that
    window's variances) is written to
    ``var_boxplot/variance_boxplot_dpg_<grid size>.png``.

    Args:
        dist_per_grid: The grid size, forwarded to ``Preprocessor.cut_df`` and
            printed with a "km" suffix.

    Returns:
        tuple: ``(dist_per_grid, mean, error)`` where ``mean`` is the mean of
        the outlier-trimmed variances pooled over all windows and ``error`` is
        ``1.96 * standard error`` of that mean. Both are NaN when fewer than
        two variances are available.
    """
    print(f"Processing grid size: {dist_per_grid} km")

    # Only the first element returned by cut_df is used here; timeperiod=60 is
    # passed through unchanged and its meaning is defined by Preprocessor.
    base_df, *_ = Preprocessor.cut_df(df, dist_per_grid, timeperiod=60)

    variance_data = {}  # window -> variances kept after the outlier cut
    mean_var = {}       # window -> mean of all variances (before the cut)
    for tw in time_windows:
        start_time = pd.to_datetime(tw[0]).time()
        end_time = pd.to_datetime(tw[1]).time()
        in_window = (base_df['time'] >= start_time) & (base_df['time'] <= end_time)
        tw_df = base_df[in_window]

        # NOTE(review): size() presumably yields a row only for (cell, day)
        # pairs that occur in tw_df (unless the keys are categorical), so days
        # on which a cell had no orders are not counted as zeros in the
        # variance below -- confirm this is intended.
        counts = tw_df.groupby(['sell_index', 'date']).size().reset_index(name='count')
        variances = counts.groupby('sell_index')['count'].var().reset_index(name='variance')

        mean_var[tw] = variances['variance'].mean()

        # Cells with a single counted day have NaN variance and are dropped.
        kept = variances['variance'].dropna().values
        if len(kept) > 0:
            q1, q3, q95 = np.percentile(kept, [25, 75, 95])
            # Upper cut only: the Tukey fence, capped at the 95th percentile.
            upper_bound = min(q3 + 1.5 * (q3 - q1), q95)
            kept = kept[kept <= upper_bound]
        variance_data[tw] = kept

    # Long-format frame for seaborn; explicit columns keep it usable when empty.
    labels = {tw: f"{tw[0]}-{tw[1]}" for tw in time_windows}
    plot_df = pd.DataFrame(
        [
            {'time_window': labels[tw], 'variance': var}
            for tw in time_windows
            for var in variance_data[tw]
        ],
        columns=['time_window', 'variance'],
    )

    # exist_ok avoids FileExistsError when several pool workers reach this
    # point at the same time.
    save_path = 'var_boxplot'
    os.makedirs(save_path, exist_ok=True)

    if plot_df.empty:
        print(f"No variances to plot for grid size {dist_per_grid} km; boxplot skipped")
    else:
        lab_size = 15
        font_size = 15

        # Apply the theme before the figure exists so every figure, including
        # the first one drawn in a worker process, is styled the same way.
        sns.set(style='white', font_scale=1)
        fig, ax = plt.subplots(figsize=(12, 8))

        # Fix the category order so box positions match the enumerate() index
        # used for the mean markers even when a window has no variances.
        sns.boxplot(x='time_window', y='variance', data=plot_df,
                    order=list(labels.values()), ax=ax)

        mean_label = 'Mean'
        for idx, tw in enumerate(time_windows):
            mean_val = mean_var[tw]
            if np.isnan(mean_val):
                continue
            ax.plot(idx, mean_val, 'rD', markersize=8, label=mean_label)
            mean_label = '_nolegend_'  # only the first marker gets a legend entry
        if mean_label != 'Mean':
            ax.legend(fontsize=font_size)

        ax.set_title(f'Variance of Order Counts Across Days for Grid Size {dist_per_grid} km',
                     fontsize=font_size)
        ax.tick_params(labelsize=lab_size)
        ax.tick_params(axis='x', labelrotation=45)
        ax.set_xlabel("Time Window", fontsize=font_size)
        ax.set_ylabel("Variance", fontsize=font_size)
        sns.despine(ax=ax)
        ax.grid(axis='y')
        fig.tight_layout()
        fig.savefig(os.path.join(save_path, f'variance_boxplot_dpg_{dist_per_grid}.png'))
        plt.close(fig)

    # Pool the trimmed variances of all windows. np.concatenate raises on an
    # empty list, so fall back to an empty array when no window has data.
    non_empty = [variance_data[tw] for tw in time_windows if len(variance_data[tw]) > 0]
    all_variances = np.concatenate(non_empty) if non_empty else np.array([])

    if len(all_variances) > 1:
        overall_mean = np.mean(all_variances)
        std_err = np.std(all_variances, ddof=1) / np.sqrt(len(all_variances))
        error = 1.96 * std_err  # half-width of the 95% confidence interval
    else:
        overall_mean = np.nan
        error = np.nan

    return (dist_per_grid, overall_mean, error)

# Parallel processing: one task per grid size.
# Leave one processor free, but never ask for fewer than one worker:
# Pool(processes=0) raises ValueError on a single-core machine.
max_processors = max(1, multiprocessing.cpu_count() - 1)
combinations = dist_per_grid_list  # list of scalar grid sizes

with multiprocessing.Pool(processes=max_processors) as pool:
    # pool.map keeps the input order, so results line up with combinations.
    results_list = pool.map(run_single_combination, combinations)

# Split the (grid size, mean variance, error) tuples into plot series.
grid_sizes = [result[0] for result in results_list]
mean_vars = [result[1] for result in results_list]
errors = [result[2] for result in results_list]

# Mean variance against grid size, with the error term as error bars.
fig, ax = plt.subplots(figsize=(12, 8))
ax.errorbar(grid_sizes, mean_vars, yerr=errors, fmt='-o', capsize=5)
ax.set_xlabel('Grid Size (km)', fontsize=15)
ax.set_ylabel('Mean Variance', fontsize=15)
ax.set_title('Mean Variance of Order Counts vs Grid Size with 95% Confidence Intervals', fontsize=15)
ax.grid(True)
fig.tight_layout()
fig.savefig('mean_variance_vs_grid_size.png')
plt.close(fig)

print("Processing complete. All plots have been saved.")

Processing grid size: 0.2 kmProcessing grid size: 0.5 kmProcessing grid size: 0.3 kmProcessing grid size: 0.1 kmProcessing grid size: 0.4 km
Processing grid size: 0.6 kmProcessing grid size: 0.9 kmProcessing grid size: 0.7 kmProcessing grid size: 2.0 kmProcessing grid size: 0.8 km
Processing grid size: 1.0 km
Processing grid size: 1.3 km
Processing grid size: 2.5 km
Processing grid size: 3.0 kmProcessing grid size: 1.5 kmProcessing grid size: 3.8 kmProcessing grid size: 3.3 km
Processing grid size: 4.5 km

Processing grid size: 2.8 kmProcessing grid size: 1.8 km
Processing grid size: 2.3 kmProcessing grid size: 3.5 km

Processing grid size: 4.3 kmProcessing grid size: 4.0 km

Processing grid size: 5.0 kmProcessing grid size: 4.8 km












Processing complete. All plots have been saved.


In [13]:
# Re-draw the summary plot from the results of the previous cell, this time
# with larger tick labels.
# Each entry of results_list is a (grid size, mean variance, error) tuple.
columns = zip(*results_list) if results_list else ((), (), ())
grid_sizes, mean_vars, errors = map(list, columns)

flg, ax = plt.subplots(figsize=(12, 8))
ax.tick_params(labelsize=15)
ax.errorbar(grid_sizes, mean_vars, yerr=errors, fmt='-o', capsize=5)
ax.set_xlabel('Grid Size (km)', fontsize=15)
ax.set_ylabel('Mean Variance', fontsize=15)
ax.set_title('Mean Variance of Order Counts vs Grid Size with 95% Confidence Intervals', fontsize=15)
ax.grid(True)
flg.tight_layout()
# Overwrites the file of the same name written by the previous cell.
flg.savefig('mean_variance_vs_grid_size.png')
plt.close(flg)

# Normalization of daily order counts per grid cell

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import multiprocessing
# from Preprocessor import Preprocessor  # Assuming this is defined elsewhere

# Location of the cleaned order records.
data_path = '/home/go3/wch_code/jx/real_data/data/cleaned_data2.csv'

# The five consecutive days 2022-10-17 .. 2022-10-21 as datetime.date objects.
all_dates = [pd.to_datetime(f'2022-10-{day}').date() for day in range(17, 22)]

# Read the CSV and hand it straight to the project preprocessor.
df = Preprocessor.preprocess(pd.read_csv(data_path), all_dates)

# Two-hour time windows from 08:00 to 23:59:59, as (start, end) strings:
# ('08:00', '09:59:59'), ('10:00', '11:59:59'), ..., ('22:00', '23:59:59').
time_windows = [
    (f'{hour:02d}:00', f'{hour + 1:02d}:59:59')
    for hour in range(8, 24, 2)
]

# Grid sizes in kilometers: 0.1 km steps up to 1 km, then a denser sweep
# between 1 km and 5 km.
dist_per_grid_list = [
    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
    1.3, 1.5, 1.8,
    2.0, 2.3, 2.5, 2.8,
    3.0, 3.3, 3.5, 3.8,
    4.0, 4.3, 4.5, 4.8,
    5.0,
]

def run_single_combination(dist_per_grid):
    """Plot and summarise variances of z-scored order counts for one grid size.

    The module-level ``df`` is cut into grid cells with
    ``Preprocessor.cut_df``. For every window in the module-level
    ``time_windows`` the orders are counted per (``sell_index``, ``date``)
    pair, the counts are z-scored with the mean and standard deviation of all
    counts in that window, and the sample variance of the z-scored counts is
    taken per ``sell_index``. A boxplot (one box per window, red diamond at
    the mean of all of that window's variances) is written to
    ``var_boxplot/normalized/variance_boxplot_dpg_<grid size>.png``.

    Args:
        dist_per_grid: The grid size, forwarded to ``Preprocessor.cut_df`` and
            printed with a "km" suffix.

    Returns:
        tuple: ``(dist_per_grid, mean, error)`` where ``mean`` is the mean of
        the outlier-trimmed variances pooled over all windows and ``error`` is
        ``1.96 * standard error`` of that mean. Both are NaN when fewer than
        two variances are available.
    """
    print(f"Processing grid size: {dist_per_grid} km")

    # Only the first element returned by cut_df is used here; timeperiod=60 is
    # passed through unchanged and its meaning is defined by Preprocessor.
    base_df, *_ = Preprocessor.cut_df(df, dist_per_grid, timeperiod=60)

    variance_data = {}  # window -> variances kept after the outlier cut
    mean_var = {}       # window -> mean of all variances (before the cut)
    for tw in time_windows:
        start_time = pd.to_datetime(tw[0]).time()
        end_time = pd.to_datetime(tw[1]).time()
        in_window = (base_df['time'] >= start_time) & (base_df['time'] <= end_time)
        tw_df = base_df[in_window]

        # NOTE(review): size() presumably yields a row only for (cell, day)
        # pairs that occur in tw_df (unless the keys are categorical), so days
        # on which a cell had no orders are not counted as zeros in the
        # variance below -- confirm this is intended.
        counts = tw_df.groupby(['sell_index', 'date']).size().reset_index(name='count')

        # Z-score the counts over all cells and days of this window.
        # NOTE(review): subtracting a global mean leaves each per-cell variance
        # unchanged and dividing by the global std scales every per-cell
        # variance by the same 1/std**2, so this rescales the variances rather
        # than normalising each cell by its own order volume -- confirm intent.
        counts['count'] = (counts['count'] - counts['count'].mean()) / counts['count'].std()

        variances = counts.groupby('sell_index')['count'].var().reset_index(name='variance')

        mean_var[tw] = variances['variance'].mean()

        # Cells with a single counted day have NaN variance and are dropped.
        kept = variances['variance'].dropna().values
        if len(kept) > 0:
            q1, q3, q95 = np.percentile(kept, [25, 75, 95])
            # Upper cut only: the Tukey fence, capped at the 95th percentile.
            upper_bound = min(q3 + 1.5 * (q3 - q1), q95)
            kept = kept[kept <= upper_bound]
        variance_data[tw] = kept

    # Long-format frame for seaborn; explicit columns keep it usable when empty.
    labels = {tw: f"{tw[0]}-{tw[1]}" for tw in time_windows}
    plot_df = pd.DataFrame(
        [
            {'time_window': labels[tw], 'variance': var}
            for tw in time_windows
            for var in variance_data[tw]
        ],
        columns=['time_window', 'variance'],
    )

    # exist_ok avoids FileExistsError when several pool workers reach this
    # point at the same time.
    save_path = 'var_boxplot/normalized'
    os.makedirs(save_path, exist_ok=True)

    if plot_df.empty:
        print(f"No variances to plot for grid size {dist_per_grid} km; boxplot skipped")
    else:
        lab_size = 15
        font_size = 15

        # Apply the theme before the figure exists so every figure, including
        # the first one drawn in a worker process, is styled the same way.
        sns.set(style='white', font_scale=1)
        fig, ax = plt.subplots(figsize=(12, 8))

        # Fix the category order so box positions match the enumerate() index
        # used for the mean markers even when a window has no variances.
        sns.boxplot(x='time_window', y='variance', data=plot_df,
                    order=list(labels.values()), ax=ax)

        mean_label = 'Mean'
        for idx, tw in enumerate(time_windows):
            mean_val = mean_var[tw]
            if np.isnan(mean_val):
                continue
            ax.plot(idx, mean_val, 'rD', markersize=8, label=mean_label)
            mean_label = '_nolegend_'  # only the first marker gets a legend entry
        if mean_label != 'Mean':
            ax.legend(fontsize=font_size)

        ax.set_title(f'Variance of Order Counts Across Days for Grid Size {dist_per_grid} km',
                     fontsize=font_size)
        ax.tick_params(labelsize=lab_size)
        ax.tick_params(axis='x', labelrotation=45)
        ax.set_xlabel("Time Window", fontsize=font_size)
        ax.set_ylabel("Variance", fontsize=font_size)
        sns.despine(ax=ax)
        ax.grid(axis='y')
        fig.tight_layout()
        fig.savefig(os.path.join(save_path, f'variance_boxplot_dpg_{dist_per_grid}.png'))
        plt.close(fig)

    # Pool the trimmed variances of all windows. np.concatenate raises on an
    # empty list, so fall back to an empty array when no window has data.
    non_empty = [variance_data[tw] for tw in time_windows if len(variance_data[tw]) > 0]
    all_variances = np.concatenate(non_empty) if non_empty else np.array([])

    if len(all_variances) > 1:
        overall_mean = np.mean(all_variances)
        std_err = np.std(all_variances, ddof=1) / np.sqrt(len(all_variances))
        error = 1.96 * std_err  # half-width of the 95% confidence interval
    else:
        overall_mean = np.nan
        error = np.nan

    return (dist_per_grid, overall_mean, error)

# Parallel processing: one task per grid size.
# Leave one processor free, but never ask for fewer than one worker:
# Pool(processes=0) raises ValueError on a single-core machine.
max_processors = max(1, multiprocessing.cpu_count() - 1)
combinations = dist_per_grid_list  # list of scalar grid sizes

with multiprocessing.Pool(processes=max_processors) as pool:
    # pool.map keeps the input order, so results line up with combinations.
    results_list = pool.map(run_single_combination, combinations)

# Split the (grid size, mean variance, error) tuples into plot series.
grid_sizes = [result[0] for result in results_list]
mean_vars = [result[1] for result in results_list]
errors = [result[2] for result in results_list]

# Keep the summary figure next to the normalized boxplots.
save_path = 'var_boxplot/normalized'
os.makedirs(save_path, exist_ok=True)

fig, ax = plt.subplots(figsize=(12, 8))
ax.errorbar(grid_sizes, mean_vars, yerr=errors, fmt='-o', capsize=5)
ax.set_xlabel('Grid Size (km)', fontsize=15)
ax.set_ylabel('Mean Variance', fontsize=15)
ax.set_title('Mean Variance of Order Counts vs Grid Size with 95% Confidence Intervals', fontsize=15)
ax.grid(True)
fig.tight_layout()
# Join with a path separator: plain concatenation produced
# 'var_boxplot/normalizedmean_variance_vs_grid_size.png'.
fig.savefig(os.path.join(save_path, 'mean_variance_vs_grid_size.png'))
plt.close(fig)

print("Processing complete. All plots have been saved.")



Processing grid size: 0.1 kmProcessing grid size: 0.6 kmProcessing grid size: 0.2 kmProcessing grid size: 0.3 kmProcessing grid size: 0.4 kmProcessing grid size: 0.5 kmProcessing grid size: 1.8 kmProcessing grid size: 2.3 km
Processing grid size: 0.7 kmProcessing grid size: 0.8 kmProcessing grid size: 0.9 kmProcessing grid size: 2.0 kmProcessing grid size: 1.0 km



Processing grid size: 1.3 km
Processing grid size: 3.8 kmProcessing grid size: 3.3 km
Processing grid size: 1.5 kmProcessing grid size: 4.3 kmProcessing grid size: 2.5 kmProcessing grid size: 4.8 kmProcessing grid size: 2.8 km
Processing grid size: 5.0 kmProcessing grid size: 3.5 km



Processing grid size: 3.0 km
Processing grid size: 4.0 kmProcessing grid size: 4.5 km












Processing complete. All plots have been saved.
