In [7]:
import pandas as pd
import numpy as np

# Read the user, order and delivery tables from their CSV files.
user_data, order_data, delivery_data = map(
    pd.read_csv,
    ('JD_user_data.csv', 'JD_order_data.csv', 'JD_delivery_data.csv'),
)

# Flag rows whose `type` equals 1 as first-party (1P); every other value
# is treated as 3P by the grouping code further down.
delivery_data['is_1P'] = delivery_data['type'].eq(1)

# Join deliveries to their orders, then to the ordering user.
# Both joins use the default (inner) merge.
merged_data = (
    delivery_data
    .merge(order_data, on='order_ID')
    .merge(user_data, on='user_ID')
)

# A delivery is "local" when origin and destination distribution
# centres are the same.
merged_data['is_local'] = merged_data['dc_ori'].eq(merged_data['dc_des'])


# Tolerant timestamp parser for the arrival-time columns.
def clean_time(time_str):
    """Parse a slash-separated date or date-time string into a Timestamp.

    Recognised layouts, tried in this order:

    * ``YYYY/MM/DD HH:MM``     (e.g. "2018/3/1 08:00")
    * ``YYYY/MM/DD HH:MM:SS``
    * ``YYYY/MM/DD``           (date only)

    The two date-time layouts are only attempted when the string contains
    a space.

    Args:
        time_str: Raw cell value. Anything that is not a ``str`` (NaN,
            None, numbers, ...) is treated as missing.

    Returns:
        A ``pd.Timestamp`` when one of the layouts matches, otherwise
        ``pd.NaT``.
    """
    # Missing values and non-string inputs both map to NaT; checking the
    # type first also avoids calling pd.isna() on non-scalar objects.
    if not isinstance(time_str, str):
        return pd.NaT

    candidate_formats = ['%Y/%m/%d']
    if ' ' in time_str:
        candidate_formats = [
            '%Y/%m/%d %H:%M',
            '%Y/%m/%d %H:%M:%S',
        ] + candidate_formats

    for fmt in candidate_formats:
        try:
            return pd.to_datetime(time_str, format=fmt)
        except (ValueError, TypeError, OverflowError):
            # Only parsing failures are swallowed; anything else (e.g.
            # KeyboardInterrupt) propagates instead of being hidden.
            continue

    return pd.NaT

# Combine the separate order date and order time columns.
def clean_order_time(date_str, time_str):
    """Build an order timestamp from a date string and a clock-time string.

    The date must match ``YYYY/MM/DD``. The time is split on ``:`` and may
    have two fields (``H:M``) or three (``H:M:S``); the minute and second
    fields may carry a fractional part such as ``"41.0"``, which is
    truncated. Each field is clamped into its valid range (hours 0-23,
    minutes and seconds 0-59).

    Args:
        date_str: Order date, e.g. "2018/3/1".
        time_str: Order time, e.g. "08:30" or "50:41.0". Non-string or
            unparseable values are ignored.

    Returns:
        ``pd.NaT`` when the date is missing or unparseable; the date at
        midnight when only the time is unusable; otherwise date + time.
    """
    try:
        date_part = pd.to_datetime(date_str, format='%Y/%m/%d')
    except (ValueError, TypeError, OverflowError):
        return pd.NaT
    # pd.to_datetime passes None through unchanged and maps NaN to NaT.
    if date_part is None or date_part is pd.NaT:
        return pd.NaT

    if not isinstance(time_str, str):
        return date_part  # no usable time: keep the date only

    time_parts = time_str.split(':')
    if len(time_parts) not in (2, 3):
        return date_part

    # The time is parsed separately from the date so that a malformed
    # time falls back to the date instead of discarding the whole value.
    try:
        hours = int(time_parts[0])
        minutes = int(float(time_parts[1]))
        seconds = int(float(time_parts[2])) if len(time_parts) == 3 else 0
    except (ValueError, OverflowError):
        return date_part

    # NOTE(review): a leading field above 23 (as in "50:41.0") may really
    # be minutes:seconds with the hour lost upstream; clamping to 23 is
    # kept for backward compatibility -- confirm against the raw data.
    hours = max(0, min(hours, 23))
    minutes = max(0, min(minutes, 59))
    seconds = max(0, min(seconds, 59))

    return date_part + pd.Timedelta(hours=hours, minutes=minutes, seconds=seconds)

# Parse the two arrival timestamp columns with the tolerant parser.
for _time_col in ('arr_time', 'arr_station_time'):
    merged_data[_time_col] = merged_data[_time_col].apply(clean_time)

# Build one order timestamp per row from the separate date/time columns.
merged_data['order_datetime'] = merged_data.apply(
    lambda row: clean_order_time(row['order_date'], row['order_time']),
    axis=1,
)

# Promise is multiplied by 24 to express it in hours; values that are not
# numeric become NaN.
merged_data['promise'] = pd.to_numeric(merged_data['promise'], errors='coerce') * 24

_has_arrival = merged_data['arr_time'].notna()

# Order-to-door duration in hours, filled only where both ends are known.
valid_time_mask = merged_data['order_datetime'].notna() & _has_arrival
merged_data['actual_delivery_time'] = np.nan
_order_to_door = (
    merged_data.loc[valid_time_mask, 'arr_time']
    - merged_data.loc[valid_time_mask, 'order_datetime']
)
merged_data.loc[valid_time_mask, 'actual_delivery_time'] = (
    _order_to_door.dt.total_seconds() / 3600
)

# Station-to-customer duration in hours, filled only where both ends are known.
station_valid_mask = merged_data['arr_station_time'].notna() & _has_arrival
merged_data['station_to_customer_time'] = np.nan
_station_to_door = (
    merged_data.loc[station_valid_mask, 'arr_time']
    - merged_data.loc[station_valid_mask, 'arr_station_time']
)
merged_data.loc[station_valid_mask, 'station_to_customer_time'] = (
    _station_to_door.dt.total_seconds() / 3600
)

# Summarise delivery performance for every combination of PLUS status,
# local/non-local delivery and 1P/3P order type.
results = {}
time_metrics = ['actual_delivery_time', 'station_to_customer_time']

_group_keys = [
    (plus_flag, local_flag, first_party_flag)
    for plus_flag in (0, 1)
    for local_flag in (True, False)
    for first_party_flag in (True, False)
]

for _key in _group_keys:
    _plus_flag, _local_flag, _first_party_flag = _key
    _in_group = (
        merged_data['plus'].eq(_plus_flag)
        & merged_data['is_local'].eq(_local_flag)
        & merged_data['is_1P'].eq(_first_party_flag)
    )
    _group_rows = merged_data[_in_group]

    # Mean duration and row count per metric, over rows where it is known.
    _per_metric = {}
    for _column in time_metrics:
        _observed = _group_rows[_column].dropna()
        if _observed.empty:
            _per_metric[_column] = {'avg_time': 0, 'valid_orders': 0}
        else:
            _per_metric[_column] = {
                'avg_time': _observed.mean(),
                'valid_orders': len(_observed),
            }

    # Late share (percent): rows whose order-to-door time exceeds the
    # promise, among rows where both values are present.
    _comparable = _group_rows.dropna(subset=['actual_delivery_time', 'promise'])
    _n_comparable = len(_comparable)
    _n_late = int(
        (_comparable['actual_delivery_time'] > _comparable['promise']).sum()
    )
    if _n_comparable > 0:
        _late_share = (_n_late / _n_comparable) * 100
    else:
        _late_share = 0

    results[_key] = {
        'metrics': _per_metric,
        'late_order_ratio': _late_share,
        'valid_orders': _n_comparable,
    }

# Console report.
def print_results(plus_status, title):
    """Print the delivery summary for one PLUS-membership group.

    Reads the module-level ``results`` dict and prints, for each of the
    four (local/non-local x 1P/3P) combinations, the valid order count,
    the late-order percentage and the two average durations in hours.

    Args:
        plus_status: PLUS flag used as the first element of the
            ``results`` key (0 or 1).
        title: Heading text printed before the report.
    """
    rule = "-" * 60
    scope_labels = ((True, "同城配送"), (False, "非同城配送"))
    seller_labels = ((True, "1P订单"), (False, "3P订单"))

    print(f"\n\n{title}配送表现对比：")
    print(rule)

    for is_local, scope_name in scope_labels:
        for is_first_party, seller_name in seller_labels:
            entry = results[(plus_status, is_local, is_first_party)]
            averages = entry['metrics']
            door_avg = averages['actual_delivery_time']['avg_time']
            last_leg_avg = averages['station_to_customer_time']['avg_time']

            print(f"\n{scope_name} - {seller_name}:")
            print(f"  总有效订单数: {entry['valid_orders']}")
            print(f"  整体配送超时比例: {entry['late_order_ratio']:.2f}%")

            print("\n  平均配送时间:")
            print(f"    下单到送达: {door_avg:.2f} 小时")
            print(f"    站点到客户: {last_leg_avg:.2f} 小时")
            print(rule)

# Report PLUS members first, then non-members.
for _plus_flag, _heading in ((1, "PLUS会员"), (0, "非PLUS会员")):
    print_results(_plus_flag, _heading)



PLUS会员配送表现对比：
------------------------------------------------------------

同城配送 - 1P订单:
  总有效订单数: 45473
  整体配送超时比例: 21.01%

  平均配送时间:
    下单到送达: 19.74 小时
    站点到客户: 6.07 小时
------------------------------------------------------------

同城配送 - 3P订单:
  总有效订单数: 3384
  整体配送超时比例: 16.84%

  平均配送时间:
    下单到送达: 49.18 小时
    站点到客户: 6.61 小时
------------------------------------------------------------

非同城配送 - 1P订单:
  总有效订单数: 23947
  整体配送超时比例: 12.16%

  平均配送时间:
    下单到送达: 35.36 小时
    站点到客户: 6.18 小时
------------------------------------------------------------

非同城配送 - 3P订单:
  总有效订单数: 3346
  整体配送超时比例: 13.93%

  平均配送时间:
    下单到送达: 57.90 小时
    站点到客户: 6.59 小时
------------------------------------------------------------


非PLUS会员配送表现对比：
------------------------------------------------------------

同城配送 - 1P订单:
  总有效订单数: 131878
  整体配送超时比例: 20.01%

  平均配送时间:
    下单到送达: 20.64 小时
    站点到客户: 6.63 小时
------------------------------------------------------------

同城配送 - 3P订单:
  总有效订单数: 18937
  整体配送超时比例: 19