In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import folium
import warnings
warnings.filterwarnings('ignore')


In [2]:

# 1. 数据加载和基础清洗
print("Loading data...")
file_path = 'yellow_tripdata_2015-01.csv'
data = pd.read_csv(file_path, usecols=[
    'tpep_pickup_datetime', 'tpep_dropoff_datetime',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude'
])


Loading data...


In [3]:
# 转换时间格式
data['tpep_pickup_datetime'] = pd.to_datetime(data['tpep_pickup_datetime'])
data['tpep_dropoff_datetime'] = pd.to_datetime(data['tpep_dropoff_datetime'])


In [4]:
# 定义纽约市区范围
nyc_bounds = {
    'lon_min': -74.05, 'lon_max': -73.75,
    'lat_min': 40.63, 'lat_max': 40.85
}

In [5]:
# 过滤异常值
clean_data = data[
    (data['pickup_longitude'].between(nyc_bounds['lon_min'], nyc_bounds['lon_max'])) &
    (data['pickup_latitude'].between(nyc_bounds['lat_min'], nyc_bounds['lat_max'])) &
    (data['dropoff_longitude'].between(nyc_bounds['lon_min'], nyc_bounds['lon_max'])) &
    (data['dropoff_latitude'].between(nyc_bounds['lat_min'], nyc_bounds['lat_max']))
]

In [6]:

print(f"原始数据量: {len(data)}")
print(f"清洗后数据量: {len(clean_data)}")
print(f"数据清洗比例: {len(clean_data)/len(data)*100:.2f}%")

原始数据量: 12748986
清洗后数据量: 12346105
数据清洗比例: 96.84%


In [7]:
# 2. 数据可视化检查
# 采样1000个点进行可视化
sample_size = 1000
sampled_data = clean_data.sample(n=min(sample_size, len(clean_data)))


In [13]:
# 创建地图
center_lat = (nyc_bounds['lat_min'] + nyc_bounds['lat_max']) / 2
center_lon = (nyc_bounds['lon_min'] + nyc_bounds['lon_max']) / 2
m = folium.Map(location=[center_lat, center_lon], zoom_start=12)

In [14]:
# 添加采样点
for _, row in sampled_data.iterrows():
    # 上车点（蓝色）
    folium.CircleMarker(
        [row['pickup_latitude'], row['pickup_longitude']],
        radius=2,
        color='blue',
        fill=True,
        popup='Pickup'
    ).add_to(m)
    
    # 下车点（红色）
    folium.CircleMarker(
        [row['dropoff_latitude'], row['dropoff_longitude']],
        radius=2,
        color='red',
        fill=True,
        popup='Dropoff'
    ).add_to(m)

m.save('taxi_points_map.html')

In [15]:
# 3. 拆分上车点和下车点数据
# 创建上车点数据
pickup_data = clean_data[['tpep_pickup_datetime', 'pickup_longitude', 'pickup_latitude']].copy()
pickup_data['point_type'] = 1  # 1表示上车点
pickup_data.columns = ['datetime', 'longitude', 'latitude', 'point_type']

# 创建下车点数据
dropoff_data = clean_data[['tpep_dropoff_datetime', 'dropoff_longitude', 'dropoff_latitude']].copy()
dropoff_data['point_type'] = 0  # 0表示下车点
dropoff_data.columns = ['datetime', 'longitude', 'latitude', 'point_type']

# 合并数据并按时间排序
combined_data = pd.concat([pickup_data, dropoff_data], ignore_index=True)
combined_data.sort_values('datetime', inplace=True)

In [16]:
# 4. 创建网格数据
# 设置参数
grid_size = (50, 50)
time_slot_minutes = 30

# 添加时间片和星期几特征
combined_data['time_slot'] = combined_data['datetime'].dt.floor(f'{time_slot_minutes}T')
combined_data['day_of_week'] = combined_data['datetime'].dt.dayofweek
combined_data['hour'] = combined_data['datetime'].dt.hour

# 计算网格索引
combined_data['grid_x'] = ((combined_data['longitude'] - nyc_bounds['lon_min']) / 
                          (nyc_bounds['lon_max'] - nyc_bounds['lon_min']) * grid_size[0]).astype(int)
combined_data['grid_y'] = ((combined_data['latitude'] - nyc_bounds['lat_min']) / 
                          (nyc_bounds['lat_max'] - nyc_bounds['lat_min']) * grid_size[1]).astype(int)

# 确保网格索引在有效范围内
combined_data = combined_data[
    (combined_data['grid_x'] >= 0) & (combined_data['grid_x'] < grid_size[0]) &
    (combined_data['grid_y'] >= 0) & (combined_data['grid_y'] < grid_size[1])
]

In [17]:
# 5. 统计网格流量
grid_counts = combined_data.groupby(
    ['time_slot', 'grid_x', 'grid_y', 'point_type', 'day_of_week', 'hour']
).size().reset_index(name='flow')


In [18]:
# 6. 检查数据连续性
time_slots = sorted(grid_counts['time_slot'].unique())
time_diff = np.diff([pd.Timestamp(ts) for ts in time_slots])
print("\n时间片检查:")
print(f"总时间片数量: {len(time_slots)}")
print(f"时间片间隔: {time_diff[0]}")
print(f"是否所有时间片间隔相等: {all(td == time_diff[0] for td in time_diff)}")



时间片检查:
总时间片数量: 1540
时间片间隔: 0 days 00:30:00
是否所有时间片间隔相等: False


In [20]:
# 7. 创建用于预测的数据数组
print("\n创建预测数据数组...")
# 创建五维数组 (时间, 空间X, 空间Y, 通道数[pickup,dropoff], 特征[flow,day_of_week,hour])
volume_data = np.zeros((len(time_slots), grid_size[0], grid_size[1], 2, 3))

# 填充数据
for t_idx, time_slot in enumerate(time_slots):
    if t_idx % 100 == 0:  # 每处理100个时间片打印一次进度
        print(f"处理时间片: {t_idx}/{len(time_slots)}")
    time_data = grid_counts[grid_counts['time_slot'] == time_slot]
    
    for _, row in time_data.iterrows():
        # 流量数据
        volume_data[t_idx, row['grid_x'], row['grid_y'], row['point_type'], 0] = row['flow']
        # 时间特征
        volume_data[t_idx, row['grid_x'], row['grid_y'], row['point_type'], 1] = row['day_of_week']
        volume_data[t_idx, row['grid_x'], row['grid_y'], row['point_type'], 2] = row['hour']


创建预测数据数组...
处理时间片: 0/1540
处理时间片: 100/1540
处理时间片: 200/1540
处理时间片: 300/1540
处理时间片: 400/1540
处理时间片: 500/1540
处理时间片: 600/1540
处理时间片: 700/1540
处理时间片: 800/1540
处理时间片: 900/1540
处理时间片: 1000/1540
处理时间片: 1100/1540
处理时间片: 1200/1540
处理时间片: 1300/1540
处理时间片: 1400/1540
处理时间片: 1500/1540


In [21]:
# 8. 数据统计和可视化
print("\n数据统计:")
print(f"数据形状: {volume_data.shape}")
print(f"时间范围: {time_slots[0]} 到 {time_slots[-1]}")
print(f"平均上车流量: {volume_data[:,:,:,1,0].mean():.2f}")
print(f"平均下车流量: {volume_data[:,:,:,0,0].mean():.2f}")

# 绘制某个时间片的流量分布
sample_time_idx = len(time_slots) // 2  # 取中间时间片
plt.figure(figsize=(15, 5))

# 上车点热力图
plt.subplot(121)
sns.heatmap(volume_data[sample_time_idx, :, :, 1, 0].T, 
            cmap='YlOrRd', 
            cbar_kws={'label': 'Pickup Flow'})
plt.title(f'Pickup Flow at {time_slots[sample_time_idx]}')

# 下车点热力图
plt.subplot(122)
sns.heatmap(volume_data[sample_time_idx, :, :, 0, 0].T, 
            cmap='YlOrRd', 
            cbar_kws={'label': 'Dropoff Flow'})
plt.title(f'Dropoff Flow at {time_slots[sample_time_idx]}')

plt.tight_layout()
plt.savefig('flow_distribution.png')
plt.close()


数据统计:
数据形状: (1540, 50, 50, 2, 3)
时间范围: 2015-01-01 00:00:00 到 2015-03-04 17:30:00
平均上车流量: 3.21
平均下车流量: 3.21


In [22]:
# 9. 划分训练集和测试集
split_idx = int(len(time_slots) * 0.7)
volume_train = volume_data[:split_idx]
volume_test = volume_data[split_idx:]
# 保存数据
np.savez('volume_train.npz', 
         volume=volume_train, 
         time_slots=time_slots[:split_idx])
np.savez('volume_test.npz', 
         volume=volume_test,
         time_slots=time_slots[split_idx:])

print("\n数据集划分:")
print(f"训练集形状: {volume_train.shape}")
print(f"测试集形状: {volume_test.shape}")


数据集划分:
训练集形状: (1078, 50, 50, 2, 3)
测试集形状: (462, 50, 50, 2, 3)
