## create month, day, density columns

In [1]:
import pandas as pd

# 读取 CSV 文件
file_path = r'CSV_common_data.csv'
traffic = pd.read_csv(file_path, low_memory=False)

# 尝试自动解析日期格式
traffic['count_date'] = pd.to_datetime(traffic['count_date'], errors='coerce')

# 提取月份和日期
traffic['month'] = traffic['count_date'].dt.month
traffic['day'] = traffic['count_date'].dt.day

# 将新列放在 count_date 列后面
cols = ['count_date', 'month', 'day'] + [col for col in traffic.columns if col not in ['count_date', 'month', 'day']]
traffic = traffic[cols]

# 查看结果
traffic.head()


Unnamed: 0,count_date,month,day,count_point_id,direction_of_travel,year,hour,region_id,region_name,region_ons_code,...,buses_and_coaches,LGVs,HGVs_2_rigid_axle,HGVs_3_rigid_axle,HGVs_4_or_more_rigid_axle,HGVs_3_or_4_articulated_axle,HGVs_5_articulated_axle,HGVs_6_articulated_axle,all_HGVs,all_motor_vehicles
0,2014-05-19,5,19,60,N,2014,7,2,East Midlands,E12000004,...,4.0,72.0,8.0,2.0,5.0,3.0,2.0,5.0,25.0,704.0
1,2014-05-19,5,19,60,N,2014,8,2,East Midlands,E12000004,...,8.0,94.0,13.0,0.0,6.0,5.0,2.0,3.0,29.0,716.0
2,2014-05-19,5,19,60,N,2014,9,2,East Midlands,E12000004,...,3.0,83.0,17.0,6.0,2.0,2.0,6.0,1.0,34.0,444.0
3,2014-05-19,5,19,60,N,2014,10,2,East Midlands,E12000004,...,3.0,57.0,20.0,3.0,7.0,7.0,4.0,6.0,47.0,408.0
4,2014-05-19,5,19,60,N,2014,11,2,East Midlands,E12000004,...,4.0,71.0,13.0,1.0,5.0,3.0,1.0,1.0,24.0,402.0


In [2]:
# 计算不符合日期格式的行数
invalid_dates_count = traffic['count_date'].isna().sum()

print(f"有 {invalid_dates_count} 行日期格式不符合预期。")

有 0 行日期格式不符合预期。


In [3]:
# 获取从 W 到 AI 的列名（对应序号 22 到 35）
columns_to_process = traffic.columns[22:36]

# 创建新列并计算密度
for col in columns_to_process:
    new_col_name = col + '_density'
    traffic[new_col_name] = traffic[col] / traffic['link_length_km']

# 显示处理后的 DataFrame
traffic.head()


Unnamed: 0,count_date,month,day,count_point_id,direction_of_travel,year,hour,region_id,region_name,region_ons_code,...,cars_and_taxis_density,buses_and_coaches_density,LGVs_density,HGVs_2_rigid_axle_density,HGVs_3_rigid_axle_density,HGVs_4_or_more_rigid_axle_density,HGVs_3_or_4_articulated_axle_density,HGVs_5_articulated_axle_density,HGVs_6_articulated_axle_density,all_HGVs_density
0,2014-05-19,5,19,60,N,2014,7,2,East Midlands,E12000004,...,1490.0,10.0,180.0,20.0,5.0,12.5,7.5,5.0,12.5,62.5
1,2014-05-19,5,19,60,N,2014,8,2,East Midlands,E12000004,...,1450.0,20.0,235.0,32.5,0.0,15.0,12.5,5.0,7.5,72.5
2,2014-05-19,5,19,60,N,2014,9,2,East Midlands,E12000004,...,802.5,7.5,207.5,42.5,15.0,5.0,5.0,15.0,2.5,85.0
3,2014-05-19,5,19,60,N,2014,10,2,East Midlands,E12000004,...,750.0,7.5,142.5,50.0,7.5,17.5,17.5,10.0,15.0,117.5
4,2014-05-19,5,19,60,N,2014,11,2,East Midlands,E12000004,...,752.5,10.0,177.5,32.5,2.5,12.5,7.5,2.5,2.5,60.0


In [4]:
# 保存修改后的 DataFrame 到 CSV 文件
traffic.to_csv(r'CSV_common_data.csv', index=False)


In [2]:
import pandas as pd
file_path = r'CSV_common_data.csv'
traffic = pd.read_csv(file_path, low_memory=False)

nan_count = traffic.isna().sum().sum()
print("NaN 总数:", nan_count)

NaN 总数: 11125178


In [3]:
nan_per_column = traffic.isna().sum()
print(nan_per_column)


count_date                                   0
month                                        0
day                                          0
count_point_id                               0
direction_of_travel                          0
year                                         0
hour                                         0
region_id                                    0
region_name                                  0
region_ons_code                              0
local_authority_id                           0
local_authority_name                         0
local_authority_code                         0
road_name                                    0
road_category                                0
road_type                                    0
start_junction_road_name                619380
end_junction_road_name                  619308
easting                                      0
northing                                     0
latitude                                     0
longitude    

In [None]:
# 用均值填充nan
traffic_filled = traffic.fillna(traffic.mean(numeric_only=True))

In [None]:
nan_per_column = traffic_filled.isna().sum()
print(nan_per_column)

count_date                                   0
month                                        0
day                                          0
count_point_id                               0
direction_of_travel                          0
year                                         0
hour                                         0
region_id                                    0
region_name                                  0
region_ons_code                              0
local_authority_id                           0
local_authority_name                         0
local_authority_code                         0
road_name                                    0
road_category                                0
road_type                                    0
start_junction_road_name                619380
end_junction_road_name                  619308
easting                                      0
northing                                     0
latitude                                     0
longitude    

In [7]:
# 保存
traffic_filled.to_csv(r'CSV_common_data_filled.csv', index=False)