In [1]:
import pandas as pd
from difflib import SequenceMatcher

In [2]:
# Read raw files
file_path_traffic = r"datasets\dft_traffic_counts_raw_counts.csv"
df_traffic = pd.read_csv(file_path_traffic, low_memory=False)

file_path_eco = r"datasets\Regional gross domestic product(all ITL).xlsx"
df_eco = pd.read_excel(
    file_path_eco, sheet_name="Table 5", header=1
)  # Table 5: Gross domestic product (GDP) at current market prices, pounds million

In [4]:
df_traffic.columns

Index(['count_point_id', 'direction_of_travel', 'year', 'count_date', 'hour',
       'region_id', 'region_name', 'region_ons_code', 'local_authority_id',
       'local_authority_name', 'local_authority_code', 'road_name',
       'road_category', 'road_type', 'start_junction_road_name',
       'end_junction_road_name', 'easting', 'northing', 'latitude',
       'longitude', 'link_length_km', 'link_length_miles', 'pedal_cycles',
       'two_wheeled_motor_vehicles', 'cars_and_taxis', 'buses_and_coaches',
       'LGVs', 'HGVs_2_rigid_axle', 'HGVs_3_rigid_axle',
       'HGVs_4_or_more_rigid_axle', 'HGVs_3_or_4_articulated_axle',
       'HGVs_5_articulated_axle', 'HGVs_6_articulated_axle', 'all_HGVs',
       'all_motor_vehicles'],
      dtype='object')

In [4]:
df_traffic.shape, df_eco.shape

((3174678, 35), (236, 28))

In [5]:
# Get unique city names from traffic and economic data
unique_authorities = df_traffic["local_authority_name"].dropna().unique()
region_names = df_eco["Region name"].dropna().unique()

# Set operations to find common and unique values
set_traffic = set(unique_authorities)
set_eco = set(region_names)

common = set_traffic.intersection(set_eco)
only_traffic = set_traffic.difference(set_eco)
only_eco = set_eco.difference(set_traffic)

print("共有的地区:", len(common))
print("只有交通数据的地区:", len(only_traffic))
print("只有经济数据的地区:", len(only_eco))

共有的地区: 83
只有交通数据的地区: 130
只有经济数据的地区: 150


In [6]:
# Output unique city names
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

# Set similarity threshold
threshold = 0.8

# Use list comprehension to find similar name pairs
similar_pairs = [
    (name_traffic, name_eco, similar(name_traffic, name_eco))
    for name_traffic in only_traffic
    for name_eco in only_eco
    if similar(name_traffic, name_eco) >= threshold
]

# Output similar but not exactly matched name pairs
print("Name pairs that are similar but not identified as identical (similarity >= 0.8):")
for name_traffic, name_eco, ratio in similar_pairs:
    print(f"traffic set: '{name_traffic}'  <-->  economic set: '{name_eco}'  similarity: {ratio:.2f}")


Name pairs that are similar but not identified as identical (similarity >= 0.8):
traffic set: 'Shropshire'  <-->  economic set: 'Shropshire CC'  similarity: 0.87
traffic set: 'Derbyshire'  <-->  economic set: 'East Derbyshire'  similarity: 0.80
traffic set: 'West Sussex'  <-->  economic set: 'West Essex'  similarity: 0.86
traffic set: 'Dumfries & Galloway'  <-->  economic set: 'Dumfries and Galloway'  similarity: 0.90
traffic set: 'Cornwall excluding Isles of Scilly'  <-->  economic set: 'Cornwall and Isles of Scilly'  similarity: 0.84
traffic set: 'Worcestershire'  <-->  economic set: 'Worcestershire CC'  similarity: 0.90
traffic set: 'North East Lincolnshire'  <-->  economic set: 'North and North East Lincolnshire'  similarity: 0.82
traffic set: 'Cambridgeshire'  <-->  economic set: 'Cambridgeshire CC'  similarity: 0.90
traffic set: 'South Tyneside'  <-->  economic set: 'South Teesside'  similarity: 0.86
traffic set: 'Warwickshire'  <-->  economic set: 'Warwickshire CC'  similarity: 

In [7]:
print(len(similar_pairs))

23


In [8]:
# Get the matching results
common_traffic = common.union({pair[0] for pair in similar_pairs})
common_eco = common.union({pair[1] for pair in similar_pairs})

# Extract corresponding data and form new tables
df_traffic_common = df_traffic[df_traffic["local_authority_name"].isin(common_traffic)]
df_eco_common = df_eco[df_eco["Region name"].isin(common_eco)]

print("\nTraffic new table row count:", df_traffic_common.shape[0])
print("Economic new table row count:", df_eco_common.shape[0])
print(f"Datasets with same city names created.")



Traffic new table row count: 2074248
Economic new table row count: 107
Datasets with same city names created.


In [11]:
# Save the new tables as CSV files
df_traffic_common.to_csv("Traffic_common_data.csv", index=False)
df_eco_common.to_csv("Economic_common_data.csv", index=False)


In [12]:
# check matching result
file_path_traffic = r"Traffic_common_data.csv"
df_traffic_common = pd.read_csv(file_path_traffic, low_memory=False)
file_path_eco = r"Economic_common_data.csv"
df_eco_common = pd.read_csv(file_path_eco, low_memory=False)


In [14]:
df_traffic_common.shape, df_eco_common.shape

((2074248, 35), (107, 28))

In [17]:
# Get unique city names from traffic and economic data
unique_authorities_new = df_traffic_common["local_authority_name"].dropna().unique()
region_names_new = df_eco_common["Region name"].dropna().unique()

# Set operations to find common and unique values
set_traffic_new = set(unique_authorities_new)
set_eco_new = set(region_names_new)

common_new = set_traffic_new.intersection(set_eco_new)
only_traffic_new = set_traffic_new.difference(set_eco_new)
only_eco_new = set_eco_new.difference(set_traffic_new)

print("共有的地区:", len(common_new))
print("只有交通数据的地区:", len(only_traffic_new))
print("只有经济数据的地区:", len(only_eco_new))

共有的地区: 83
只有交通数据的地区: 22
只有经济数据的地区: 23


In [1]:
import pandas as pd
import folium
from folium.plugins import HeatMap

def plot_heatmap(df, value_column="all_motor_vehicles", zoom_start=7):
    """
    绘制基于经纬度的热力图，并支持不同指标的可视化。
    
    参数:
        df (pd.DataFrame): 需要包含 'latitude', 'longitude' 和指标列（默认 'all_motor_vehicless'）。
        value_column (str): 选择用于热力图的列（默认为 'all_motor_vehicles'）。
        zoom_start (int): 地图初始缩放级别（默认 7）。
        
    返回:
        folium.Map 对象（可视化地图）。
    """
    # 确保数据包含必要列
    if not {"latitude", "longitude", value_column}.issubset(df.columns):
        raise ValueError("数据集必须包含 'latitude', 'longitude' 和 '{}' 列".format(value_column))

    # 清理数据，去除缺失值和错误坐标
    df_clean = df.dropna(subset=["latitude", "longitude", value_column])
    
    # 确保数据类型正确
    df_clean = df_clean.astype({"latitude": float, "longitude": float, value_column: float})

    # 过滤掉明显错误的经纬度值
    df_clean = df_clean[
        (df_clean["latitude"].between(-90, 90)) &
        (df_clean["longitude"].between(-180, 180))
    ]

    # 计算权重（归一化，避免极值影响）
    max_value = df_clean[value_column].max()
    df_clean["weight"] = df_clean[value_column] / max_value if max_value > 0 else 1

    # 生成地图（以数据中心点为中心）
    map_center = [df_clean["latitude"].mean(), df_clean["longitude"].mean()]
    m = folium.Map(location=map_center, zoom_start=zoom_start)

    # 添加热力图
    heat_data = df_clean[["latitude", "longitude", "weight"]].values.tolist()
    HeatMap(heat_data, radius=10, blur=15, max_zoom=1).add_to(m)

    return m


In [3]:
# 读取数据
df_traffic = pd.read_csv("Traffic_common_data_cleaned.csv")

# 绘制热力图（默认 all_HGVs）
heatmap = plot_heatmap(df_traffic, value_column="all_HGVs")

# 保存 folium 地图为 HTML 文件
heatmap.save("heatmap.html")



In [16]:
## deperecated after this

In [9]:
print(*list(only_traffic),sep='\n')

Ceredigion
Comhairle nan Eilean Siar
Shropshire
Torfaen
Bolton
Monmouthshire
Derbyshire
Cardiff
Leicestershire
Hartlepool
Bury
Neath Port Talbot
Inverclyde
Pembrokeshire
Merton
East Dunbartonshire
Barnsley
Greenwich
North Ayrshire
Cumberland
West Sussex
Dumfries & Galloway
Salford
Norfolk
Kingston upon Thames
Newham
Hammersmith and Fulham
Richmond upon Thames
Flintshire
Wigan
South Gloucestershire
Rutland
Calderdale
West Cheshire
Argyll & Bute
Harrow
Havering
Kirklees
Westmorland and Furness
Islington
Doncaster
Stockport
Aberdeen City
Merthyr Tydfil
Oldham
North Somerset
Lewisham
Cornwall excluding Isles of Scilly
Windsor and Maidenhead
Moray
Middlesbrough
Poole
East Ayrshire
Bracknell Forest
Hillingdon
Worcestershire
Tameside
Carmarthenshire
Newport
North East Lincolnshire
Conwy
Southwark
North Tyneside
Hackney
Dundee City
Sutton
Newcastle upon Tyne
Cambridgeshire
Bath and North East Somerset
Barking and Dagenham
Denbighshire
Hampshire
Bridgend
Blaenau Gwent
Isles of Scilly
South Tyne

In [10]:
print(*list(only_eco),sep='\n')

Breckland and South Norfolk
East Lothian and Midlothian
Merseyside
Hackney and Newham
Chorley and West Lancashire
Redbridge and Waltham Forest
Cheshire West and Chester
North East
South Teesside
Surrey, East and West Sussex
Gloucestershire CC
East Surrey
East Dunbartonshire, West Dunbartonshire, and Helensburgh and Lomond
Central Hampshire
Cornwall and Isles of Scilly
Tyneside
Extra-Regio
Na h-Eileanan Siar
East Ayrshire and North Ayrshire mainland
Inverness and Nairn, Moray, Badenoch and Strathspey
Kent Thames Gateway
Somerset CC
Leicestershire CC and Rutland
Durham CC
Gloucestershire, Wiltshire and Bath/Bristol Area
East Sussex CC
Perth and Kinross, and Stirling
Bexley and Greenwich
East Wales
Conwy and Denbighshire
Merton, Kingston upon Thames and Sutton
Lancaster and Wyre
South West Wales
South West
Southern Scotland
Greater Manchester South West
South Yorkshire
Leicestershire, Rutland and Northamptonshire
West Wales and The Valleys
Gwent Valleys
Scotland
Buckinghamshire CC
Aberdee

In [None]:
from rapidfuzz import process

# 预处理城市名称（去除空格，转换小写）
only_traffic_cleaned = [name.strip().lower() for name in only_traffic]
only_eco_cleaned = [name.strip().lower() for name in only_eco]

# 进行模糊匹配
city_mapping = {}

for name_traffic in only_traffic_cleaned:
    result = process.extractOne(name_traffic, only_eco_cleaned, score_cutoff=80)
    if result:
        match, score, _ = result
        city_mapping[name_traffic] = match

# 替换 DataFrame 里的城市名称（确保大小写一致）
df_traffic["local_authority_name"] = df_traffic["local_authority_name"].str.lower().replace(city_mapping)

# 统计匹配结果
matched_cities = set(df_traffic["local_authority_name"].unique()) & set(df_eco["Region name"].str.lower().unique())
print(f"成功匹配的城市数量: {len(matched_cities)}")



成功匹配的城市数量: 144


In [None]:
from prettytable import PrettyTable

table = PrettyTable(["Traffic Name", "Matched Economic Name"])
for traffic, eco in city_mapping.items():
    table.add_row([traffic, eco])

print(table.get_string(max_width=80))  # 限制最大宽度，防止超出屏幕


+------------------------------------+------------------------------------------------------------------------+
|            Traffic Name            |                         Matched Economic Name                          |
+------------------------------------+------------------------------------------------------------------------+
|             cumberland             |                   northumberland, and tyne and wear                    |
|      westmorland and furness       |                east yorkshire and northern lincolnshire                |
|           staffordshire            |                            staffordshire cc                            |
|       the vale of glamorgan        |                             heart of essex                             |
|           aberdeen city            |                    aberdeen city and aberdeenshire                     |
|              rutland               |                     leicestershire cc and rutland                

In [None]:
# 获取匹配后的城市列表
traffic_cities_after = set(df_traffic["local_authority_name"].unique())
eco_cities_after = set(df_eco["Region name"].unique())

# 仍然未匹配的城市
unmatched_traffic = traffic_cities_after - eco_cities_after
unmatched_eco = eco_cities_after - traffic_cities_after

print(f"未匹配的交通数据城市{len(unmatched_traffic)}个:", unmatched_traffic)
print(f"未匹配的经济数据城市{len(unmatched_eco)}个:", unmatched_eco)



未匹配的交通数据城市173个: {'wiltshire', 'inverclyde, east renfrewshire, and renfrewshire', 'trafford', 'lambeth', 'tower hamlets', 'wolverhampton', 'hounslow and richmond upon thames', 'barnet', 'swindon', 'blaenau gwent', 'ceredigion', 'leicester', 'somerset cc', 'west lothian', 'bournemouth, christchurch and poole', 'peterborough', 'wokingham', 'leicestershire cc and rutland', 'warwickshire cc', 'west sussex (south west)', 'kent', 'suffolk cc', 'st. helens', 'leicestershire, rutland and northamptonshire', 'cheshire', 'lewisham and southwark', 'cheshire east', 'torbay', 'telford and wrekin', 'rochdale', 'blackburn with darwen', 'swansea', 'east ayrshire and north ayrshire mainland', 'luton', 'tameside', 'highlands and islands', 'torfaen', 'caerphilly', 'shetland islands', 'westminster', 'sefton', 'bolton', 'stockport', 'west northamptonshire', 'bedfordshire and hertfordshire', 'cardiff and vale of glamorgan', 'powys', 'sandwell', 'norwich and east norfolk', 'thurrock', 'wandsworth', 'central ha