In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Load the raw 2022 observation records.
file_path = '/Users/hansen/Desktop/MATH3836proj/香港物种数据/2022.csv'
df = pd.read_csv(file_path)

# Stringify the geometry column so its values can be compared and used as a
# grouping key.
df['geometry_str'] = df['geometry'].astype(str)

# Derive the month number from the date column.
# First choice: let pandas parse the whole column as datetimes.
try:
    df['date'] = pd.to_datetime(df['date'])
    df['month'] = df['date'].dt.month
except (ValueError, TypeError, OverflowError, AttributeError):
    # Parsing failed. Only parsing-related errors are caught, so an interrupt
    # or an unrelated bug is no longer silently swallowed.
    month = None
    if df['date'].dtype == object:
        # Text fallbacks, tried in order:
        #   'a/b/c' -> first field   (assumes month-first dates — TODO confirm)
        #   'a-b-c' -> second field  (assumes year-month-day — TODO confirm)
        for separator, position in (('/', 0), ('-', 1)):
            try:
                month = df['date'].str.split(separator).str[position].astype(int)
                break
            except (ValueError, TypeError, AttributeError):
                continue
    if month is None:
        print("无法从date列提取月份，请检查date列的格式")
    # Always create the column, even when nothing could be extracted, so later
    # code that references 'month' does not fail with a KeyError.
    df['month'] = month

# Bucket the records by species, exact geometry string and month.
group_keys = ['scientific', 'geometry_str', 'month']
grouped = df.groupby(group_keys)

# Per bucket: the mean gno and the list of OBJECTIDs that fell into it.
aggregations = {
    'gno': 'mean',
    'OBJECTID': lambda ids: list(ids),
}
group_stats = grouped.agg(aggregations).reset_index()

# Number of OBJECTIDs collected in each bucket.
group_stats['count'] = group_stats['OBJECTID'].apply(len)

# Order by species name, then by ascending month.
group_stats = group_stats.sort_values(['scientific', 'month'])

# Print the first 20 rows.
preview_columns = ['scientific', 'month', 'gno', 'count', 'OBJECTID']
print("相同scientific、geometry和月份下OBJECTID的gno平均值（按scientific字母顺序和月份排序，前20行）:")
print(group_stats.head(20)[preview_columns])

# Write the full result to CSV.
export_columns = ['scientific', 'month', 'geometry_str', 'gno', 'count', 'OBJECTID']
group_stats[export_columns].to_csv(
    '/Users/hansen/Desktop/MATH3836proj/香港物种数据/gno_avg_by_species_location_month_sorted_2022.csv',
    index=False,
)

# Compact view of the same 20 rows: rounded mean and at most five sample
# OBJECTIDs, with '...' appended when the bucket holds more than five.
readable_results = [
    {
        'scientific': row['scientific'],
        'month': row['month'],
        'gno_avg': round(row['gno'], 2),
        'count': row['count'],
        'sample_objectids': str(row['OBJECTID'][:5]) + ('...' if row['count'] > 5 else ''),
    }
    for _, row in group_stats.head(20).iterrows()
]

readable_df = pd.DataFrame(readable_results)
print("\n更易读的结果格式（前20行）:")
print(readable_df)


In [5]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import ast
import re
import sys

# Load the grouped data file.
file_path = '/Users/hansen/Desktop/MATH3836proj/香港物种数据/整理后2022.csv'
df = pd.read_csv(file_path)

# Keep only real records: drop rows whose OBJECTID is missing or whose
# OBJECTID text contains '组' or '---'.
objectid_text = df['OBJECTID'].astype(str)
is_separator = objectid_text.str.contains('组|---', na=False)
is_missing = df['OBJECTID'].isna()
valid_rows = df[~(is_separator | is_missing)].copy()

# Restrict to April (month == 4).
april_data = valid_rows.loc[valid_rows['month'] == 4].copy()

# With no April rows there is nothing to plot: show which months exist and stop.
if april_data.shape[0] == 0:
    print("警告: 没有找到4月份的数据！")
    print("数据中的月份分布:")
    print(valid_rows['month'].value_counts())
    sys.exit(1)

# Extract coordinates from a geometry_str string.
_COORD_NUMBER = r'-?\d+(?:\.\d+)?'
# First "[number, number" pair in a text; accepts negative and integer values.
_COORD_PAIR_RE = re.compile(
    r'\[\s*(' + _COORD_NUMBER + r')\s*,\s*(' + _COORD_NUMBER + r')'
)


def _is_number(value):
    """Return True for int/float values; bool is rejected."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _is_point(value):
    """Return True when value is a list/tuple whose first two items are numbers."""
    return (
        isinstance(value, (list, tuple))
        and len(value) >= 2
        and _is_number(value[0])
        and _is_number(value[1])
    )


def _lat_lon_from_parsed(parsed):
    """Return (lat, lon) for a parsed point or list of points, else None."""
    if not isinstance(parsed, (list, tuple)) or len(parsed) == 0:
        return None

    # Single point: [lon, lat, ...]
    if _is_point(parsed):
        return parsed[1], parsed[0]

    first = parsed[0]
    if not isinstance(first, (list, tuple)):
        return None

    # If the first element itself holds sequences, the points live one level
    # down (e.g. [[[lon, lat], ...]]); otherwise parsed is the list of points.
    nested = len(first) > 0 and isinstance(first[0], (list, tuple))
    points = first if nested else parsed

    vertices = [point for point in points if _is_point(point)]
    if not vertices:
        return None

    # Plain arithmetic mean of the vertices.
    center_lat = sum(point[1] for point in vertices) / len(vertices)
    center_lon = sum(point[0] for point in vertices) / len(vertices)
    return center_lat, center_lon


def extract_coordinates(geometry_str):
    """Extract a (latitude, longitude) pair from a geometry string.

    The text is first parsed as a Python literal. Recognised shapes, with
    coordinates stored in (lon, lat) order:

    * a point ``[lon, lat, ...]``
    * a list of points ``[[lon, lat], ...]``
    * a nested list whose first element is a list of points
      ``[[[lon, lat], ...], ...]``

    For lists of points the arithmetic mean of the vertices is returned.
    Tuples are accepted wherever lists are. If the text is not a valid
    literal, or parses to any other shape, the first ``[number, number``
    pair found in the text is used instead.

    Args:
        geometry_str: Geometry text; non-string values are passed through
            ``str()`` before parsing.

    Returns:
        ``(latitude, longitude)``, or ``(None, None)`` when the input is
        missing or no coordinate pair can be found.
    """
    if pd.isna(geometry_str):
        return None, None

    text = str(geometry_str)

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal; fall through to the regex scan below.
        parsed = None

    result = _lat_lon_from_parsed(parsed)
    if result is not None:
        return result

    # Fallback for text that did not parse or had an unrecognised structure.
    match = _COORD_PAIR_RE.search(text)
    if match:
        lon, lat = float(match.group(1)), float(match.group(2))
        return lat, lon

    return None, None

# Run the extractor over every geometry string, then split the resulting
# pairs into separate latitude and longitude columns.
geometry_text = april_data['geometry_str'].astype(str)
april_data['coordinates'] = geometry_text.apply(extract_coordinates)
april_data['latitude'] = april_data['coordinates'].apply(
    lambda pair: None if pair is None else pair[0]
)
april_data['longitude'] = april_data['coordinates'].apply(
    lambda pair: None if pair is None else pair[1]
)

# Discard rows for which no coordinate could be extracted.
april_data = april_data.dropna(subset=['latitude', 'longitude'])

# Coerce gno to numbers (unparseable values become NaN) and drop those rows.
april_data['gno_numeric'] = pd.to_numeric(april_data['gno'], errors='coerce')
april_data = april_data.dropna(subset=['gno_numeric'])

print(f"有效数据点数量: {len(april_data)}")

# Distinct species in the cleaned April data.
unique_species = april_data['scientific'].unique()

# Only the 20 most frequently recorded species are drawn as scatter layers.
top_species = april_data['scientific'].value_counts().head(20).index.tolist()

# Colour map for the plotted species. Colours are assigned by each species'
# rank among the plotted ones rather than by its position among all species,
# so every plotted species gets its own palette entry as long as there are no
# more plotted species than palette entries.
color_palette = px.colors.qualitative.Plotly + px.colors.qualitative.D3 + px.colors.qualitative.G10
species_colors = {
    str(species): color_palette[rank % len(color_palette)]
    for rank, species in enumerate(top_species)
}

# Create the 3D figure.
fig = go.Figure()

# Height scaling: z = log1p(gno) * scale_factor. max_gno is kept so the z-axis
# tick labels can be mapped back to original gno values.
max_gno = april_data['gno_numeric'].max()
scale_factor = 0.01  # base scale factor applied after the log transform

# One scatter layer per plotted species.
for species in top_species:
    species_data = april_data[april_data['scientific'] == species]

    # Log-scaled height for display; the raw value is kept for the hover text.
    z_values = np.log1p(species_data['gno_numeric']) * scale_factor

    fig.add_trace(go.Scatter3d(
        x=species_data['longitude'],
        y=species_data['latitude'],
        z=z_values,
        mode='markers',
        name=species,
        marker=dict(
            size=5,
            color=species_colors.get(str(species), '#333333'),
            opacity=0.7,
            symbol='circle'
        ),
        hovertemplate=
        '<b>%{text}</b><br>' +
        '经度: %{x:.4f}<br>' +
        '纬度: %{y:.4f}<br>' +
        'GNO值: %{customdata:.2f}<br>',
        text=species_data['scientific'],
        customdata=species_data['gno_numeric']  # raw gno value shown on hover
    ))

# Build a heat-map surface by interpolating gno onto a regular lon/lat grid.
grid_size = 50
lon_range = np.linspace(april_data['longitude'].min(), april_data['longitude'].max(), grid_size)
lat_range = np.linspace(april_data['latitude'].min(), april_data['latitude'].max(), grid_size)

# Grid node coordinates, each of shape (grid_size, grid_size).
grid_lon, grid_lat = np.meshgrid(lon_range, lat_range)

# Observation columns as flat float arrays.
obs_lon = april_data['longitude'].to_numpy(dtype=float)
obs_lat = april_data['latitude'].to_numpy(dtype=float)
obs_gno = april_data['gno_numeric'].to_numpy(dtype=float)

# Inverse squared-distance weighting, computed for all grid nodes at once.
# Broadcasting gives arrays of shape (grid_size, grid_size, n_observations);
# this replaces a Python triple loop over nodes and DataFrame rows and gives
# the same values up to floating-point summation order.
dist_squared = (
    (grid_lon[..., np.newaxis] - obs_lon) ** 2
    + (grid_lat[..., np.newaxis] - obs_lat) ** 2
)
# Floor the squared distance so a node sitting on an observation does not
# divide by zero.
dist_squared = np.maximum(dist_squared, 0.0001)

weights = 1 / dist_squared
weight_sum = weights.sum(axis=-1)
value_sum = (weights * obs_gno).sum(axis=-1)

# Weighted mean of the raw gno values (used for colour and hover); nodes with
# zero total weight stay at 0.
grid_raw = np.divide(
    value_sum,
    weight_sum,
    out=np.zeros((grid_size, grid_size)),
    where=weight_sum > 0,
)
# Surface height uses the same log scaling as the scatter points.
grid_z = np.log1p(grid_raw) * scale_factor

# Add the surface trace.
fig.add_trace(go.Surface(
    z=grid_z,
    x=grid_lon,
    y=grid_lat,
    colorscale='Viridis',
    opacity=0.6,
    showscale=True,
    name='GNO热力表面',
    surfacecolor=grid_raw,  # colour by the raw (unscaled) value
    colorbar=dict(
        title='GNO值',
        x=0.9,
        y=0.5
    ),
    hovertemplate='经度: %{x:.4f}<br>纬度: %{y:.4f}<br>GNO值: %{surfacecolor:.2f}<br>'
))

# Z-axis ticks: six evenly spaced positions in scaled-log units, each labelled
# with the gno value it corresponds to (inverse of log1p(gno) * scale_factor).
z_top = np.log1p(max_gno) * scale_factor
z_ticks = np.linspace(0, z_top, 6)
z_labels = ["{:.1f}".format(np.expm1(tick / scale_factor)) for tick in z_ticks]

# Scene (3D axes and camera) settings.
scene_settings = {
    'xaxis_title': '经度',
    'yaxis_title': '纬度',
    'zaxis': {
        'title': 'GNO值',
        # custom ticks computed above
        'tickvals': z_ticks,
        'ticktext': z_labels,
    },
    'aspectratio': {'x': 1, 'y': 1, 'z': 0.5},
    'camera': {'eye': {'x': 1.5, 'y': 1.5, 'z': 1.2}},
}

# Legend anchored to the top-left corner.
legend_settings = {
    'itemsizing': 'constant',
    'font': {'size': 10},
    'yanchor': "top",
    'y': 0.99,
    'xanchor': "left",
    'x': 0.01,
}

# Apply the figure layout.
fig.update_layout(
    title='香港2022年4月物种GNO值3D分布图',
    scene=scene_settings,
    legend=legend_settings,
    margin={'l': 0, 'r': 0, 'b': 0, 't': 30},
    height=800,
    width=1000,
)

# Footnote (in paper coordinates) stating that heights are log-scaled.
scaling_note = {
    'x': 0.02,
    'y': 0.02,
    'xref': "paper",
    'yref': "paper",
    'text': "注: 高度使用对数缩放以更好地显示GNO值分布",
    'showarrow': False,
    'font': {'size': 12},
    'bgcolor': "white",
    'opacity': 0.8,
    'bordercolor': "black",
    'borderwidth': 1,
    'borderpad': 4,
}
fig.add_annotation(**scaling_note)

# Write the figure to an HTML file.
output_file = '/Users/hansen/Desktop/MATH3836proj/香港物种数据/april_2022_species_3d_map.html'
fig.write_html(output_file)
print(f"3D地图已保存至: {output_file}")

# Summary statistics for the cleaned April data.
gno_values = april_data['gno_numeric']
summary_lines = [
    "\n4月份物种统计信息:",
    f"物种总数: {len(unique_species)}",
    f"观测记录总数: {len(april_data)}",
    f"平均GNO值: {gno_values.mean():.2f}",
    f"最高GNO值: {gno_values.max():.2f}",
]
for line in summary_lines:
    print(line)

# The ten species with the most records.
species_counts = april_data['scientific'].value_counts()
top10_species = species_counts.head(10)
print("\n前10个最常见的物种:")
for species, count in top10_species.items():
    print(f"{species}: {count}条记录")

有效数据点数量: 949
3D地图已保存至: /Users/hansen/Desktop/MATH3836proj/香港物种数据/april_2022_species_3d_map.html

4月份物种统计信息:
物种总数: 299
观测记录总数: 949
平均GNO值: 1148.01
最高GNO值: 3129.00

前10个最常见的物种:
Muntiacus vaginalis: 46条记录
Sus scrofa: 30条记录
Hystrix brachyura: 24条记录
Viverricula indica: 21条记录
Delias pasithoe: 13条记录
Sylvirana guentheri: 12条记录
Graphium sarpedon: 11条记录
Cupha erymanthis: 11条记录
Prionailurus bengalensis: 11条记录
Pycnonotus jocosus: 11条记录
