In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Font setup so that Chinese text in titles/labels renders as glyphs.
# Matplotlib walks this list in order and uses the first family it can find,
# so 'SimHei' keeps priority (original behaviour) and the remaining entries
# are fallbacks for machines where SimHei is not installed.
# NOTE(review): the fallback names are common CJK fonts on Windows / macOS /
# Linux — adjust to whatever is installed in the target environment.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Heiti TC',
    'WenQuanYi Micro Hei',
    'Noto Sans CJK SC',
    'DejaVu Sans',  # last resort so Latin text still renders
]
# Use the ASCII hyphen for negative tick labels; needed so minus signs display
# correctly alongside the CJK font selected above.
plt.rcParams['axes.unicode_minus'] = False

# Notebook banner.
print("AIS数据探索分析")
print("=" * 50)

AIS数据探索分析


In [9]:
# 1. Load data
# Reads the first 100,000 rows of the raw AIS CSV into `df`. If (and only if)
# the file does not exist, a reproducible synthetic dataset is generated,
# written to the same path, and used instead.
raw_csv_path = Path("../data/raw/AIS_2023_01_01.csv")
try:
    df = pd.read_csv(raw_csv_path, nrows=100000)
    print(f" 数据加载成功！形状: {df.shape}")
except FileNotFoundError:
    # Catch only "file missing". A bare `except:` would also swallow parse
    # errors, permission errors and KeyboardInterrupt, and would then
    # overwrite an existing raw file with synthetic data.
    print(" 数据文件不存在，创建测试数据...")
    np.random.seed(42)  # fixed seed so the synthetic data is reproducible
    n_points = 100000
    data = {
        'MMSI': np.random.randint(100000000, 999999999, n_points),
        'LAT': np.random.uniform(20.0, 45.0, n_points),
        'LON': np.random.uniform(-130.0, -70.0, n_points),
        'SOG': np.random.uniform(0, 30, n_points),
        'VesselType': np.random.choice([30, 50, 60, 70, 80, 90], n_points)
    }
    df = pd.DataFrame(data)
    # The target directory may not exist on a fresh checkout; create it so
    # the fallback itself cannot fail with a missing-directory error.
    raw_csv_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(raw_csv_path, index=False)
    print(" 测试数据创建完成")

 数据加载成功！形状: (100000, 17)


In [10]:
# 2. Basic information: shape, column names and deep memory footprint (MB).
print(f"\n数据形状: {df.shape}")
print(f"列名: {df.columns.tolist()}")
print(f"内存占用: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")

# 3. Data preview
print("\n前5行数据:")
print(df.head())

# 4. Column dtypes
print("\n数据类型:")
print(df.dtypes)

# 5. Missing values: only columns with at least one null are listed.
print("\n缺失值统计:")
missing = df.isnull().sum()
print(missing[missing > 0])

# 6. Per-field summary for the key columns that are present in `df`.
print("\n关键字段分析:")
key_fields = ['LAT', 'LON', 'SOG', 'VesselType']
for field in key_fields:
    if field in df.columns:
        column = df[field]
        print(f"\n{field}:")
        print(f"  非空值: {column.notnull().sum()}")
        print(f"  唯一值: {column.nunique()}")
        # is_numeric_dtype covers every numeric dtype (int32, float32,
        # nullable Int64, ...); comparing against the literal names
        # 'int64'/'float64' silently skipped those columns.
        if pd.api.types.is_numeric_dtype(column):
            print(f"  范围: [{column.min():.2f}, {column.max():.2f}]")
            print(f"  均值: {column.mean():.2f}")

# 7. Save a reproducible random sample for later experiments.
sample_csv = "../data/processed/sample_data.csv"
# Cap the sample size at the number of available rows: DataFrame.sample
# raises ValueError when asked for more rows than exist (replace=False).
sample_size = min(10000, len(df))
sample_df = df.sample(sample_size, random_state=42)
# Make sure the output directory exists before writing.
Path(sample_csv).parent.mkdir(parents=True, exist_ok=True)
sample_df.to_csv(sample_csv, index=False)
# Report the number of rows actually written rather than a hard-coded count.
print(f"\n 保存{len(sample_df)}条样本数据到: {sample_csv}")


数据形状: (100000, 17)
列名: ['MMSI', 'BaseDateTime', 'LAT', 'LON', 'SOG', 'COG', 'Heading', 'VesselName', 'IMO', 'CallSign', 'VesselType', 'Status', 'Length', 'Width', 'Draft', 'Cargo', 'TransceiverClass']
内存占用: 38.96 MB

前5行数据:
        MMSI         BaseDateTime       LAT        LON   SOG    COG  Heading  \
0  368926035  2023-01-01T00:00:05  38.65165  -90.17964   0.1  360.0    511.0   
1  367647050  2023-01-01T00:00:04  30.16506  -90.99936   6.3  161.4    511.0   
2  352001704  2023-01-01T00:00:04  28.91963  -94.37033  14.7   97.5     97.0   
3  367104060  2023-01-01T00:00:00  34.33537 -119.56046  14.8  202.9    511.0   
4  367099730  2023-01-01T00:00:02  26.53889  -97.40485   0.0  360.0    511.0   

       VesselName         IMO CallSign  VesselType  Status  Length  Width  \
0       KIMMSWICK         NaN     AENA        33.0    15.0     NaN    NaN   
1       USS CAIRO         NaN  WDH7325        31.0     0.0    20.0    NaN   
2  ENEOS EXPLORER  IMO9935492   3E2723        80.0     0.0   23