In [None]:
import sys
from pathlib import Path

# Assumes the notebook runs from a sub-directory of the project (e.g. notebooks/),
# so the parent of the working directory is the project root — TODO confirm.
# `from src.<module> import ...` needs the directory that CONTAINS `src` on sys.path.
# `<root>/src` itself is kept on the path as before, in case modules inside `src`
# import each other by bare name.
_project_root = Path.cwd().parent
for _entry in (_project_root, _project_root / 'src'):
    # Skip entries already present so re-running this cell does not pile up duplicates.
    if str(_entry) not in sys.path:
        sys.path.insert(0, str(_entry))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import DataLoader
from src.visualization import Visualizer
# NOTE(review): wildcard import hides which config names this notebook relies on;
# replace with explicit names once they are known.
from src.config import *

# Display options: show every column of wide frames, and use a wide default figure size
pd.set_option('display.max_columns', None)
plt.rcParams['figure.figsize'] = (12, 6)

print("环境设置完成")

## 1. 加载数据

In [None]:
# Create the loader object that the data-loading cells of this notebook share
loader = DataLoader()

# Load the weather table, then print a heading, the first rows and the shape
weather_df = loader.load_weather_data()
print(
    "\n天气数据:",
    weather_df.head(),
    f"\n形状: {weather_df.shape}",
    sep="\n",
)

In [None]:
# Load the energy-consumption table for one building type
building_type = 'Hospitals'
building_df = loader.load_building_data(building_type)

# Heading, first rows and shape, emitted with a single print call
print(
    f"\n{building_type}能耗数据:",
    building_df.head(),
    f"\n形状: {building_df.shape}",
    sep="\n",
)

In [None]:
# Ask the loader for the merged weather + building table of the chosen building type
merged_df = loader.merge_weather_building(building_type)

# Heading, first rows, shape and column names, emitted with a single print call
print(
    "\n合并后的数据:",
    merged_df.head(),
    f"\n形状: {merged_df.shape}",
    f"\n列名: {merged_df.columns.tolist()}",
    sep="\n",
)

## 2. 数据基本信息

In [None]:
# Show the dtype of every column in the merged table, under a heading line
print("数据类型:", merged_df.dtypes, sep="\n")

In [None]:
# Missing-value report: list only the columns that contain at least one null
print("\n缺失值统计:")
null_counts = merged_df.isnull().sum()
missing = null_counts[null_counts > 0]

# Either the per-column null counts, or a note that nothing is missing
print("没有缺失值" if missing.empty else missing)

In [None]:
# Summary statistics of the merged table; the bare name on the last line
# lets the notebook render the result as a rich table
print("\n数值列统计描述:")
numeric_summary = merged_df.describe()
numeric_summary

## 3. 数据可视化

In [None]:
# Instantiate the project's Visualizer; the plotting cells below call its methods
visualizer = Visualizer()

In [None]:
# Time-series plot of the energy columns.
# energy_cols is reused by the distribution and correlation cells further down.
energy_cols = [
    'Total_Energy_kWh',
    'SpaceHeating_kWh',
    'SpaceCooling_kWh',
    'Electricity_kWh',
]

# Only the first 1000 rows are plotted
energy_window = merged_df.head(1000)
visualizer.plot_time_series(energy_window, energy_cols, datetime_col='DateTime')

In [None]:
# Time-series plot of the weather columns.
# weather_cols is reused by the correlation cell further down.
weather_cols = [
    'Temperature',
    'DNI',
    'GHI',
    'WindSpeed',
]

# Only the first 1000 rows are plotted
weather_window = merged_df.head(1000)
visualizer.plot_time_series(weather_window, weather_cols, datetime_col='DateTime')

In [None]:
# Data distribution: pass the merged table and the energy columns
# (energy_cols comes from the energy time-series cell) to the visualizer
visualizer.plot_data_distribution(merged_df, energy_cols)

In [None]:
# Correlation matrix over the energy columns followed by the weather columns
corr_cols = [*energy_cols, *weather_cols]
visualizer.plot_correlation_matrix(merged_df, columns=corr_cols)

## 4. 关键发现

记录你的观察和发现:
1. 能耗模式是否有明显的日周期和季节性？
2. 哪些天气因素与能耗相关性最强？
3. 是否存在异常值？
4. 数据质量如何？

In [None]:
# Average energy consumption per hour of day.
# pd.to_datetime leaves an already-datetime column as it is and parses string
# timestamps otherwise, so the .dt accessor works whichever dtype the loader produced.
# The 'hour' column is still written to merged_df in case later cells use it;
# the assignment is idempotent, so re-running the cell is safe.
merged_df['hour'] = pd.to_datetime(merged_df['DateTime']).dt.hour
hourly_avg = merged_df.groupby('hour')['Total_Energy_kWh'].mean()

# Explicit Figure/Axes interface instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(hourly_avg.index, hourly_avg.values, marker='o')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Average Energy Consumption (kWh)')
ax.set_title('Daily Energy Consumption Pattern')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Scatter plot of temperature against total energy consumption.
# Small, semi-transparent markers keep dense regions readable.
fig, ax = plt.subplots(figsize=(12, 5))
ax.scatter(
    merged_df['Temperature'],
    merged_df['Total_Energy_kWh'],
    alpha=0.3,
    s=10,
)
ax.set(
    xlabel='Temperature (°C)',
    ylabel='Total Energy Consumption (kWh)',
    title='Temperature vs Energy Consumption',
)
ax.grid(True, alpha=0.3)
plt.show()