# 01. 資料探索與前處理 (EDA & Preprocessing)

**目的**：整合 TWCDC 病例資料、Stringency Index 與移動數據（本筆記本目前的程式碼僅讀取並處理 TWCDC 病例資料與模擬爆發資料）。
**內容**：
* 讀取 CSV
* 處理缺失值
* 時間序列視覺化
* 輸出 `data/processed/`

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Plot styling. The seaborn theme is applied first and the font options are
# written afterwards, so they are the last values stored in rcParams.
sns.set(style="whitegrid")
plt.rcParams.update({
    'font.sans-serif': ['Microsoft JhengHei'],  # font used for Chinese text
    'axes.unicode_minus': False,  # work around the minus-sign display issue
})

In [None]:
# Input (raw) and output (processed) directories, given relative to the
# notebook's working directory.
raw_data_dir, processed_data_dir = "../data/raw", "../data/processed"

# Full paths of the two source CSV files inside the raw-data directory:
# the TWCDC case file and the simulated outbreak file.
file_case, file_sim = (
    os.path.join(raw_data_dir, csv_name)
    for csv_name in ("Age_County_Gender_day_19CoV.csv", "simulated_outbreak.csv")
)

print(f"Reading data from: {raw_data_dir}")

In [None]:
# 1. Load the TWCDC case data.
# Only the read itself is guarded. When the file is missing, a message is
# printed and df_case is left undefined; the later cells check for that with
# `'df_case' in locals()` before using it.
try:
    df_case = pd.read_csv(file_case)
except FileNotFoundError:
    print(f"File not found: {file_case}")
else:
    print("TWCDC Data Loaded Successfully")
    display(df_case.head())
    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so it is called directly; wrapping it in print() adds a stray "None".
    df_case.info()

In [None]:
# 2. Load the simulated outbreak data.
# Only the read itself is guarded. When the file is missing, a message is
# printed and df_sim is left undefined; the later cells check for that with
# `'df_sim' in locals()` before using it.
try:
    df_sim = pd.read_csv(file_sim)
except FileNotFoundError:
    print(f"File not found: {file_sim}")
else:
    print("Simulated Data Loaded Successfully")
    display(df_sim.head())
    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so it is called directly; wrapping it in print() adds a stray "None".
    df_sim.info()

## 3. 資料前處理 (Data Preprocessing)
包含日期格式轉換與欄位標準化。

In [None]:
# Date parsing and column standardisation. Each frame is only touched when
# its load cell succeeded, i.e. when the variable exists.
if 'df_case' in locals():
    # Parse the onset-date column ('發病日') into datetime values.
    df_case['發病日'] = pd.to_datetime(df_case['發病日'])
    # Unified Location key: county ('縣市') and township ('鄉鎮') joined
    # with an underscore.
    df_case['Location'] = df_case['縣市'].add('_').add(df_case['鄉鎮'])

if 'df_sim' in locals():
    # The simulated data already has a 'Date' column; parse it to datetime.
    df_sim['Date'] = pd.to_datetime(df_sim['Date'])

## 4. 探索性資料分析 (EDA)
### 4.1 每日病例數趨勢

In [None]:
# Daily case totals (TWCDC): sum the confirmed-case counts ('確定病例數')
# per onset date ('發病日') and draw them as a single line.
if 'df_case' in locals():
    daily_cases_tw = df_case.groupby('發病日', as_index=False)['確定病例數'].sum()

    plt.figure(figsize=(15, 6))
    sns.lineplot(x='發病日', y='確定病例數', data=daily_cases_tw)
    # Title and axis labels are set on the axes seaborn just drew into.
    plt.gca().set(
        title='TWCDC: Daily Confirmed Cases (All Taiwan)',
        xlabel='Date',
        ylabel='Cases',
    )
    plt.show()

## 5. 資料聚合與輸出 (Aggregation & Export)
將資料轉換為 `Date` x `Location` 的寬表格，方便模型使用。

In [None]:
# TWCDC data: build the Date x Location wide table and write it to CSV.
print("Processing TWCDC Data...")
if 'df_case' in locals():
    # Pivot to one row per onset date and one column per Location, summing
    # the confirmed-case counts; date/location pairs with no rows become 0.
    df_case_pivot = df_case.pivot_table(index='發病日', columns='Location', values='確定病例數', aggfunc='sum').fillna(0)

    # Reindex onto every calendar day between the first and last onset date
    # so that days without any record appear as all-zero rows.
    all_dates_tw = pd.date_range(start=df_case['發病日'].min(), end=df_case['發病日'].max(), freq='D')
    df_case_pivot = df_case_pivot.reindex(all_dates_tw, fill_value=0)
    df_case_pivot.index.name = 'Date'

    # Save. The output directory is created first because to_csv does not
    # create missing parent directories and would fail if it is absent.
    os.makedirs(processed_data_dir, exist_ok=True)
    output_path_tw = os.path.join(processed_data_dir, "twcdc_daily_cases_by_location.csv")
    df_case_pivot.to_csv(output_path_tw)
    print(f"Saved: {output_path_tw}, Shape: {df_case_pivot.shape}")
else:
    # Make the skip visible instead of ending silently after the banner.
    print("Skipped: df_case is not loaded, nothing was exported.")

In [None]:
# Simulated data: build the Date x Location wide table and write it to CSV.
print("Processing Simulated Data...")
if 'df_sim' in locals():
    # The simulated data has one row per individual case, so the daily count
    # for a location is the number of rows in each (Date, Location) group.
    daily_cases_sim = df_sim.groupby(['Date', 'Location']).size().reset_index(name='Cases')

    # Pivot to one row per date and one column per Location; date/location
    # pairs with no cases become 0.
    df_sim_pivot = daily_cases_sim.pivot_table(index='Date', columns='Location', values='Cases', aggfunc='sum').fillna(0)

    # Reindex onto every calendar day between the first and last date so
    # that days without any case appear as all-zero rows.
    all_dates_sim = pd.date_range(start=df_sim['Date'].min(), end=df_sim['Date'].max(), freq='D')
    df_sim_pivot = df_sim_pivot.reindex(all_dates_sim, fill_value=0)
    df_sim_pivot.index.name = 'Date'

    # Save. The output directory is created first because to_csv does not
    # create missing parent directories and would fail if it is absent.
    os.makedirs(processed_data_dir, exist_ok=True)
    output_path_sim = os.path.join(processed_data_dir, "simulated_daily_cases_by_location.csv")
    df_sim_pivot.to_csv(output_path_sim)
    print(f"Saved: {output_path_sim}, Shape: {df_sim_pivot.shape}")
else:
    # Make the skip visible instead of ending silently after the banner.
    print("Skipped: df_sim is not loaded, nothing was exported.")