In [None]:
import pandas as pd
import numpy as np
import zipfile
import os
import re

# --- 1. Unpack the uploaded archive
# zip_path is the archive to read; extract_path is the directory that
# receives its contents (created first if it does not exist yet).
zip_path = '/content/data.zip'
extract_path = '/content/data/raw/'

os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(path=extract_path)

print(f"資料已解壓縮到：{extract_path}")

# --- 2. Batch-clean every station / year / file
# Walks raw_folder/<station>/<year folder>/*.csv. For each CSV it keeps the
# columns in keep_columns, converts the measurements to numbers, fills gaps,
# builds a `time` column from the date in the file name plus ObsTime (used
# as an hour offset), adds rolling precipitation sums, and writes the result
# to cleaned_base_folder/<station>/<year>/<name>_cleaned.csv.
# A file that cannot be read or cleaned is reported and skipped; it never
# aborts the whole run.
raw_folder = os.path.join(extract_path, 'data', 'raw')  # the archive adds one more data/raw/ level
cleaned_base_folder = '/content/data/cleaned/'
os.makedirs(cleaned_base_folder, exist_ok=True)

# Columns kept from each raw file; everything after ObsTime is numeric.
keep_columns = ['ObsTime', 'StnPres', 'Temperature', 'RH', 'WS', 'Precp']
numeric_columns = keep_columns[1:]
output_columns = ['time', 'StnPres', 'Temperature', 'RH', 'WS', 'Precp', 'Precp_3h', 'Precp_6h']

# Encodings tried in order when reading a raw CSV.
read_encodings = ('utf-8', 'big5')

# Compiled once; used to pull the YYYY-MM-DD date out of each file name.
date_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})')

# Iterate over every station directory.
for station_id in os.listdir(raw_folder):
    station_path = os.path.join(raw_folder, station_id)
    if not os.path.isdir(station_path):
        continue

    for year_folder in os.listdir(station_path):
        year_path = os.path.join(station_path, year_folder)
        if not os.path.isdir(year_path):
            continue

        print(f"\n正在處理站點 {station_id} 年 {year_folder}")

        # The output sub-folder is the second '_'-separated token of the
        # year folder name. Fall back to the whole name when there is no
        # underscore, so one oddly named folder cannot abort the run.
        name_parts = year_folder.split('_')
        year_label = name_parts[1] if len(name_parts) > 1 else year_folder
        cleaned_folder = os.path.join(cleaned_base_folder, station_id, year_label)
        os.makedirs(cleaned_folder, exist_ok=True)

        for file_name in sorted(os.listdir(year_path)):
            if not file_name.endswith('.csv'):
                continue

            file_path = os.path.join(year_path, file_name)
            print(f"\n正在處理：{file_path}")

            # Try each encoding in turn; remember the last error so it can
            # be reported if every attempt fails.
            df = None
            read_error = None
            for encoding in read_encodings:
                try:
                    df = pd.read_csv(file_path, encoding=encoding, skiprows=1)
                except Exception as exc:  # best effort: fall through to the next encoding
                    read_error = exc
                else:
                    print(f"{file_name} ({encoding}) 成功讀取")
                    break
            if df is None:
                print(f"{file_name} 讀取失敗！錯誤訊息：{read_error}")
                continue

            # Show the column names and the first 3 rows for inspection.
            print(f"欄位名稱：{list(df.columns)}")
            print(df.head(3))

            # The date comes from the file name; without it no timestamp can
            # be built, so skip the file before doing any cleaning work.
            date_match = date_pattern.search(file_name)
            if not date_match:
                print(f"無法解析日期：{file_name}")
                continue

            # =============== Cleaning pipeline ===============
            try:
                base_date = pd.to_datetime(date_match.group(1), format='%Y-%m-%d')

                # Explicit copy: the frame is modified below, and writing to
                # a slice of the raw frame triggers SettingWithCopyWarning.
                df = df[keep_columns].copy()

                # 'T' in Precp is replaced by 0 and '--' is treated as missing.
                df['Precp'] = df['Precp'].replace('T', 0)
                df = df.replace('--', np.nan)

                # ObsTime is left as-is; anything non-numeric becomes NaN.
                for col in numeric_columns:
                    df[col] = pd.to_numeric(df[col], errors='coerce')

                # Forward-fill, then back-fill whatever is still missing at
                # the start. (fillna(method=...) is deprecated in pandas.)
                df = df.ffill().bfill()

                # ObsTime is used as an hour offset from the file's date.
                hour_offsets = pd.to_timedelta(df['ObsTime'].astype(int), unit='h')
                df['time'] = base_date + hour_offsets
                df = df.sort_values('time').reset_index(drop=True)

                # Rolling precipitation sums over the last 3 and 6 rows.
                # NOTE(review): windows are computed per file, so they restart
                # at the top of every file; presumably rows are hourly - confirm.
                df['Precp_3h'] = df['Precp'].rolling(window=3, min_periods=1).sum()
                df['Precp_6h'] = df['Precp'].rolling(window=6, min_periods=1).sum()

                # Select and order the output columns.
                df_final = df[output_columns]

                # Save; only the trailing extension is swapped for the suffix.
                cleaned_name = os.path.splitext(file_name)[0] + '_cleaned.csv'
                cleaned_file_path = os.path.join(cleaned_folder, cleaned_name)
                df_final.to_csv(cleaned_file_path, index=False, encoding='utf-8-sig')

                print(f"{file_name} 清理成功並儲存！")

            except Exception as e:  # per-file boundary: report and move on
                print(f"{file_name} 清理失敗！錯誤：{e}")

print("\n整個多站點多年資料清洗完成！")


In [None]:
import shutil

# Pack everything under /content/data/cleaned into /content/cleaned_data.zip.
shutil.make_archive(
    base_name='/content/cleaned_data',
    format='zip',
    root_dir='/content/data/cleaned',
)

# Colab helper module; used here to hand the archive to the user.
from google.colab import files

files.download('/content/cleaned_data.zip')
