## Data Preprocessing

In [16]:
import numpy as np
import pandas as pd

### By Location


In [None]:
import pandas as pd
import glob
import re
from sklearn.cluster import KMeans

# --------------------------------------------------------------
# 1) Helper: convert DMS (degrees/minutes/seconds + N/S/E/W) → decimal degrees
# --------------------------------------------------------------
def dms_to_decimal(degrees, minutes, seconds, direction):
    """
    Turn a degrees/minutes/seconds reading plus a hemisphere letter into
    signed decimal degrees.

    `degrees`, `minutes` and `seconds` may be numbers or numeric strings
    (each is passed through float()). A `direction` of 'S' or 'W' yields a
    negative result; any other value leaves the result positive.
    """
    magnitude = float(degrees) + float(minutes) / 60 + float(seconds) / 3600
    # South and west are the negative hemispheres
    return -magnitude if direction in ('S', 'W') else magnitude

# --------------------------------------------------------------
# 2) Function to parse one station CSV:
#    - Read header lines to extract station name and DMS coordinates.
#    - Read the day×month table, melt into (Date, Temp), attach metadata.
# --------------------------------------------------------------
def parse_station_csv(file_path):
    """
    Parse one station CSV into a long-format table of daily temperatures.

    The file is expected to start with five metadata lines, followed by a
    header row such as "Day/Month,1,2,…,12,Day/Month" and one row per day
    of the month. The station name is read from the "測站：" entry on line 2
    and the coordinates from the DMS "緯度：…經度：…" entry on line 5.

    Parameters
    ----------
    file_path : str
        Path of the station CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns ['Station', 'Latitude', 'Longitude', 'Date', 'Temp'], one
        row per valid 2024 calendar day that has a numeric temperature.
        Latitude/Longitude are None when the coordinate line does not match
        the expected DMS pattern. When line 2 has no "測站：" entry, the
        bare file name is used as the station name.
    """
    # Read the first 6 lines (metadata + header row); readline() returns ''
    # past end-of-file, so short files do not raise here.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        header_lines = [f.readline().strip() for _ in range(6)]

    # --- Station name from line 2, e.g. "測站：南投  Nantou C0I460"
    station_match = re.search(r"測站：([^ ]+)", header_lines[1])
    if station_match:
        station_name = station_match.group(1)
    else:
        # Fall back to the bare file name. Split on both separators because
        # glob on Windows returns paths that mix '/' and '\\'.
        station_name = re.split(r"[\\/]", file_path)[-1]

    # --- DMS latitude/longitude from line 5,
    #     e.g. "緯度：23°54'47.55〞N經度：120°40'45.06〞E"
    dms_pattern = re.compile(
        r"緯度：(\d+)°(\d+)'([\d.]+)〞([NS])經度：(\d+)°(\d+)'([\d.]+)〞([EW])"
    )
    match = dms_pattern.search(header_lines[4])
    if match:
        lat_deg, lat_min, lat_sec, lat_dir, lon_deg, lon_min, lon_sec, lon_dir = match.groups()
        lat = dms_to_decimal(lat_deg, lat_min, lat_sec, lat_dir)
        lon = dms_to_decimal(lon_deg, lon_min, lon_sec, lon_dir)
    else:
        lat, lon = None, None

    # --- Read the day×month table. The header row is line 6, so the five
    #     metadata lines above it are skipped.
    df = pd.read_csv(file_path, skiprows=5)

    # pandas renames the repeated trailing "Day/Month" column to "Day/Month.1"
    df = df.drop(columns=['Day/Month.1'], errors='ignore')
    df = df.rename(columns={'Day/Month': 'Day'})

    # Wide (day × month) → long (Day, Month, Temp)
    df_long = df.melt(id_vars='Day', var_name='Month', value_name='Temp')

    # Coerce all three columns to numbers. Temp must be coerced too:
    # otherwise non-numeric markers survive dropna() as strings and break
    # any later numeric aggregation. Summary rows (non-numeric Day) fall
    # out here as well.
    for col in ('Day', 'Month', 'Temp'):
        df_long[col] = pd.to_numeric(df_long[col], errors='coerce')
    df_long = df_long.dropna(subset=['Day', 'Month', 'Temp'])

    # Build a proper datetime (year=2024); impossible dates such as
    # 31 February become NaT and are dropped.
    df_long['Date'] = pd.to_datetime(
        dict(year=2024, month=df_long['Month'].astype(int), day=df_long['Day'].astype(int)),
        errors='coerce'
    )
    df_long = df_long.dropna(subset=['Date'])

    # Attach station metadata
    df_long['Station']   = station_name
    df_long['Latitude']  = lat
    df_long['Longitude'] = lon

    # Keep only the columns we care about
    return df_long[['Station', 'Latitude', 'Longitude', 'Date', 'Temp']]


# --------------------------------------------------------------
# 3) Loop over all station CSV files in the meteorological data folder
#    (adjust the glob pattern if the data lives elsewhere)
# --------------------------------------------------------------
file_paths = glob.glob('C:/Users/Tino/Documents/PowerComsuption_MachineLearning/PowerConsumtionPrediction/Raw Data/Meteorological Data/*.csv')
all_stations = []

for fp in file_paths:
    try:
        station_df = parse_station_csv(fp)
    except Exception as e:
        # Best effort: report the broken file and carry on with the rest
        print(f"Error parsing {fp}: {e}")
    else:
        all_stations.append(station_df)

# Exit early when there is nothing to work with, and say whether the glob
# itself came back empty or every file failed to parse.
if not file_paths:
    raise RuntimeError("No CSV files matched the glob pattern. Check the data directory path.")
if not all_stations:
    raise RuntimeError("No station data was successfully parsed. Check file formats.")


# --------------------------------------------------------------
# 4) Concatenate all stations → one big DataFrame of daily temps
# --------------------------------------------------------------
df_all = pd.concat(all_stations, ignore_index=True)

# Make sure temperatures are numeric before averaging; non-numeric entries
# become NaN, which mean() skips.
df_all['Temp'] = pd.to_numeric(df_all['Temp'], errors='coerce')

# Sanity check:
# print(df_all[['Station','Latitude','Longitude','Date','Temp']].head())


# --------------------------------------------------------------
# 5) Compute each station's monthly mean temperature
# --------------------------------------------------------------
# NOTE: groupby drops rows whose key is NaN/None by default, so stations
# whose coordinates could not be parsed are excluded from the result.
df_all['Month'] = df_all['Date'].dt.month
monthly_means = (
    df_all
    .groupby(['Station', 'Latitude', 'Longitude', 'Month'])['Temp']
    .mean()
    .reset_index()
)

# Pivot so that each station is one row with 12 columns (Month_1 … Month_12).
# reindex() guarantees all 12 month columns exist even when a month has no
# data at all, and the columns are renamed by label rather than by position
# so a missing month cannot cause a length mismatch.
monthly_cols = [f'Month_{m}' for m in range(1, 13)]
df_features = (
    monthly_means
    .pivot_table(
        index=['Station', 'Latitude', 'Longitude'],
        columns='Month',
        values='Temp'
    )
    .reindex(columns=range(1, 13))
    .rename(columns=lambda m: f'Month_{m}')
    .rename_axis(columns=None)
    .reset_index()
)

# Fill months with no data from the neighbouring months. This is restricted
# to the month columns: a row-wise fill over the whole frame would copy
# Longitude into a missing Month_1.
df_features[monthly_cols] = df_features[monthly_cols].ffill(axis=1).bfill(axis=1)


# --------------------------------------------------------------
# 6) Run K-Means clustering on the 12 monthly-mean columns
#    (add Latitude/Longitude if you want spatial info in the clustering)
# --------------------------------------------------------------
k = 4  # ← you can choose any k (4 is just an example)
kmeans = KMeans(n_clusters=k, random_state=42)
df_features['ClimateCluster'] = kmeans.fit_predict(df_features[monthly_cols])


# --------------------------------------------------------------
# 7) Show the final table: one row per station, with coords,
#    12 monthly means, and the assigned "ClimateCluster" label.
# --------------------------------------------------------------
try:
    import ace_tools as tools
except ImportError:
    # ace_tools is not a standard package and may be unavailable in this
    # environment; fall back to plain printing.
    print(df_features)
else:
    tools.display_dataframe_to_user(
        name="Station Climate Clusters",
        dataframe=df_features
    )


RuntimeError: No station data was successfully parsed. Check file formats.

# Cluster by Day

In [17]:
    import pandas as pd
    import numpy as np
    from sklearn.cluster import KMeans
    import glob
    import os 

    # Folder that holds the 19 per-station CSV exports
    # (written with forward slashes, "Option B")
    data_directory = 'C:/Users/Tino/Documents/PowerComsuption_MachineLearning/PowerConsumtionPrediction/Raw Data/Meteorological Data'
    # Match every CSV file directly inside that folder
    pattern        = f"{data_directory}/*.csv"
    all_files      = glob.glob(pattern)

    # Echo the matches so a wrong path is obvious straight away
    print("Files found:", all_files) 

    def process_station_csv(file_path):
        """
        Read a single station CSV and return its daily temperatures in long form.

        The first five lines are treated as metadata and the sixth as the
        header row (first column "Day/Month" or similar, then one column per
        month). The '平均' summary row is dropped.

        The station label comes from the "測站：" entry in the metadata lines
        when one is present; otherwise the file name without its ".csv"
        extension is used. Either way the label is unique per file instead
        of being the trailing token of the file name.

        Parameters
        ----------
        file_path : str
            Path of the station CSV file.

        Returns
        -------
        pandas.DataFrame
            Columns ['Date', 'Station', 'Temperature'], one row per valid
            2024 calendar day that has a numeric temperature.
        """
        # --- Station label from the metadata lines
        with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as fh:
            header_lines = [fh.readline().strip() for _ in range(5)]

        marker = '測站：'
        station_name = ''
        for line in header_lines:
            if marker in line:
                tokens = line.split(marker, 1)[1].split()
                if tokens:
                    station_name = tokens[0]
                break
        if not station_name:
            # glob on Windows returns paths mixing '/' and '\\', so
            # normalise the separators before taking the last component.
            base_name = file_path.replace('\\', '/').split('/')[-1]
            station_name = base_name[:-4] if base_name.lower().endswith('.csv') else base_name

        # --- Day × month table
        df_raw = pd.read_csv(file_path, skiprows=5, encoding='utf-8-sig')

        # The first column holds the day of month ("Day/Month")
        day_col = df_raw.columns[0]

        # pandas renames a repeated trailing day column to "<name>.1"
        df_raw = df_raw.drop(columns=[day_col + '.1'], errors='ignore')

        # Drop the row where the day column == '平均'
        df_raw = df_raw[df_raw[day_col] != '平均']

        # Melt to long form: Day × Month → Temperature
        df_long = (
            df_raw
            .melt(id_vars=[day_col], var_name='Month', value_name='Temperature')
            .rename(columns={day_col: 'Day'})
        )
        df_long['Station'] = station_name

        # Coerce to numbers. Temperature is coerced as well so that
        # non-numeric markers do not reach the clustering step as strings.
        for col in ('Day', 'Month', 'Temperature'):
            df_long[col] = pd.to_numeric(df_long[col], errors='coerce')
        df_long = df_long.dropna(subset=['Day', 'Month', 'Temperature'])

        # Build a Date column for 2024; impossible dates become NaT and are dropped
        df_long['Date'] = pd.to_datetime(
            dict(year=2024, month=df_long['Month'].astype(int), day=df_long['Day'].astype(int)),
            errors='coerce'
        )
        df_long = df_long.dropna(subset=['Date'])

        # Keep only the columns we need
        return df_long[['Date', 'Station', 'Temperature']]

    # Process each CSV and concatenate
    dfs = []
    for file in all_files:
        df_station = process_station_csv(file)
        dfs.append(df_station)



    df_all = pd.concat(dfs, ignore_index=True)

    # Pivot so each station is a column, indexed by Date
    df_pivot = df_all.pivot(index='Date', columns='Station', values='Temperature')

    # Drop any dates with missing station readings
    df_pivot = df_pivot.dropna()

    # Prepare feature matrix: rows = dates, columns = station temperatures
    X = df_pivot.values

    # Apply K-Means clustering (e.g., into 4 weather-type clusters)
    n_clusters = 4
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df_pivot['WeatherType'] = kmeans.fit_predict(X)

    # Build a result DataFrame
    result = df_pivot.reset_index().rename_axis(None, axis=1)

    # Display the first several rows along with cluster assignments
    import ace_tools as tools
    tools.display_dataframe_to_user(
        name="Combined Stations Temperature Clustering (2024)",
        dataframe=result.head(20)
    )

    # Also show how many days fall into each cluster
    cluster_counts = result['WeatherType'].value_counts().sort_index()
    print("Number of days in each cluster:")
    print(cluster_counts)


Files found: ['C:/Users/Tino/Documents/PowerComsuption_MachineLearning/PowerConsumtionPrediction/Raw Data/Meteorological Data\\逐日平均氣溫年報表_2024 五股 _ csv_1.csv', 'C:/Users/Tino/Documents/PowerComsuption_MachineLearning/PowerConsumtionPrediction/Raw Data/Meteorological Data\\逐日平均氣溫年報表_2024 南投 _ csv_7.csv', 'C:/Users/Tino/Documents/PowerComsuption_MachineLearning/PowerConsumtionPrediction/Raw Data/Meteorological Data\\逐日平均氣溫年報表_2024 員林 _ csv_16.csv', 'C:/Users/Tino/Documents/PowerComsuption_MachineLearning/PowerConsumtionPrediction/Raw Data/Meteorological Data\\逐日平均氣溫年報表_2024 嘉義 _ csv_9.csv', 'C:/Users/Tino/Documents/PowerComsuption_MachineLearning/PowerConsumtionPrediction/Raw Data/Meteorological Data\\逐日平均氣溫年報表_2024 基隆 _ csv_3.csv', 'C:/Users/Tino/Documents/PowerComsuption_MachineLearning/PowerConsumtionPrediction/Raw Data/Meteorological Data\\逐日平均氣溫年報表_2024 宜蘭 _ csv_12.csv', 'C:/Users/Tino/Documents/PowerComsuption_MachineLearning/PowerConsumtionPrediction/Raw Data/Meteorological Data\\逐

ValueError: Index contains duplicate entries, cannot reshape