### Filter Sky Brightness Data
- Keep only the daily maximum, non-zero minimum, and non-zero mean sky brightness from King's Park Meteorological Station, Hong Kong Observatory
- From 1-1-2022 to 31-10-2024
- August 2022 is missing: there is no data from King's Park Meteorological Station, Hong Kong Observatory, for that month

In [31]:
import pandas as pd
import os

directory = r"C:\Users\cxoox\Desktop\sky_brightness"
output_dir = r"C:\Users\cxoox\Desktop\filtered_data"

# Timestamp layouts to try, in this order. The first layout that parses every
# value in a file wins.
# NOTE(review): day-first is tried before month-first, so a month-first file
# whose days are all <= 12 would be read as day-first -- confirm the inputs.
TIMESTAMP_FORMATS = (
    '%d/%m/%Y %H:%M:%S',
    '%m/%d/%Y %H:%M:%S',
    '%d/%m/%y %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
)


def parse_timestamps(raw):
    """Parse a column of timestamps with the first layout that fits.

    Parameters
    ----------
    raw : pandas.Series
        The unparsed timestamp column.

    Returns
    -------
    pandas.Series or None
        The parsed datetime column, or None when no entry of
        TIMESTAMP_FORMATS matches the column.
    """
    for fmt in TIMESTAMP_FORMATS:
        try:
            return pd.to_datetime(raw, format=fmt)
        except (ValueError, TypeError):
            # This layout does not match the file; try the next one.
            continue
    return None


os.makedirs(output_dir, exist_ok=True)

# One hourly-thinned frame per input file; concatenated once after the loop
# instead of growing a DataFrame inside it.
hourly_frames = []

if not os.path.exists(directory):
    print(f"Directory does not exist: {directory}")
else:
    # sorted() makes the processing order independent of the file system.
    for file in sorted(os.listdir(directory)):
        if not file.endswith('.csv'):
            continue

        file_path = os.path.join(directory, file)
        print(f"Processing {file_path}")

        try:
            df = pd.read_csv(file_path)
            # Keep only device 'KP'. The copy makes the column assignments
            # below act on an independent frame rather than a slice.
            df = df[df['device_code'].isin(['KP'])].copy()

            parsed = parse_timestamps(df['received_adjusted'])
            if parsed is None:
                print(f"Datetime parsing failed in {file}")
                continue
            df['received_adjusted'] = parsed

            # Sort by timestamp so "first" below means "earliest".
            df = df.sort_values('received_adjusted')

            # Hour-level grouping key, e.g. '2022-01-01 19:00'.
            df['hour_group'] = df['received_adjusted'].dt.strftime('%Y-%m-%d %H:00')

            # Keep the earliest observation in each clock hour.
            df = df.drop_duplicates(subset=['hour_group'], keep='first')

            hourly_frames.append(df)

        except Exception as e:
            # Best effort: report the file and carry on with the others.
            print(f"Error processing {file}: {str(e)}")

# An empty frame is kept when nothing could be loaded.
filtered_kings_park = (
    pd.concat(hourly_frames, ignore_index=True) if hourly_frames else pd.DataFrame()
)

# Final cleanup
# If no file contributed any columns, the steps below would fail with an
# opaque KeyError on 'hour_group'; fail with an explicit message instead.
if 'hour_group' not in filtered_kings_park.columns:
    raise ValueError(
        "No sky brightness data was loaded, so daily statistics cannot be "
        "computed. Check the input directory and the messages printed above."
    )

filtered_kings_park = (
    filtered_kings_park
    .drop(columns=['hour_group'])  # helper column is no longer needed
    .drop_duplicates()             # safety check against fully repeated rows
)

# Extract the calendar date from the timestamp; used as the daily grouping key
filtered_kings_park['date'] = filtered_kings_park['received_adjusted'].dt.date

# Calculate daily statistics
def non_zero_mean(x):
    """Return the mean of the strictly positive values in ``x``.

    Zero and negative entries are left out of the average. Returns None when
    ``x`` has no positive value at all.
    """
    positive = x[x > 0]
    if len(positive) == 0:
        return None
    return positive.mean()

def non_zero_min(x):
    """Return the smallest strictly positive value in ``x``, or None if there is none."""
    positive = x[x > 0]
    return positive.min() if len(positive) > 0 else None


# One row per calendar date: maximum, smallest positive value, and mean of the
# positive values of 'nsb'.
nsb_by_day = filtered_kings_park.groupby('date')['nsb']
daily_stats = nsb_by_day.agg(**{
    'Max Night Sky Brightness': 'max',
    'Min Night Sky Brightness (Non-zero)': non_zero_min,
    'Mean Night Sky Brightness (Excluded zero)': non_zero_mean,
}).reset_index()

# Write the daily table to disk and show a short preview
output_path = os.path.join(output_dir, "daily_nsb.csv")
daily_stats.to_csv(output_path, index=False)
print(f"Saved daily statistics for {len(daily_stats)} days to {output_path}")
print("\nSample of daily statistics (excluding zero values from mean calculation):")
print(daily_stats.head())

Processing C:\Users\cxoox\Desktop\sky_brightness\GaN-MN_2022_01.csv
Processing C:\Users\cxoox\Desktop\sky_brightness\GaN-MN_2022_02.csv
Processing C:\Users\cxoox\Desktop\sky_brightness\GaN-MN_2022_03.csv
Processing C:\Users\cxoox\Desktop\sky_brightness\GaN-MN_2022_04.csv
Processing C:\Users\cxoox\Desktop\sky_brightness\GaN-MN_2022_05.csv
Processing C:\Users\cxoox\Desktop\sky_brightness\GaN-MN_2022_06.csv
Processing C:\Users\cxoox\Desktop\sky_brightness\GaN-MN_2022_07.csv
Processing C:\Users\cxoox\Desktop\sky_brightness\GaN-MN_2022_08.csv
Processing C:\Users\cxoox\Desktop\sky_brightness\GaN-MN_2022_09.csv
Processing C:\Users\cxoox\Desktop\sky_brightness\GaN-MN_2022_10.csv
Processing C:\Users\cxoox\Desktop\sky_brightness\GaN-MN_2022_11.csv
Processing C:\Users\cxoox\Desktop\sky_brightness\GaN-MN_2022_12.csv
Processing C:\Users\cxoox\Desktop\sky_brightness\GaN-MN_2023_01.csv
Processing C:\Users\cxoox\Desktop\sky_brightness\GaN-MN_2023_02.csv
Processing C:\Users\cxoox\Desktop\sky_brightness

In [32]:
import pandas as pd

# (output column label, source CSV) pairs. Label and path are kept side by
# side so they cannot drift apart: previously the two parallel lists paired
# the "Maximum" heat-index label with the MEANHKHI file and the "Mean" label
# with the MAXHKHI file.
# NOTE(review): the cloud and pressure files carry an "HKO" prefix rather than
# "KP" -- presumably a different station; confirm this is intended.
WEATHER_SOURCES = [
    ("Daily Maximum Hong Kong Heat Index", r"C:\Users\cxoox\Desktop\weather\daily_KP_MAXHKHI_ALL.csv"),
    ("Daily Mean Hong Kong Heat Index", r"C:\Users\cxoox\Desktop\weather\daily_KP_MEANHKHI_ALL.csv"),
    ("Daily Mean Wet Bulb Temperature (°C)", r"C:\Users\cxoox\Desktop\weather\daily_KP_WET_ALL.csv"),
    ("Daily Mean Dew Point Temperature (°C)", r"C:\Users\cxoox\Desktop\weather\daily_KP_DEW_ALL.csv"),
    ("Daily Mean Amount of Cloud", r"C:\Users\cxoox\Desktop\weather\daily_HKO_CLD_ALL.csv"),
    ("Daily Mean Pressure (hPa)", r"C:\Users\cxoox\Desktop\weather\daily_HKO_MSLP_ALL.csv"),
    ("Daily Total Evaporation (mm)", r"C:\Users\cxoox\Desktop\weather\daily_KP_EVAP_ALL.csv"),
    ("Daily Total Rainfall (mm)", r"C:\Users\cxoox\Desktop\weather\daily_KP_RF_ALL.csv"),
    ("Daily Mean Relative Humidity (%)", r"C:\Users\cxoox\Desktop\weather\daily_KP_RH_ALL.csv"),
    ("Daily Maximum Temperature (°C)", r"C:\Users\cxoox\Desktop\weather\CLMMAXT_KP_.csv"),
    ("Daily Minimum Temperature (°C)", r"C:\Users\cxoox\Desktop\weather\CLMMINT_KP_.csv"),
    ("Daily Mean Temperature (°C)", r"C:\Users\cxoox\Desktop\weather\CLMTEMP_KP_.csv"),
    ("Daily Global Solar Radiation (MJ/m2)", r"C:\Users\cxoox\Desktop\weather\daily_KP_GSR_ALL.csv"),
    ("Daily Total Bright Sunshine (hours)", r"C:\Users\cxoox\Desktop\weather\daily_KP_SUN_ALL.csv"),
    ("Daily Mean Wind Speed (m/s)", r"C:\Users\cxoox\Desktop\weather\daily_KP_WSPD_ALL.csv"),
]

# Parallel lists, index-aligned, consumed by the merge loop below.
weather_data = [label for label, _ in WEATHER_SOURCES]
weather_data_path = [path for _, path in WEATHER_SOURCES]


# 1. Load the daily sky brightness statistics written by the previous step
nsb_daily_csv = r"C:\Users\cxoox\Desktop\filtered_data\daily_nsb.csv"
filtered_kings_park = pd.read_csv(nsb_daily_csv)

# Turn the 'date' column into datetime.date values; anything unparseable
# becomes NaT
parsed_dates = pd.to_datetime(filtered_kings_park['date'], errors='coerce')
filtered_kings_park['date'] = parsed_dates.dt.date

# Discard rows whose date could not be parsed (if any)
filtered_kings_park = filtered_kings_park.dropna(subset=['date'])

# 2. Load the daily mean cloud cover data
# 2. Merge each weather variable onto the daily table, one source file at a time.
# zip() pairs every label with its path directly, replacing the repeated
# list.index() lookups (which also pick the wrong label if a path is repeated).
for column_name, file_path in zip(weather_data, weather_data_path):
    if not os.path.exists(file_path):
        print(f"File does not exist: {file_path}")
        continue

    # Skip the first two rows of the file; the next row is the header.
    # Only the date parts and the value column are read.
    df = pd.read_csv(
        file_path,
        skiprows=2,
        header=0,
        usecols=["年/Year", "月/Month", "日/Day", "數值/Value"]
    )

    # Rename columns for easier handling; the value column takes the label
    # of the weather variable it holds.
    # NOTE(review): values are merged exactly as read -- if a source marks
    # missing or special readings with text, this column will not be numeric.
    df = df.rename(columns={
        "年/Year": "Year",
        "月/Month": "Month",
        "日/Day": "Day",
        "數值/Value": column_name
    })

    # Convert Year, Month, and Day to nullable integers; invalid entries become <NA>
    for col in ["Year", "Month", "Day"]:
        df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')

    # Build "Y-M-D" strings and parse them; rows with a missing or invalid
    # part become NaT
    df['date'] = pd.to_datetime(
        df[["Year", "Month", "Day"]].astype(str).agg("-".join, axis=1),
        format="%Y-%m-%d",
        errors='coerce'
    ).dt.date

    # Drop rows with invalid dates (if any)
    df = df.dropna(subset=['date'])

    # Left merge: every sky brightness day is kept, weather is attached where available
    filtered_kings_park = pd.merge(
        filtered_kings_park,
        df[['date', column_name]],
        on='date',
        how='left'
    )
    

# 3. Write the merged table to disk and preview the first rows
output_file_path = r"C:\Users\cxoox\Desktop\filtered_data\nsb_weather_merged.csv"
filtered_kings_park.to_csv(output_file_path, index=False)
print(
    f"Merge completed! Data saved to {output_file_path}",
    "First 5 rows of the merged data:",
    filtered_kings_park.head(),
    sep="\n",
)

Merge completed! Data saved to C:\Users\cxoox\Desktop\filtered_data\nsb_weather_merged.csv
First 5 rows of the merged data:
         date  Max Night Sky Brightness  Min Night Sky Brightness (Non-zero)  \
0  2022-01-01                     16.71                                 9.03   
1  2022-01-02                     17.49                                 7.82   
2  2022-01-03                     17.54                                 7.87   
3  2022-01-04                     17.74                                 8.02   
4  2022-01-05                     16.87                                 8.11   

   Mean Night Sky Brightness (Excluded zero)  \
0                                  15.205000   
1                                  15.688571   
2                                  15.614286   
3                                  15.472143   
4                                  14.589286   

  Daily Maximum Hong Kong Heat Index  Daily Mean Hong Kong Heat Index  \
0                               1

### Combine sun and moon files respectively

In [None]:
# Combine csv files into one
import pandas as pd
import os

def combine_csv_files(directory, output_path):
    """Concatenate every CSV file in ``directory`` and write the result.

    Parameters
    ----------
    directory : str
        Folder whose ``.csv`` files are combined. Other entries are skipped.
    output_path : str
        Destination of the combined CSV (written without the index).

    Returns
    -------
    pandas.DataFrame
        The combined table; empty when the folder holds no CSV file.
    """
    # sorted() fixes the row order regardless of how the file system lists files.
    frames = [
        pd.read_csv(os.path.join(directory, name))
        for name in sorted(os.listdir(directory))
        if name.endswith('.csv')
    ]
    # Single concat instead of growing a DataFrame inside a loop.
    combined = pd.concat(frames) if frames else pd.DataFrame()
    combined.to_csv(output_path, index=False)
    return combined


directory = r"C:\Users\cxoox\Desktop\weather\sun"
combined = combine_csv_files(directory, r"C:\Users\cxoox\Desktop\filtered_data\combined_sun.csv")
print("Combined sun CSV files into one.")

directory = r"C:\Users\cxoox\Desktop\weather\moon"
combined = combine_csv_files(directory, r"C:\Users\cxoox\Desktop\filtered_data\combined_moon.csv")
print("Combined moon CSV files into one.")
    

Combined sun CSV files into one.
Combined moon CSV files into one.


### Merge sun and moon data into nsb_weather dataset

In [34]:
import pandas as pd

# Column-name prefixes and their source files, index-aligned
weather_data = ["Sun", "Moon"]

weather_data_path = [
    r"C:\Users\cxoox\Desktop\filtered_data\combined_sun.csv",
    r"C:\Users\cxoox\Desktop\filtered_data\combined_moon.csv",
]


# 1. Load the sky brightness + weather table and use 'Date' as the key column name
merged_csv = r"C:\Users\cxoox\Desktop\filtered_data\nsb_weather_merged.csv"
filtered_kings_park = pd.read_csv(merged_csv).rename(columns={"date": "Date"})

# Turn 'Date' into datetime.date values; anything unparseable becomes NaT
date_values = pd.to_datetime(filtered_kings_park['Date'], errors='coerce')
filtered_kings_park['Date'] = date_values.dt.date

# Discard rows whose date could not be parsed (if any)
filtered_kings_park = filtered_kings_park.dropna(subset=['Date'])

# 2. Load the daily mean cloud cover data
# 2. Merge the rise / transit / set columns of each combined file onto the table
for body, file_path in zip(weather_data, weather_data_path):
    if not os.path.exists(file_path):
        print(f"File does not exist: {file_path}")
        continue

    # Output column names for this file, e.g. "Sun Rise", "Sun Transit", "Sun Set"
    rise_col, transit_col, set_col = f"{body} Rise", f"{body} Transit", f"{body} Set"
    new_columns = [rise_col, transit_col, set_col]

    # The first row is the header; only the date and the three columns are read
    df = pd.read_csv(
        file_path,
        header=0,
        usecols=["YYYY-MM-DD", "RISE", "TRAN.", "SET"]
    )

    # Rename columns for easier handling
    df = df.rename(columns={
        "YYYY-MM-DD": "Date",
        "RISE": rise_col,
        "TRAN.": transit_col,
        "SET": set_col
    })

    # Parse the date column; values that do not match YYYY-MM-DD become NaT
    df['Date'] = pd.to_datetime(
        df["Date"],
        format="%Y-%m-%d",
        errors='coerce'
    ).dt.date

    # Drop rows with invalid dates (if any)
    df = df.dropna(subset=['Date'])

    # The result is saved over the file this table was loaded from, so on a
    # re-run these columns already exist. Drop them first; otherwise the merge
    # would produce suffixed duplicates ("Sun Rise_x", "Sun Rise_y").
    filtered_kings_park = filtered_kings_park.drop(columns=new_columns, errors='ignore')

    # Left merge: every row of the table is kept
    filtered_kings_park = pd.merge(
        filtered_kings_park,
        df[['Date'] + new_columns],
        on='Date',
        how='left'
    )
    

# 3. Write the merged table back to disk and preview the first rows
output_file_path = r"C:\Users\cxoox\Desktop\filtered_data\nsb_weather_merged.csv"
filtered_kings_park.to_csv(output_file_path, index=False)
print(
    f"Merge completed! Data saved to {output_file_path}",
    "First 5 rows of the merged data:",
    filtered_kings_park.head(),
    sep="\n",
)

Merge completed! Data saved to C:\Users\cxoox\Desktop\filtered_data\nsb_weather_merged.csv
First 5 rows of the merged data:
         Date  Max Night Sky Brightness  Min Night Sky Brightness (Non-zero)  \
0  2022-01-01                     16.71                                 9.03   
1  2022-01-02                     17.49                                 7.82   
2  2022-01-03                     17.54                                 7.87   
3  2022-01-04                     17.74                                 8.02   
4  2022-01-05                     16.87                                 8.11   

   Mean Night Sky Brightness (Excluded zero)  \
0                                  15.205000   
1                                  15.688571   
2                                  15.614286   
3                                  15.472143   
4                                  14.589286   

   Daily Maximum Hong Kong Heat Index  Daily Mean Hong Kong Heat Index  \
0                               