In [1]:
import os

import pandas as pd


In [4]:
# Collect the dates assigned to the "train" split. Each non-empty line of the
# split file is expected to hold two whitespace-separated fields: a date and a
# split name.
dates = []
with open("../climate_washed/split_datesC.txt") as f:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        # Still raises ValueError on a line that does not have exactly two fields.
        date, mode = fields

        if mode == "train":
            dates.append(date)

# STAID is read as text so that the identifiers are not parsed as integers.
df_station = pd.read_csv("../climate_washed/static_filtered.csv", dtype={'STAID': str})

In [28]:
import pandas as pd
from datetime import datetime

def calculate_metrics(df, y, segments=None):
    """Compute monthly availability and continuity of one variable per time segment.

    A calendar month counts as "observed" when it holds at least one row whose
    ``Date`` and ``y`` values are both non-null.  For every
    ``(start_year, end_year)`` segment (both years inclusive):

    * availability = observed months / total months in the segment
    * continuity   = longest run of consecutive observed months / total months

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Date'`` column (datetime-like, or anything
        ``pd.to_datetime`` can parse) and the column named by ``y``.
    y : str
        Name of the observation column to evaluate.
    segments : iterable of (int, int), optional
        Inclusive year ranges.  Defaults to the five-year windows from 1982
        through 2016 followed by the two-year window 2017-2018.

    Returns
    -------
    dict
        Maps ``'{start}_{end}_Availability'`` and ``'{start}_{end}_Continuity'``
        to floats between 0 and 1.

    Raises
    ------
    ValueError
        If a segment ends before it starts.
    """
    if segments is None:
        segments = [(1982, 1986), (1987, 1991), (1992, 1996), (1997, 2001),
                    (2002, 2006), (2007, 2011), (2012, 2016), (2017, 2018)]

    # Keep only rows where the requested variable was actually observed.
    observed_dates = pd.to_datetime(df[['Date', y]].dropna()['Date'])

    # Encode each observation's calendar month as a single integer
    # (year * 12 + zero-based month) so consecutive months differ by exactly 1,
    # including across a year boundary.  Computed once for all segments.
    month_ids = observed_dates.dt.year * 12 + observed_dates.dt.month - 1
    observed_months = sorted(set(month_ids.tolist()))

    results = {}

    for start_year, end_year in segments:
        if end_year < start_year:
            raise ValueError(
                f"segment end year {end_year} precedes start year {start_year}"
            )

        # Number of calendar months in the segment; no date range (and no
        # pandas-version-specific frequency alias) is needed just to count them.
        total_months = (end_year - start_year + 1) * 12

        first_id = start_year * 12
        last_id = end_year * 12 + 11
        segment_months = [m for m in observed_months if first_id <= m <= last_id]

        # Longest run of back-to-back observed months inside the segment.
        longest_run = 0
        current_run = 0
        previous = None
        for month_id in segment_months:
            if previous is not None and month_id - previous == 1:
                current_run += 1
            else:
                current_run = 1
            longest_run = max(longest_run, current_run)
            previous = month_id

        results[f'{start_year}_{end_year}_Availability'] = len(segment_months) / total_months
        results[f'{start_year}_{end_year}_Continuity'] = longest_run / total_months

    return results

In [35]:
from collections import defaultdict
from tqdm import tqdm

# Create the output directory before the slow per-station loop, so a missing
# directory cannot cause a failure only at save time.
os.makedirs("statistics", exist_ok=True)

# Only the header of one reference station file is needed to discover the
# variable columns (positions 1..20), so no data rows are read.
df = pd.read_csv("../climate_washed/11465200.csv", nrows=0)
y_columns = df.columns[1:21]
print(y_columns)

# final_df[y][staid] -> metrics dict produced by calculate_metrics
final_df = defaultdict(dict)

for staid in tqdm(df_station['STAID']):
    # Station files are named with the id left-padded with zeros to 8 characters.
    staid_str = staid.zfill(8)

    df = pd.read_csv(f"../climate_washed/{staid_str}.csv")
    # Keep only training-split dates; .copy() makes the filtered frame
    # independent so the assignment below does not raise SettingWithCopyWarning.
    df = df[df['Date'].isin(dates)].copy()
    df['Date'] = pd.to_datetime(df['Date'])

    for y in y_columns:
        # Results are keyed by the id exactly as it appears in df_station.
        final_df[y][staid] = calculate_metrics(df, y)

for y in y_columns:
    # One row per station, one column per metric, station id as the 'STAID' column.
    final_df[y] = (
        pd.DataFrame(final_df[y]).T
        .reset_index()
        .rename(columns={'index': 'STAID'})
    )
    final_df[y].to_csv(f"statistics/{y}_metrics.csv", index=False)
    print(f"Saved {y} metrics to statistics/{y}_metrics.csv")


Index(['00010', '00095', '00300', '00400', '00405', '00600', '00605', '00618',
       '00660', '00665', '00681', '00915', '00925', '00930', '00935', '00940',
       '00945', '00955', '71846', '80154'],
      dtype='object')


  0%|          | 0/482 [00:00<?, ?it/s]

100%|██████████| 482/482 [05:12<00:00,  1.54it/s]


Saved 00010 metrics to statistics/00010_metrics.csv
Saved 00095 metrics to statistics/00095_metrics.csv
Saved 00300 metrics to statistics/00300_metrics.csv
Saved 00400 metrics to statistics/00400_metrics.csv
Saved 00405 metrics to statistics/00405_metrics.csv
Saved 00600 metrics to statistics/00600_metrics.csv
Saved 00605 metrics to statistics/00605_metrics.csv
Saved 00618 metrics to statistics/00618_metrics.csv
Saved 00660 metrics to statistics/00660_metrics.csv
Saved 00665 metrics to statistics/00665_metrics.csv
Saved 00681 metrics to statistics/00681_metrics.csv
Saved 00915 metrics to statistics/00915_metrics.csv
Saved 00925 metrics to statistics/00925_metrics.csv
Saved 00930 metrics to statistics/00930_metrics.csv
Saved 00935 metrics to statistics/00935_metrics.csv
Saved 00940 metrics to statistics/00940_metrics.csv
Saved 00945 metrics to statistics/00945_metrics.csv
Saved 00955 metrics to statistics/00955_metrics.csv
Saved 71846 metrics to statistics/71846_metrics.csv
Saved 80154 