In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [19]:
def extract_date_from_filename(filename):
    """
    Extract date from filename for both naming conventions:
    - ECMWF_utci_YYYYMMDD_v1.1_con.area-subset.*.csv (1990-2012)
    - mrt_with_utci_YYYYMMDD_hourly_stats.csv (2013-2024)
    """
    parts = filename.split('_')
    for part in parts:
        # Look for 8-digit number that could be a date
        if len(part) == 8 and part.isdigit():
            try:
                return datetime.strptime(part, '%Y%m%d').date()
            except ValueError:
                continue
    raise ValueError(f"Could not find date in filename: {filename}")

def process_csv(file_path, output_dir, n_days_past, n_days_future):
    """
    Processes a single CSV file: prepares daily UTCI data for time series forecasting.
    Assumes CSV has hourly data (0-23) and date is in filename.
    """
    # Extract date from filename
    file_date = extract_date_from_filename(os.path.basename(file_path))
    
    # Load the data
    data = pd.read_csv(file_path)
    
    # Calculate daily mean UTCI
    daily_utci = data['UTCI_Celsius_mean'].mean()
    
    return file_date, daily_utci

def process_directory(input_dir, output_dir, n_days_past, n_days_future):
    """
    Processes all CSV files in a directory.
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Dictionary to store date -> UTCI mapping
    daily_values = {}
    
    # First pass: collect all daily values
    for file_name in os.listdir(input_dir):
        if file_name.endswith('.csv') and ('mrt_with_utci' in file_name or 'ECMWF_utci' in file_name):
            file_path = os.path.join(input_dir, file_name)
            try:
                date, utci_mean = process_csv(file_path, output_dir, n_days_past, n_days_future)
                daily_values[date] = utci_mean
            except Exception as e:
                print(f"Error processing {file_name}: {str(e)}")
    
    if not daily_values:
        raise ValueError("No valid files were processed. Check file naming patterns and directory path.")
    
    # Convert to sorted DataFrame
    daily_df = pd.DataFrame(
        [(date, utci) for date, utci in daily_values.items()],
        columns=['date', 'UTCI_mean']
    ).sort_values('date')
    
    # Create sequences
    values_utci = daily_df['UTCI_mean'].values
    dates = daily_df['date'].values
    
    X, y = [], []
    sequence_info = []
    
    # Create sequences of past days and future days
    for i in range(len(values_utci) - n_days_past - n_days_future + 1):
        # Check if dates are consecutive
        date_sequence = dates[i:i + n_days_past + n_days_future]
        expected_dates = [date_sequence[0] + timedelta(days=x) for x in range(len(date_sequence))]
        
        if all(actual == expected for actual, expected in zip(date_sequence, expected_dates)):
            X.append(values_utci[i:i + n_days_past])
            y.append(values_utci[i + n_days_past:i + n_days_past + n_days_future])
            
            # Store comprehensive date information
            sequence_info.append({
                'feature_start_date': dates[i],
                'feature_end_date': dates[i + n_days_past - 1],
                'target_start_date': dates[i + n_days_past],
                'target_end_date': dates[i + n_days_past + n_days_future - 1],
                'sequence_id': i + 1  # 1-based indexing for easier reference
            })
    
    if not X:
        raise ValueError("No valid sequences found. Check if you have consecutive daily data.")
    
    # Convert to numpy arrays
    X = np.array(X)
    y = np.array(y)
    
    # Save features and targets
    base_name = os.path.basename(input_dir)
    np.save(os.path.join(output_dir, f"{base_name}_features.npy"), X)
    np.save(os.path.join(output_dir, f"{base_name}_targets.npy"), y)
    
    # Save comprehensive sequence information
    sequence_df = pd.DataFrame(sequence_info)
    sequence_df.to_csv(os.path.join(output_dir, f"{base_name}_sequence_info.csv"), index=False)
    
    # Get unique years from the dates (convert dates to strings first for year extraction)
    years = sorted(set([d.year for d in daily_df['date']]))
    
    # Save summary information
    with open(os.path.join(output_dir, f"{base_name}_summary.txt"), 'w') as f:
        f.write(f"Dataset Summary\n")
        f.write(f"==============\n\n")
        f.write(f"Total sequences: {len(X)}\n")
        f.write(f"Features shape: {X.shape} (sequences, past_days)\n")
        f.write(f"Targets shape: {y.shape} (sequences, future_days)\n\n")
        f.write(f"Date range:\n")
        f.write(f"First sequence starts: {sequence_df['feature_start_date'].min()}\n")
        f.write(f"Last sequence ends: {sequence_df['target_end_date'].max()}\n\n")
        f.write(f"Features: {n_days_past} days of historical data\n")
        f.write(f"Targets: {n_days_future} days of future predictions\n")
        f.write(f"\nProcessed files from years: {years}")

    print(f"Successfully created {len(X)} sequences")
    print(f"Features shape: {X.shape}")
    print(f"Targets shape: {y.shape}")
    print(f"Date range: {daily_df['date'].min()} to {daily_df['date'].max()}")
    print(f"Detailed sequence information saved to {base_name}_sequence_info.csv")
    print(f"Summary information saved to {base_name}_summary.txt")

# Parameters
input_directory = 'data'
output_directory = 'entire_utci_arrays(30-7)'
n_days_past = 30  # Use 30 days of historical data as features
n_days_future =15  # Predict next 7 days

# Process all files
process_directory(input_directory, output_directory, n_days_past, n_days_future)

Successfully created 1924 sequences
Features shape: (1924, 30)
Targets shape: (1924, 7)
Date range: 1990-04-01 to 2024-06-30
Detailed sequence information saved to data_sequence_info.csv
Summary information saved to data_summary.txt


In [13]:
import numpy as np

features_path = 'processed_daily_utci_arrays/data_features.npy'
targets_path = 'processed_daily_utci_arrays/data_targets.npy'

features = np.load(features_path)
targets = np.load(targets_path)

print("Features shape:", features.shape)
print("First feature set:", features[0])
print("Targets shape:", targets.shape)
print("First target set:", targets[0])
# print(len(features))

Features shape: (659, 30)
First feature set: [29.16583333 30.17708333 30.35833333 28.78833333 26.6775     26.65833333
 28.11875    29.57333333 30.66958333 31.34833333 31.23375    31.45875
 31.46875    29.73875    29.06416667 28.9175     28.66333333 29.22583333
 29.04541667 27.53708333 26.96125    26.79166667 28.70041667 30.15333333
 30.94791667 31.62333333 31.35416667 31.69791667 32.55875    32.90916667]
Targets shape: (659, 7)
First target set: [32.76916667 32.4675     32.0275     30.5625     29.23458333 28.80958333
 30.51875   ]
