In [1]:
import pandas as pd
import numpy as np
import os

# --- Step 1: Load CSV ---
# Path of the raw ERA5 export, relative to the notebook's working directory.
csv_path = "era5_data_csv/era5_full_dataset_01.csv"

if os.path.exists(csv_path):
    # Parse 'valid_time' into datetimes while reading.
    df = pd.read_csv(csv_path, parse_dates=['valid_time'])
else:
    # Fail early, naming the path that could not be found.
    raise FileNotFoundError(f"{csv_path} not found.")

print('Step 1 complete: CSV loaded')


Step 1 complete: CSV loaded


In [2]:
# --- Step 2: Inspect columns and basic info ---
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
df.info()
# First rows and summary statistics of the numeric columns.
print(df.head())
print(df.describe())
print('Step 2 complete: inspection printed')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1740960 entries, 0 to 1740959
Data columns (total 12 columns):
 #   Column      Dtype         
---  ------      -----         
 0   valid_time  datetime64[ns]
 1   latitude    float64       
 2   longitude   float64       
 3   t2m         float64       
 4   d2m         float64       
 5   msl         float64       
 6   u10         float64       
 7   v10         float64       
 8   tcc         float64       
 9   skt         float64       
 10  number      int64         
 11  expver      int64         
dtypes: datetime64[ns](1), float64(9), int64(2)
memory usage: 159.4 MB
None
  valid_time  latitude  longitude        t2m        d2m         msl       u10  \
0 2025-01-01      36.0      -9.80  16.260406  282.94458  102652.625 -6.966656   
1 2025-01-01      36.0      -9.55  16.235504  283.03100  102644.875 -7.204693   
2 2025-01-01      36.0      -9.30  16.259918  282.84155  102635.780 -7.474468   
3 2025-01-01      36.0      -9.05  16.3

In [3]:
# --- Step 3: Set time index for interpolation ---
# A stable sort (mergesort) keeps rows that share a timestamp in their
# original relative order, so the resulting row order is deterministic.
if 'valid_time' in df.columns:
    df = df.sort_values('valid_time', kind='mergesort').set_index('valid_time')
elif df.index.name == 'valid_time':
    # 'valid_time' is already the index (e.g. this cell was run before):
    # only make sure the rows are in time order instead of failing.
    df = df.sort_index(kind='mergesort')
else:
    # Neither a column nor the index: same error type the sort would raise.
    raise KeyError('valid_time')
print('Step 3 complete: valid_time is now DatetimeIndex')


Step 3 complete: valid_time is now DatetimeIndex


In [4]:
# --- Step 4: Unit conversion ---
# Each conversion is guarded by a magnitude check on the column median, so
# re-running this cell never converts an already-converted column again.

# A temperature column whose median exceeds this is treated as Kelvin
# (150 K is -123.15 °C, the lower bound used by the Step 6 sanity check).
KELVIN_MEDIAN_THRESHOLD = 150.0
# A pressure column whose median exceeds this is treated as Pa rather than hPa
# (the loaded sample shows msl around 102650, i.e. about 1026 after /100).
PASCAL_MEDIAN_THRESHOLD = 2000.0

# Temperature: Kelvin → Celsius
# 't2m' is listed as well: in the inspected sample it is already in °C
# (about 16), so the guard leaves it untouched, but a Kelvin 't2m' is handled.
for temp_col in ['t2m', 'd2m', 'skt']:  # match your CSV names
    if temp_col in df.columns and df[temp_col].median() > KELVIN_MEDIAN_THRESHOLD:
        df[temp_col] = df[temp_col] - 273.15

# Pressure: Pa → hPa
if 'msl' in df.columns and df['msl'].median() > PASCAL_MEDIAN_THRESHOLD:
    df['msl'] = df['msl'] / 100

print('✅ Step 4 complete: units converted where applicable')


✅ Step 4 complete: units converted where applicable


In [5]:
# --- Step 5: Handle missing values ---
# Count missing values per column before any filling.
print(df.isna().sum())

# The frame is in long format: every timestamp is repeated once per
# (latitude, longitude) pair. Interpolating the whole frame at once would
# therefore blend values of neighbouring rows that belong to different grid
# points, and would also "interpolate" the coordinate/metadata columns.
# Instead, each grid point's own time series is interpolated separately.
grid_keys = [key for key in ('latitude', 'longitude') if key in df.columns]
non_value_cols = set(grid_keys) | {'number', 'expver'}
value_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col not in non_value_cols
]


def _fill_time_gaps(block):
    """Time-weighted interpolation of one block, filling in both directions."""
    return block.interpolate(method='time', limit_direction='both')


# Skip the (comparatively expensive) group-wise pass when nothing is missing.
if value_cols and df[value_cols].isna().any().any():
    if grid_keys:
        # transform() returns a frame indexed like df, so it can be assigned back.
        df[value_cols] = (
            df.groupby(grid_keys, sort=False)[value_cols].transform(_fill_time_gaps)
        )
    else:
        # No coordinate columns available: fall back to a single time series.
        df[value_cols] = _fill_time_gaps(df[value_cols])

# Drop any remaining rows with missing values
df = df.dropna()

print('Step 5 complete: missing values interpolated and remaining NaNs dropped')


latitude     0
longitude    0
t2m          0
d2m          0
msl          0
u10          0
v10          0
tcc          0
skt          0
number       0
expver       0
dtype: int64
Step 5 complete: missing values interpolated and remaining NaNs dropped
Step 5 complete: missing values interpolated and remaining NaNs dropped


In [6]:
# --- Step 6: Remove outliers / sanity checks with reporting ---
# Show which columns are present before the per-variable checks run.
available_columns = df.columns
print(available_columns)
def report_changes(df, col, mask, action_desc):
    """Print and return how many entries of ``mask`` are set for column ``col``.

    Parameters:
        df: accepted for call-site symmetry; not read by this function.
        col: column name used in the printed message.
        mask: object exposing ``.sum()`` (e.g. a boolean Series) that flags
            the affected values.
        action_desc: text placed at the start of the printed message.

    Returns:
        The result of ``mask.sum()``.
    """
    affected = mask.sum()
    message = f"{action_desc}: {affected} values in '{col}' were modified or removed."
    print(message)
    return affected


def _clip_and_count(frame, col, lower=None, upper=None):
    """Clip ``frame[col]`` to [lower, upper] and return how many values changed."""
    before_clip = frame[col]
    clipped = before_clip.clip(lower, upper)
    changed = (before_clip != clipped).sum()
    frame[col] = clipped
    return changed


# Temperature sanity range (°C): keep rows within [-123.15, 57.0] inclusive.
for raw_col in ['t2m', 'd2m', 'skt']:
    if raw_col in df.columns:
        in_range = df[raw_col].between(-123.15, 57.0)
        # Report exactly the rows that get removed (NaN counts as out of range,
        # matching the inclusive filter below).
        report_changes(df, raw_col, ~in_range, "Out-of-range temperature values removed")
        if not in_range.all():
            # .copy() yields an independent frame, so the column assignments
            # further down do not raise SettingWithCopyWarning.
            df = df.loc[in_range].copy()

# Cloud cover: clip 0–1
if 'tcc' in df.columns:
    changed = _clip_and_count(df, 'tcc', 0, 1)
    print(f"Cloud cover clipped (0–1): {changed} values adjusted.")

# Precipitation: cannot be negative
if 'tp' in df.columns:
    changed = _clip_and_count(df, 'tp', lower=0)
    print(f"Precipitation negatives removed: {changed} values adjusted.")

# Wind components: cap ±100 m/s
for wind_col in ['u10', 'v10']:
    if wind_col in df.columns:
        changed = _clip_and_count(df, wind_col, -100, 100)
        print(f"Wind component '{wind_col}' clipped to ±100 m/s: {changed} values adjusted.")



print("✅ Step 6 complete: outliers removed/clipped where applicable")


Index(['latitude', 'longitude', 't2m', 'd2m', 'msl', 'u10', 'v10', 'tcc',
       'skt', 'number', 'expver'],
      dtype='object')
Out-of-range temperature values removed: 0 values in 't2m' were modified or removed.
Out-of-range temperature values removed: 0 values in 'd2m' were modified or removed.
Out-of-range temperature values removed: 0 values in 'skt' were modified or removed.
Cloud cover clipped (0–1): 0 values adjusted.
Wind component 'u10' clipped to ±100 m/s: 0 values adjusted.
Wind component 'v10' clipped to ±100 m/s: 0 values adjusted.
✅ Step 6 complete: outliers removed/clipped where applicable
Cloud cover clipped (0–1): 0 values adjusted.
Wind component 'u10' clipped to ±100 m/s: 0 values adjusted.
Wind component 'v10' clipped to ±100 m/s: 0 values adjusted.
✅ Step 6 complete: outliers removed/clipped where applicable


In [7]:
# --- Step 7: Reset index and save cleaned CSV ---
cleaned_path = "era5_cleaned.csv"

# Only a *named* index (e.g. 'valid_time') carries data that belongs in the
# file. Resetting an unnamed default index, as happens when this cell is
# re-run, would add a spurious 'index' column to the CSV.
if df.index.name is not None:
    df = df.reset_index()

df.to_csv(cleaned_path, index=False)
print(f"✅ Cleaned CSV saved as '{cleaned_path}'")


✅ Cleaned CSV saved as 'era5_cleaned.csv'
