In [27]:
import pandas as pd

def clean_weather_csv(filepath):
    """
    Cleans Met Office–style weather CSV/text files.
    Removes metadata, handles missing values, and drops
    sun_hours and air_frost_days.

    Parameters
    ----------
    filepath : str
        Path to raw weather CSV file

    Returns
    -------
    pandas.DataFrame
        Fully cleaned weather dataset
    """

    with open(filepath, "r") as f:
        lines = f.readlines()

    rows = []

    for line in lines:
        # Remove quotes and whitespace
        line = line.replace('"', '').strip()
        if not line:
            continue

        parts = line.split()

        # Data rows start with a 4-digit year
        if parts[0].isdigit() and len(parts[0]) == 4 and len(parts) >= 7:
            rows.append(parts[:7])

    # Create DataFrame
    df = pd.DataFrame(
        rows,
        columns=[
            "year",
            "month",
            "tmax",
            "tmin",
            "air_frost_days",
            "rain_mm",
            "sun_hours"
        ]
    )

    # Replace missing-value markers
    df.replace(["---", "*"], pd.NA, inplace=True)

    # Convert all columns to numeric
    df = df.apply(pd.to_numeric, errors="coerce")

    # Drop unwanted columns
    df = df.drop(columns=["sun_hours", "air_frost_days"])

    # Remove rows with ANY missing values
    df = df.dropna()

    return df

In [28]:
df_clean = clean_weather_csv("Data/Raw_weather_data/Armagh.csv")
df_clean.to_csv("Armagh_cleaned.csv", index=False)

In [31]:
print(df_clean.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1909 entries, 147 to 2075
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   year     1909 non-null   int64  
 1   month    1909 non-null   int64  
 2   tmax     1909 non-null   float64
 3   tmin     1909 non-null   float64
 4   rain_mm  1909 non-null   float64
dtypes: float64(3), int64(2)
memory usage: 89.5 KB
None


In [32]:
from pathlib import Path

# Define folders
raw_dir = Path("data/raw_weather_data")
clean_dir = Path("data/cleaned_data")

# Create output folder if it doesn't exist
clean_dir.mkdir(parents=True, exist_ok=True)

# Loop through all CSV files
for file in raw_dir.glob("*.csv"):
    df_clean = clean_weather_csv(file)

    output_file = clean_dir / f"{file.stem}_cleaned.csv"
    df_clean.to_csv(output_file, index=False)

    print(f"Cleaned {file.name} → {output_file.name}")

Cleaned Aberporth.csv → Aberporth_cleaned.csv
Cleaned Armagh.csv → Armagh_cleaned.csv
Cleaned Chivenor.csv → Chivenor_cleaned.csv
Cleaned Manston.csv → Manston_cleaned.csv
Cleaned Wick-Airport.csv → Wick-Airport_cleaned.csv
