In [4]:
import pandas as pd

In [5]:
def clean_weather_data(file_path, country_name):
    """
    Clean weather data by keeping only specified columns and handling missing values
    
    Args:
        file_path (str): Path to the raw weather CSV file
        country_name (str): Name of the country for the dataset
        
    Returns:
        pd.DataFrame: Cleaned DataFrame
    """
    # Load the raw data
    df = pd.read_csv(file_path)
    
    # Select only the columns we want to keep
    columns_to_keep = ['time', 'temp', 'dwpt', 'rhum', 'prcp', 'wspd', 'pres']
    df = df[columns_to_keep].copy()
    
    # Convert time to datetime and set as index for time-based operations
    df['time'] = pd.to_datetime(df['time'])
    df = df.set_index('time')
    
    # Handle missing values for each column
    # Temperature-related columns (time-based interpolation)
    df[['temp', 'dwpt', 'rhum']] = df[['temp', 'dwpt', 'rhum']].interpolate(method='time')
    
    # Precipitation (fill missing with 0 assuming no rain when data is missing)
    df['prcp'] = df['prcp'].fillna(0)
    
    # Wind speed (time-based interpolation)
    df['wspd'] = df['wspd'].interpolate(method='time')
    
    # Pressure (time-based interpolation)
    df['pres'] = df['pres'].interpolate(method='time')
    
    # For any remaining missing values (shouldn't be any after interpolation)
    df = df.fillna(method='ffill').fillna(method='bfill')
    
    # Add country identifier
    df['country'] = country_name
    
    # Reset index to make time a column again
    df = df.reset_index()
    
    return df



In [6]:
# File paths for each country
weather_files = {
    'France': '/workspace/COMP3610-Renewable-Energy-Prediction/data/raw/weather_data_france.csv',
    'Italy': '/workspace/COMP3610-Renewable-Energy-Prediction/data/raw/weather_data_italy.csv',
    'Spain': '/workspace/COMP3610-Renewable-Energy-Prediction/data/raw/weather_data_spain.csv'
}



In [7]:
# Process all countries
cleaned_datasets = {}
for country, file_path in weather_files.items():
    print(f"Cleaning {country} weather data...")
    cleaned_df = clean_weather_data(file_path, country)
    cleaned_datasets[country] = cleaned_df
    
    # Save cleaned data
    output_path = f'/workspace/COMP3610-Renewable-Energy-Prediction/data/processed/weather_data_{country.lower()}_cleaned.csv'
    cleaned_df.to_csv(output_path, index=False)
    print(f"Saved cleaned {country} data to {output_path}")



Cleaning France weather data...


  df = df.fillna(method='ffill').fillna(method='bfill')


Saved cleaned France data to /workspace/COMP3610-Renewable-Energy-Prediction/data/processed/weather_data_france_cleaned.csv
Cleaning Italy weather data...


  df = df.fillna(method='ffill').fillna(method='bfill')


Saved cleaned Italy data to /workspace/COMP3610-Renewable-Energy-Prediction/data/processed/weather_data_italy_cleaned.csv
Cleaning Spain weather data...


  df = df.fillna(method='ffill').fillna(method='bfill')


Saved cleaned Spain data to /workspace/COMP3610-Renewable-Energy-Prediction/data/processed/weather_data_spain_cleaned.csv


In [8]:
# Combine all countries
combined_weather = pd.concat(cleaned_datasets.values())
combined_output_path = '/workspace/COMP3610-Renewable-Energy-Prediction/data/processed/weather_data_all_countries_cleaned.csv'
combined_weather.to_csv(combined_output_path, index=False)

print("\nCleaning complete! Final combined dataset saved to:", combined_output_path)


Cleaning complete! Final combined dataset saved to: /workspace/COMP3610-Renewable-Energy-Prediction/data/processed/weather_data_all_countries_cleaned.csv
