In [13]:
import pandas as pd
from datetime import datetime

In [14]:
# Define file paths
data_dir = '/workspace/COMP3610-Renewable-Energy-Prediction/data/processed/'

In [15]:
# Load all cleaned datasets
print("Loading cleaned datasets...")

# Power Generation Data
fr_power = pd.read_csv(f"{data_dir}France_Power_Generation_Cleaned.csv")
it_power = pd.read_csv(f"{data_dir}Italy_Power_Generation_Cleaned.csv")
es_power = pd.read_csv(f"{data_dir}Spain_Power_Generation_Cleaned.csv")

# Weather Data
fr_weather = pd.read_csv(f"{data_dir}weather_data_france_cleaned.csv")
it_weather = pd.read_csv(f"{data_dir}weather_data_italy_cleaned.csv")
es_weather = pd.read_csv(f"{data_dir}weather_data_spain_cleaned.csv")


Loading cleaned datasets...


In [16]:
def join_datasets(power_df, weather_df, country_name):
    """
    Join power generation data with weather data for a specific country
    
    Args:
        power_df: Cleaned power generation DataFrame
        weather_df: Cleaned weather DataFrame
        country_name: Name of the country
        
    Returns:
        pd.DataFrame: Joined dataset
    """
    print(f"\nJoining {country_name} datasets...")
    
    # Convert time columns to datetime
    power_df['time'] = pd.to_datetime(power_df['MTU'].str.split(' - ').str[0], format='%d.%m.%Y %H:%M')
    weather_df['time'] = pd.to_datetime(weather_df['time'])
    
    # Merge on time (inner join to ensure we only keep matching timestamps)
    merged_df = pd.merge(
        power_df,
        weather_df,
        on='time',
        how='inner',
        suffixes=('', '_weather')
    )
    
    # Clean up columns
    merged_df = merged_df.drop(columns=['MTU', 'country'])  # Remove redundant columns
    
    # Add country identifier back
    merged_df['country'] = country_name
    
    print(f"Joined {country_name} dataset shape:", merged_df.shape)
    print("Sample of joined data:")
    print(merged_df.head(2))
    
    return merged_df



In [17]:
# Join datasets for each country
fr_combined = join_datasets(fr_power, fr_weather, 'France')
it_combined = join_datasets(it_power, it_weather, 'Italy')
es_combined = join_datasets(es_power, es_weather, 'Spain')


Joining France datasets...
Joined France dataset shape: (59070, 13)
Sample of joined data:
          Area  YEAR  Solar  Wind Onshore Country                time  temp  \
0  France (FR)  2016    0.0        2189.0  France 2016-01-01 00:00:00   8.2   
1  France (FR)  2016    0.0        1753.0  France 2016-01-01 01:00:00   7.9   

   dwpt  rhum  prcp  wspd    pres country  
0   5.3  82.0   0.0   9.4  1024.6  France  
1   5.2  83.0   0.0   5.4  1024.8  France  

Joining Italy datasets...
Joined Italy dataset shape: (59070, 13)
Sample of joined data:
         Area  YEAR  Solar  Wind Onshore Country                time  temp  \
0  Italy (IT)  2016    0.0         203.0   Italy 2016-01-01 00:00:00   6.0   
1  Italy (IT)  2016    1.0         218.0   Italy 2016-01-01 01:00:00   6.0   

   dwpt  rhum  prcp  wspd    pres country  
0   3.0  81.0   0.0   7.0  1026.0   Italy  
1   3.0  81.0   0.0   7.0  1026.0   Italy  

Joining Spain datasets...
Joined Spain dataset shape: (59070, 13)
Sample of join

In [18]:
# Combine all countries
final_combined = pd.concat([fr_combined, it_combined, es_combined])

In [19]:
# Save results
final_combined.to_csv(f"{data_dir}final_combined_dataset.csv", index=False)
print("\nFinal combined dataset saved to:", f"{data_dir}final_combined_dataset.csv")

# Verify the final dataset
print("\nFinal dataset info:")
print(final_combined.info())
print("\nCount by country:")
print(final_combined['country'].value_counts())


Final combined dataset saved to: /workspace/COMP3610-Renewable-Energy-Prediction/data/processed/final_combined_dataset.csv

Final dataset info:
<class 'pandas.core.frame.DataFrame'>
Index: 177210 entries, 0 to 59069
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Area          177210 non-null  object        
 1   YEAR          177210 non-null  int64         
 2   Solar         177210 non-null  float64       
 3   Wind Onshore  177210 non-null  float64       
 4   Country       177210 non-null  object        
 5   time          177210 non-null  datetime64[ns]
 6   temp          177210 non-null  float64       
 7   dwpt          177210 non-null  float64       
 8   rhum          177210 non-null  float64       
 9   prcp          177210 non-null  float64       
 10  wspd          177210 non-null  float64       
 11  pres          177210 non-null  float64       
 12  country       177210 non-null  