In [2]:
import pandas as pd 

In [3]:
# Location of the raw Spain power-generation CSV that the rest of the notebook works on.
raw_csv_path = "/workspace/COMP3610-Renewable-Energy-Prediction/data/raw/Spain_Power_Generation.csv"
df = pd.read_csv(raw_csv_path)

In [4]:
# Summarise the frame's structure: the number of rows and, for every column,
# its non-null count and dtype. Useful for spotting sparse or mistyped columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76969 entries, 0 to 76968
Data columns (total 25 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   Area                                                       76969 non-null  object 
 1   MTU                                                        76969 non-null  object 
 2   DATETIME                                                   76969 non-null  object 
 3   YEAR                                                       76969 non-null  int64  
 4   Biomass  - Actual Aggregated [MW]                          76906 non-null  float64
 5   Fossil Brown coal/Lignite  - Actual Aggregated [MW]        76905 non-null  float64
 6   Fossil Coal-derived gas  - Actual Aggregated [MW]          76906 non-null  float64
 7   Fossil Gas  - Actual Aggregated [MW]                       76907 non-null  float64
 8   Fossil

In [5]:
# Peek at the leading rows to see how the columns are laid out.
print(df.head(n=5))

         Area                                             MTU  \
0  Spain (ES)  01.01.2015 00:00 - 01.01.2015 01:00 (CET/CEST)   
1  Spain (ES)  01.01.2015 01:00 - 01.01.2015 02:00 (CET/CEST)   
2  Spain (ES)  01.01.2015 02:00 - 01.01.2015 03:00 (CET/CEST)   
3  Spain (ES)  01.01.2015 03:00 - 01.01.2015 04:00 (CET/CEST)   
4  Spain (ES)  01.01.2015 04:00 - 01.01.2015 05:00 (CET/CEST)   

           DATETIME  YEAR  Biomass  - Actual Aggregated [MW]  \
0  01.01.2015 00:00  2015                              447.0   
1  01.01.2015 01:00  2015                              449.0   
2  01.01.2015 02:00  2015                              448.0   
3  01.01.2015 03:00  2015                              438.0   
4  01.01.2015 04:00  2015                              428.0   

   Fossil Brown coal/Lignite  - Actual Aggregated [MW]  \
0                                              329.0     
1                                              328.0     
2                                              323

In [6]:
# List every column's dtype to confirm which columns were read as numbers
# (int64/float64) and which as generic objects, e.g. the text columns.
print(df.dtypes)

Area                                                          object
MTU                                                           object
DATETIME                                                      object
YEAR                                                           int64
Biomass  - Actual Aggregated [MW]                            float64
Fossil Brown coal/Lignite  - Actual Aggregated [MW]          float64
Fossil Coal-derived gas  - Actual Aggregated [MW]            float64
Fossil Gas  - Actual Aggregated [MW]                         float64
Fossil Hard coal  - Actual Aggregated [MW]                   float64
Fossil Oil  - Actual Aggregated [MW]                         float64
Fossil Oil shale  - Actual Aggregated [MW]                   float64
Fossil Peat  - Actual Aggregated [MW]                        float64
Geothermal  - Actual Aggregated [MW]                         float64
Hydro Pumped Storage  - Actual Aggregated [MW]               float64
Hydro Pumped Storage  - Actual Con

In [7]:
# Count the missing entries in each column.
print(df.isna().sum())

Area                                                             0
MTU                                                              0
DATETIME                                                         0
YEAR                                                             0
Biomass  - Actual Aggregated [MW]                               63
Fossil Brown coal/Lignite  - Actual Aggregated [MW]             64
Fossil Coal-derived gas  - Actual Aggregated [MW]               63
Fossil Gas  - Actual Aggregated [MW]                            62
Fossil Hard coal  - Actual Aggregated [MW]                      63
Fossil Oil  - Actual Aggregated [MW]                            65
Fossil Oil shale  - Actual Aggregated [MW]                      62
Fossil Peat  - Actual Aggregated [MW]                           63
Geothermal  - Actual Aggregated [MW]                            63
Hydro Pumped Storage  - Actual Aggregated [MW]               76969
Hydro Pumped Storage  - Actual Consumption [MW]               

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
print(df.describe())

               YEAR  Biomass  - Actual Aggregated [MW]  \
count  76969.000000                       76906.000000   
mean    2018.810404                         412.978051   
std        2.403026                          86.916096   
min     2015.000000                           0.000000   
25%     2017.000000                         351.000000   
50%     2019.000000                         400.000000   
75%     2021.000000                         487.000000   
max     2022.000000                         609.000000   

       Fossil Brown coal/Lignite  - Actual Aggregated [MW]  \
count                                       76905.000000     
mean                                          210.971471     
std                                           328.973514     
min                                             0.000000     
25%                                             0.000000     
50%                                             0.000000     
75%                                        

In [None]:
# Columns retained for the analysis. The generation column names are written
# exactly as they appear in the raw file, i.e. with two spaces before the dash.
columns_to_keep = [
    # Row identifiers and time columns
    "Area", "MTU", "DATETIME", "YEAR",
    # Solar and wind generation series
    "Solar  - Actual Aggregated [MW]",
    "Wind Offshore  - Actual Aggregated [MW]",
    "Wind Onshore  - Actual Aggregated [MW]",
]

In [None]:
# Build the reduced frame holding only the selected columns. This assignment is
# required: without it `df_filtered` is undefined and the cell raises NameError.
# `columns_to_keep` uses the raw (double-space) column names, so this must run
# before the column names of `df` are normalised.
# .copy() makes the subset independent of `df`, so later edits to either frame
# do not affect the other.
df_filtered = df[columns_to_keep].copy()
print(df_filtered.head())

In [None]:
# Collapse every run of whitespace in the column names to a single space and
# trim the ends, e.g. "Solar  - Actual Aggregated [MW]" becomes
# "Solar - Actual Aggregated [MW]".
# regex=True is required: since pandas 2.0, str.replace treats the pattern as a
# literal string by default, so r'\s+' would match nothing and the names would
# silently stay unchanged.
df.columns = df.columns.str.replace(r'\s+', ' ', regex=True).str.strip()

# Check the cleaned column names and their lengths
for col in df.columns:
    print(f"'{col}' - length: {len(col)}")

In [None]:
def fill_with_avg(df, column_name):
    """Fill interior gaps in ``df[column_name]`` in place by linear interpolation.

    A single missing value is replaced by the average of the previous and next
    values. A run of several consecutive missing values is filled along the
    straight line between the nearest valid values on either side; a plain
    neighbour average would leave such runs as NaN because the adjacent
    neighbour is itself missing. Leading and trailing gaps have no valid value
    on one side and are left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is reassigned on this object.
    column_name : str
        Name of the numeric column whose gaps should be filled.

    Returns
    -------
    None
        The frame is updated in place. If ``column_name`` is not a column of
        ``df``, a message is printed and the frame is left unchanged.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
        print(f"Column '{column_name}' not found!")
        return

    # method="linear" treats rows as equally spaced, so a lone gap receives the
    # mean of its two neighbours. limit_area="inside" restricts filling to gaps
    # that have valid values on both sides (no extrapolation at the edges).
    df[column_name] = df[column_name].interpolate(method="linear", limit_area="inside")

# Fill the gaps in the solar and wind generation columns, addressed by their
# single-space column names.
for column in (
    'Solar - Actual Aggregated [MW]',
    'Wind Offshore - Actual Aggregated [MW]',
    'Wind Onshore - Actual Aggregated [MW]',
):
    fill_with_avg(df, column)

In [None]:
# Recount the missing entries per column to see what is still unfilled.
print(df.isna().sum())

In [None]:
# Convert the 'DATETIME' column (day.month.year hour:minute strings) to pandas
# datetime format.
# errors='coerce' turns unparseable entries into NaT instead of raising, so the
# number of coerced values is reported explicitly rather than letting bad rows
# pass unnoticed.
parsed_datetime = pd.to_datetime(df['DATETIME'], format='%d.%m.%Y %H:%M', errors='coerce')
unparsed_count = int((parsed_datetime.isna() & df['DATETIME'].notna()).sum())
df['DATETIME'] = parsed_datetime
if unparsed_count:
    print(f"Warning: {unparsed_count} DATETIME value(s) could not be parsed and were set to NaT")

# Verify the conversion
print(df['DATETIME'].head())