In [2]:
import pandas as pd 

In [3]:
# Load the raw Italy power-generation CSV into a DataFrame.
# NOTE(review): the path is absolute, so the notebook only runs where this
# exact /workspace checkout exists.
df = pd.read_csv(
    filepath_or_buffer="/workspace/COMP3610-Renewable-Energy-Prediction/data/raw/Italy_Power_Generation.csv",
)

In [4]:
# Summarise the frame: index range, column names, non-null counts and the
# dtype pandas inferred for each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59070 entries, 0 to 59069
Data columns (total 25 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   Area                                                       59070 non-null  object 
 1   MTU                                                        59070 non-null  object 
 2   DATETIME                                                   59070 non-null  object 
 3   YEAR                                                       59070 non-null  int64  
 4   Biomass  - Actual Aggregated [MW]                          59047 non-null  float64
 5   Fossil Brown coal/Lignite  - Actual Aggregated [MW]        59063 non-null  object 
 6   Fossil Coal-derived gas  - Actual Aggregated [MW]          58495 non-null  float64
 7   Fossil Gas  - Actual Aggregated [MW]                       59047 non-null  float64
 8   Fossil

In [5]:
# Preview the first five rows to see how the raw records are laid out
print(df.head(n=5))

         Area                                             MTU  \
0  Italy (IT)  01.01.2016 00:00 - 01.01.2016 01:00 (CET/CEST)   
1  Italy (IT)  01.01.2016 01:00 - 01.01.2016 02:00 (CET/CEST)   
2  Italy (IT)  01.01.2016 02:00 - 01.01.2016 03:00 (CET/CEST)   
3  Italy (IT)  01.01.2016 03:00 - 01.01.2016 04:00 (CET/CEST)   
4  Italy (IT)  01.01.2016 04:00 - 01.01.2016 05:00 (CET/CEST)   

           DATETIME  YEAR  Biomass  - Actual Aggregated [MW]  \
0  01.01.2016 00:00  2016                              220.0   
1  01.01.2016 01:00  2016                              216.0   
2  01.01.2016 02:00  2016                              217.0   
3  01.01.2016 03:00  2016                              216.0   
4  01.01.2016 04:00  2016                              215.0   

  Fossil Brown coal/Lignite  - Actual Aggregated [MW]  \
0                                                n/e    
1                                                n/e    
2                                                n/e 

In [6]:
# List the dtype inferred for every column. Columns reported as `object`
# were not parsed as numbers (e.g. the brown coal/lignite column shows the
# text placeholder 'n/e' in the preview above).
print(df.dtypes)

Area                                                          object
MTU                                                           object
DATETIME                                                      object
YEAR                                                           int64
Biomass  - Actual Aggregated [MW]                            float64
Fossil Brown coal/Lignite  - Actual Aggregated [MW]           object
Fossil Coal-derived gas  - Actual Aggregated [MW]            float64
Fossil Gas  - Actual Aggregated [MW]                         float64
Fossil Hard coal  - Actual Aggregated [MW]                   float64
Fossil Oil  - Actual Aggregated [MW]                         float64
Fossil Oil shale  - Actual Aggregated [MW]                    object
Fossil Peat  - Actual Aggregated [MW]                         object
Geothermal  - Actual Aggregated [MW]                         float64
Hydro Pumped Storage  - Actual Aggregated [MW]               float64
Hydro Pumped Storage  - Actual Con

In [7]:
# Count the missing (NaN) entries in each column
print(df.isna().sum())

Area                                                             0
MTU                                                              0
DATETIME                                                         0
YEAR                                                             0
Biomass  - Actual Aggregated [MW]                               23
Fossil Brown coal/Lignite  - Actual Aggregated [MW]              7
Fossil Coal-derived gas  - Actual Aggregated [MW]              575
Fossil Gas  - Actual Aggregated [MW]                            23
Fossil Hard coal  - Actual Aggregated [MW]                     119
Fossil Oil  - Actual Aggregated [MW]                            23
Fossil Oil shale  - Actual Aggregated [MW]                       7
Fossil Peat  - Actual Aggregated [MW]                            7
Geothermal  - Actual Aggregated [MW]                            23
Hydro Pumped Storage  - Actual Aggregated [MW]                1507
Hydro Pumped Storage  - Actual Consumption [MW]              1

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max). With default
# arguments describe() reports only the numeric columns, so the object-dtype
# columns do not appear in this table.
print(df.describe())

               YEAR  Biomass  - Actual Aggregated [MW]  \
count  59070.000000                       59047.000000   
mean    2018.882123                         503.847901   
std        1.947274                         161.482369   
min     2016.000000                         167.000000   
25%     2017.000000                         354.000000   
50%     2019.000000                         534.000000   
75%     2021.000000                         651.000000   
max     2022.000000                         845.000000   

       Fossil Coal-derived gas  - Actual Aggregated [MW]  \
count                                       58495.000000   
mean                                          439.063971   
std                                           246.285346   
min                                             0.000000   
25%                                           235.000000   
50%                                           396.000000   
75%                                           620.000000 

In [None]:
# Columns retained for the renewable-energy analysis: the row identifiers and
# timestamps, followed by the solar and wind generation series.
# The generation names are written with two spaces before the dash, the same
# spacing the raw headers listed by df.info() use.
columns_to_keep = ["Area", "MTU", "DATETIME", "YEAR"] + [
    "Solar  - Actual Aggregated [MW]",
    "Wind Offshore  - Actual Aggregated [MW]",
    "Wind Onshore  - Actual Aggregated [MW]",
]

In [None]:
# Build the reduced frame from the columns listed in `columns_to_keep`.
# This assignment was missing, so the cell raised NameError on a fresh kernel.
# .copy() makes df_filtered independent of later changes to `df`.
df_filtered = df[columns_to_keep].copy()

# Now df_filtered will only contain the selected columns
print(df_filtered.head())

In [None]:
# Collapse every run of whitespace in the column names to a single space and
# trim both ends, so "Biomass  - Actual ..." becomes "Biomass - Actual ...".
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as
# a literal string by default, which would leave r'\s+' unmatched and the
# names unchanged (the single-space lookups further down would then fail).
df.columns = df.columns.str.replace(r'\s+', ' ', regex=True).str.strip()

# Check the cleaned column names and their lengths
for col in df.columns:
    print(f"'{col}' - length: {len(col)}")

In [None]:
def fill_with_avg(df, column_name):
    """Fill isolated gaps in ``df[column_name]`` with the mean of their neighbours.

    Each missing value is replaced by the average of the values in the rows
    directly before and directly after it. The frame is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column; the column is overwritten in place.
    column_name : str
        Name of the column to patch. If the frame has no such column a
        message is printed and the frame is left untouched.

    Returns
    -------
    None

    Notes
    -----
    * A gap is filled only when both neighbours are present, so a missing
      first or last row and runs of consecutive missing values stay missing.
    * A column that is not already numeric is first converted with
      ``pd.to_numeric(errors="coerce")``. Text placeholders such as ``'n/e'``
      therefore become missing values and are treated like any other gap.
      Without this step the neighbour arithmetic raises ``TypeError`` on
      text columns.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
        print(f"Column '{column_name}' not found!")
        return

    values = df[column_name]
    if not pd.api.types.is_numeric_dtype(values):
        # Text column: make it numeric so the averaging below is defined.
        values = pd.to_numeric(values, errors="coerce")

    # Mean of the previous and next row; NaN wherever either neighbour is missing.
    neighbour_mean = (values.shift(1) + values.shift(-1)) / 2
    df[column_name] = values.fillna(neighbour_mean)

# Patch the gaps in each solar and wind series, one column at a time
for series_name in (
    'Solar - Actual Aggregated [MW]',
    'Wind Offshore - Actual Aggregated [MW]',
    'Wind Onshore - Actual Aggregated [MW]',
):
    fill_with_avg(df, series_name)

In [None]:
# Re-count the missing entries per column to see what is still unfilled
print(df.isna().sum())

In [None]:
# Parse the "dd.mm.YYYY HH:MM" strings in 'DATETIME' into pandas datetimes.
# errors='coerce' turns any value that does not match the format into NaT
# instead of raising.
df['DATETIME'] = pd.to_datetime(
    arg=df['DATETIME'],
    errors='coerce',
    format='%d.%m.%Y %H:%M',
)

# Show the first rows to confirm the conversion
print(df['DATETIME'].head(n=5))