In [45]:
# import necessary libraries
import pandas as pd
from datetime import datetime
import numpy as np

# path of the raw CSV that this notebook preprocesses
input_file = 'prosumer_daily_data1.csv'

# announce the step, then load the whole file into a DataFrame
print("Reading input file...")
df = pd.read_csv(filepath_or_buffer=input_file)

# remember the (rows, columns) of the untouched data; the final summary
# cell reports the original row count from this tuple
original_shape = df.shape
print(f"Original data shape: {original_shape}")

# preview the first five raw rows as the cell's rich output
print("\nOriginal data sample:")
df.head(5)

Reading input file...
Original data shape: (180106, 6)

Original data sample:


Unnamed: 0,Customer,Postcode,date,GC,GG,net_load
0,11,2026,2010-07-01,26.873,6.743,20.13
1,11,2026,2010-07-02,20.961,1.977,18.984
2,11,2026,2010-07-03,37.023,7.305,29.718
3,11,2026,2010-07-04,31.955,6.573,25.382
4,11,2026,2010-07-05,34.751,1.378,33.373


In [47]:
# narrow the frame down to the three columns used by the rest of the notebook
print("Keeping only required columns...")
columns_to_keep = [
    'Customer',
    'Postcode',
    'date',
]
df = df[columns_to_keep]

# preview the first five rows of the reduced frame
print("\nData after keeping selected columns:")
df.head(5)

Keeping only required columns...

Data after keeping selected columns:


Unnamed: 0,Customer,Postcode,date
0,11,2026,2010-07-01
1,11,2026,2010-07-02
2,11,2026,2010-07-03
3,11,2026,2010-07-04
4,11,2026,2010-07-05


In [49]:
# parse the 'date' column into pandas datetimes so the year-shifting
# helpers below can work on real date objects
print("Converting dates...")
df['date'] = df['date'].pipe(pd.to_datetime)

# function to convert historical dates to 2024
def convert_to_2024(date):
    """Return ``date`` with its year set to 2024.

    Month, day and any time-of-day fields are carried over unchanged.

    Parameters
    ----------
    date : object with a datetime-style ``replace(year=...)`` method.

    Returns
    -------
    The same kind of object, re-dated to the year 2024.
    """
    # 2024 is a leap year, so a Feb 29 input remains a valid date.
    target_year = 2024
    return date.replace(year=target_year)

# function to convert early 2024 dates to 2025
def convert_early_dates_to_2025(date):
    """Shift 2024 dates on or before February 1st into 2025.

    A date whose year is 2024 and which falls in January, or on February 1st,
    is returned with its year replaced by 2025. Every other date (a later day
    in 2024, or any other year) is returned unchanged.

    Parameters
    ----------
    date : object exposing ``year``, ``month``, ``day`` and ``replace(year=...)``.

    Returns
    -------
    Either ``date`` itself or a copy of it re-dated to 2025.
    """
    # Only dates in 2024 are candidates for the shift.
    if date.year != 2024:
        return date

    # (month, day) <= (2, 1) holds for all of January and for February 1st.
    if (date.month, date.day) <= (2, 1):
        return date.replace(year=2025)

    return date

# re-date every row: first force the year to 2024, then push the dates that
# convert_early_dates_to_2025 selects (January and Feb 1st) into 2025
df['date'] = (
    df['date']
    .apply(convert_to_2024)
    .apply(convert_early_dates_to_2025)
)

# keep only dates inside the desired window (both ends inclusive) because of
# the API plan restriction
# NOTE(review): the upper bound is 2025-02-02, but convert_early_dates_to_2025
# only moves dates through Feb 1st into 2025, so the latest surviving date is
# 2025-02-01 -- confirm whether Feb 2nd was meant to be included.
mask = df['date'].between('2024-02-05', '2025-02-02')
df = df[mask]

# preview the first five rows after re-dating and filtering
print("\nData after date conversion:")
df.head(5)

Converting dates...

Data after date conversion:


Unnamed: 0,Customer,Postcode,date
0,11,2026,2024-07-01
1,11,2026,2024-07-02
2,11,2026,2024-07-03
3,11,2026,2024-07-04
4,11,2026,2024-07-05


In [51]:
# keep one row per (Postcode, date) pair -- the first occurrence wins --
# and then order the result by postcode and date
print("Removing duplicates...")
df = (
    df
    .drop_duplicates(subset=['Postcode', 'date'])
    .sort_values(by=['Postcode', 'date'])
)

# turn the datetimes back into 'YYYY-MM-DD' strings
df['date'] = df['date'].dt.strftime('%Y-%m-%d')

# preview the first five processed rows
print("\nProcessed data sample:")
df.head(5)

Removing duplicates...

Processed data sample:


Unnamed: 0,Customer,Postcode,date
67075,116,2008,2024-02-05
67076,116,2008,2024-02-06
67077,116,2008,2024-02-07
67078,116,2008,2024-02-08
67079,116,2008,2024-02-09


In [53]:
# write the processed frame to disk without the index column
output_file = '2024_weather_data_preprocessed.csv'
df.to_csv(path_or_buf=output_file, index=False)

# report row counts before/after, postcode coverage and the date span;
# 'date' holds 'YYYY-MM-DD' strings here, so min/max order chronologically
print(
    "\nProcessing Summary:",
    f"Original records: {original_shape[0]}",
    f"Processed records: {len(df)}",
    f"Unique postcodes: {df['Postcode'].nunique()}",
    f"Date range: {df['date'].min()} to {df['date'].max()}",
    sep="\n",
)


Processing Summary:
Original records: 180106
Processed records: 28314
Unique postcodes: 78
Date range: 2024-02-05 to 2025-02-01
