# Weather and Electricity Data Preprocessing Notebook


### Importing libraries

In [1]:

import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler




### Data formatting

In [2]:

def format_climate(file_path):
    """Load one climate CSV and average its readings per date.

    Rows that contain a '-' placeholder or any missing value are discarded.
    'Date' is parsed as ``%Y-%m-%d``, 'Year' and 'Month' are derived from it,
    the 'Station' column is removed, and the six measurement columns are
    coerced to numbers (entries that cannot be parsed become NaN). Every
    remaining column is then averaged over rows sharing the same 'Date'.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'Date', with 'Date' as a regular column.
    """
    measurement_columns = (
        'Total Rainfall Mm',
        'Mean Air Temperature in Celsius',
        'Mean Vapour Pressure Hpa',
        'Mean Wind Speed M/Sec',
        'Days Of Wind Gust >= 24 Knots in Day',
        'Standard Deviation Of Daily Mean Temperature in Celsius',
    )

    # '-' marks a missing reading in the raw file; treat it like any other gap.
    readings = pd.read_csv(file_path).replace('-', pd.NA).dropna()

    parsed_dates = pd.to_datetime(readings['Date'], format='%Y-%m-%d')
    readings = readings.assign(
        Date=parsed_dates,
        Year=parsed_dates.dt.year,
        Month=parsed_dates.dt.month,
    ).drop(columns='Station')

    # Columns that held a '-' were read as text, so force them back to numbers.
    as_numbers = {
        name: pd.to_numeric(readings[name], errors='coerce')
        for name in measurement_columns
    }
    readings = readings.assign(**as_numbers)

    return readings.groupby('Date').mean().reset_index()

In [3]:
def format_eletricity_data(file_path, business=False):
    """Load an electricity trends CSV and strip the columns not used later.

    'Period start' is parsed as ``%d/%m/%y``, rows with any missing value are
    removed, and the index is renumbered from zero. The set of columns that is
    dropped afterwards depends on which kind of file is being read.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV; passed straight to ``pd.read_csv``.
    business : bool, default False
        True for the business-demand layout, False for the residential
        consumption layout.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame without the layout-specific columns listed below.

    Raises
    ------
    KeyError
        If any of the columns to be dropped is absent from the file.
    """
    business_extras = [
        'Regional demand (GWh)',
        'Proportion of regional demand (%)',
        'National demand (GWh)',
        'Proportion of national demand (%)',
    ]
    residential_extras = [
        'Est. Total demand (GWh)',
        'Est. proportion of regional demand',
        'Est. proportion of national demand',
        'Average consumption (kWh)',
        'Average daily consumption (kWh)',
        '5th percentile (kWh)',
        '25th percentile (kWh)',
        '50th percentile (kWh)',
        '75th percentile (kWh)',
        '95th percentile (kWh)',
        'Coverage %',
    ]

    trends = pd.read_csv(file_path)
    trends['Period start'] = pd.to_datetime(trends['Period start'], format='%d/%m/%y')
    trends = trends.dropna().reset_index(drop=True)

    unwanted = business_extras if business else residential_extras
    return trends.drop(columns=unwanted)
    

In [4]:
def read_climate_list(file_list, file_path):
    """Format every climate file in ``file_list`` with ``format_climate``.

    Parameters
    ----------
    file_list : iterable of str
        File names, each appended directly to ``file_path``.
    file_path : str
        Prefix placed in front of every file name by plain string
        concatenation, so it must already end with a path separator.

    Returns
    -------
    list
        One ``format_climate`` result per file, in the order of ``file_list``.
    """
    return [format_climate(file_path + file_name) for file_name in file_list]

In [5]:
def read_electricity_list(file_list, file_path):
    """Format the business and residential electricity files.

    Only the first two entries of ``file_list`` are read: entry 0 is formatted
    with the business layout and entry 1 with the residential layout.

    Parameters
    ----------
    file_list : sequence of str
        File names; position 0 is the business file, position 1 the
        residential file.
    file_path : str
        Prefix placed in front of each file name by plain string
        concatenation, so it must already end with a path separator.

    Returns
    -------
    list
        ``[business_frame, residential_frame]``.
    """
    return [
        format_eletricity_data(file_path + file_list[0], True),
        format_eletricity_data(file_path + file_list[1]),
    ]

In [6]:
def merge_data(climate_data, electricity_data):
    """Attach climate readings to electricity rows by date.

    This is a right join: every row of ``electricity_data`` is kept and
    matched to ``climate_data`` where 'Date' equals 'Period start'. Rows with
    no matching date get missing climate values. The match uses the date only;
    region is not part of the join key.

    Parameters
    ----------
    climate_data : pandas.DataFrame
        Must contain a 'Date' column.
    electricity_data : pandas.DataFrame
        Must contain 'Period start', 'Period end', 'Region ID',
        'Region description' and 'Region'.

    Returns
    -------
    pandas.DataFrame
        The joined frame with the period and region columns removed.
    """
    period_and_region_columns = [
        'Period start',
        'Period end',
        'Region ID',
        'Region description',
        'Region',
    ]
    joined = pd.merge(
        climate_data,
        electricity_data,
        how='right',
        left_on='Date',
        right_on='Period start',
    )
    return joined.drop(columns=period_and_region_columns)


### Read data files using helper functions above

In [7]:
# Climate files, one per region. The order matters: the frames returned by
# read_climate_list keep this order and are later picked out by position.
ClimateDatafilelist = [
    'NewUpperNorthRegions.csv',    # index 0
    'NewLowerSouthRegions.csv',    # index 1
    'NewUpperSouthARegions.csv',   # index 2
    'NewLowerNorthRegions.csv',    # index 3
    'NewCentralNorthRegions.csv',  # index 4
]
ClimatePath = '../ClimateDataProcessing/'

# Electricity files. read_electricity_list treats entry 0 as the business file
# and entry 1 as the residential file.
ElectricityDatafilelist = [
    'Business_demand_trends_20240901140306.csv',
    'Residential_consumption_trends_20240901141207.csv',
]
ElectricityPath = '../ElectricityData/'

climate_data = read_climate_list(ClimateDatafilelist, ClimatePath)
electricity_data = read_electricity_list(ElectricityDatafilelist, ElectricityPath)

### Merge data files

Here we combine the dataframes so that the analysis can be carried out region by region more easily.

In [12]:
# Inner-join the business frame (entry 0) with the residential frame (entry 1).
# No join key is given, so pandas matches on every column name the two share.
electricity_Bus_res = electricity_data[0].merge(electricity_data[1], how='inner')
electricity_Bus_res.head()


Unnamed: 0,Period start,Period end,Region ID,Region description,Selected business demand (GWh),Region,Est. Total consumption (GWh)
0,2010-01-01,31/01/10,CNI,Central North Island,502.788,Central North Island,177.005
1,2010-02-01,28/02/10,CNI,Central North Island,475.792,Central North Island,162.78
2,2010-03-01,31/03/10,CNI,Central North Island,539.23,Central North Island,182.505
3,2010-04-01,30/04/10,CNI,Central North Island,505.722,Central North Island,188.663
4,2010-05-01,31/05/10,CNI,Central North Island,519.263,Central North Island,222.647


In [13]:
def _merge_region(climate_index, region_id):
    """Merge one region's climate frame with that same region's electricity rows.

    merge_data joins on date only and then drops 'Region ID', so passing the
    full electricity_Bus_res would pair a single region's climate with the
    electricity rows of every region. Filtering first keeps each regional
    frame limited to its own region.

    Parameters
    ----------
    climate_index : int
        Position of the region's frame in climate_data (the order of
        ClimateDatafilelist).
    region_id : str
        Value of 'Region ID' identifying the region in electricity_Bus_res.

    Raises
    ------
    ValueError
        If no electricity row carries ``region_id``, so that a wrong code
        fails loudly rather than yielding an empty frame.
    """
    region_rows = electricity_Bus_res[electricity_Bus_res['Region ID'] == region_id]
    if region_rows.empty:
        raise ValueError(f"No electricity rows found for Region ID {region_id!r}")
    return merge_data(climate_data[climate_index], region_rows)


# NOTE(review): 'CNI' appears in the electricity_Bus_res preview; the other
# four codes are inferred from the variable names - confirm against the
# values actually present in the 'Region ID' column.
uni_climate_electricity = _merge_region(0, 'UNI')
cni_climate_electricity = _merge_region(4, 'CNI')
lni_climate_electricity = _merge_region(3, 'LNI')

usi_climate_electricity = _merge_region(2, 'USI')
lsi_climate_electricity = _merge_region(1, 'LSI')



# Transformations


# Feature Selection

In [None]:
def lasso_feature_selection(X, y, alpha):
    """Fit a Lasso model and report which features keep a non-zero weight.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its ``columns`` are used to name the kept features.
    y : array-like
        Target values passed to ``Lasso.fit``.
    alpha : float
        Regularisation strength passed to ``Lasso``.

    Returns
    -------
    tuple
        ``(coefficients, kept_columns)``: the fitted ``coef_`` array and the
        entries of ``X.columns`` whose coefficient is not exactly zero.
    """
    model = Lasso(alpha=alpha)
    model.fit(X, y)

    kept_positions = []
    for position, weight in enumerate(model.coef_):
        if weight != 0:
            kept_positions.append(position)

    return model.coef_, X.columns[kept_positions]