In [1]:
# Step 1: Import the libraries used by the preprocessing pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [2]:
def normalized_func(data):
  """Min-max scale the weather features and return them next to the other columns.

  The five sensor columns listed in ``normalize_features`` are scaled to the
  range [0, 1] with a ``MinMaxScaler`` fitted on ``data`` itself, so the
  scaling is relative to the min/max of the frame passed in. The remaining
  columns listed in ``passthrough_columns`` are copied through unchanged,
  except that ``DateTime`` is parsed with ``pd.to_datetime``.

  Args:
    data: DataFrame containing every column named in ``normalize_features``
      and ``passthrough_columns``. It is not modified.

  Returns:
    A new DataFrame with the passthrough columns first, followed by the
    scaled feature columns, sharing the index of ``data``.

  Raises:
    KeyError: if any of the expected columns is missing from ``data``.
  """
  # Sensor readings that are rescaled to [0, 1].
  normalize_features = ['WindSpeed(m/s)', 'Pressure(hpa)', 'Temperature(°C)', 'Humidity(%)', 'Sunlight(Lux)']

  # Columns carried over without scaling (identifier, target and time encodings).
  passthrough_columns = ['LocationCode', 'DateTime', 'Power(mW)','Hour',
       'Minute', 'Month', 'Hour_sin', 'Hour_cos', 'Minute_sin', 'Minute_cos',
       'Month_sin', 'Month_cos', 'DayOfYear', 'DayOfYear_sin',
       'DayOfYear_cos']

  # assign() returns a new frame, so the caller's 'DateTime' column is left
  # untouched; the column keeps its original position.
  passthrough = data[passthrough_columns].assign(DateTime=pd.to_datetime(data['DateTime']))

  scaler = MinMaxScaler(feature_range=(0, 1))
  scaled_values = scaler.fit_transform(data[normalize_features])

  # fit_transform returns a bare array: re-attach the original index so the
  # column-wise concat below pairs each scaled row with its own source row
  # even when `data` does not carry a default 0..n-1 index.
  scaled = pd.DataFrame(scaled_values, columns=normalize_features, index=data.index)

  return pd.concat([passthrough, scaled], axis=1)

In [14]:
def resample_filter_func(data:pd.DataFrame, freq:str='1min', start_time:str="07:00",
                         end_time:str="16:59", rows_per_day:int=600):
  """Resample to a regular time grid and keep only fully populated days.

  Steps:
    1. Index by 'DateTime' and average the rows falling in each ``freq`` bin.
    2. Keep only bins whose time of day lies in [``start_time``, ``end_time``].
    3. Drop bins containing any NaN (bins with no source rows average to NaN).
    4. Keep only calendar days that still have exactly ``rows_per_day`` rows.

  With the defaults (1-minute bins, 07:00-16:59) a complete day has
  10 h * 60 = 600 rows, which is where the default ``rows_per_day`` comes from.
  When changing ``freq`` or the time window, pass the matching ``rows_per_day``.

  Args:
    data: DataFrame with a datetime-typed 'DateTime' column; every other
      column must be averageable. It is not modified.
    freq: pandas offset alias for the resampling bin width.
    start_time: first time of day to keep (inclusive).
    end_time: last time of day to keep (inclusive).
    rows_per_day: number of rows a day must have to be kept.

  Returns:
    A new DataFrame with 'DateTime' as a regular column, containing only the
    rows of complete days.
  """
  # set_index without inplace returns a new frame, so the caller's DataFrame
  # keeps 'DateTime' as a column and the function can be re-run on it.
  indexed = data.set_index('DateTime')

  # Average all readings falling into each bin, then keep the daytime window.
  resampled = indexed.resample(freq).mean()
  resampled = resampled.between_time(start_time, end_time)

  # Bring DateTime back as a column and discard bins with missing values.
  resampled = resampled.reset_index().dropna()

  # Group on the calendar date directly instead of adding and then dropping a
  # temporary 'Date' column.
  calendar_day = resampled['DateTime'].dt.date.rename('Date')
  complete_days = resampled.groupby(calendar_day).filter(lambda day: len(day) == rows_per_day)

  return complete_days

In [15]:
def process_pipeline(data_path:str, save_path:str):
  """Run the preprocessing steps on one CSV file and write the result.

  The file at ``data_path`` is read with pandas, passed through
  ``normalized_func`` and then ``resample_filter_func``, and the outcome is
  written to ``save_path`` as CSV without the index column.

  Args:
    data_path: path of the input CSV file.
    save_path: path of the CSV file to write.
  """
  raw_frame = pd.read_csv(data_path)

  # Normalisation runs first, resampling and day filtering second.
  processed_frame = resample_filter_func(normalized_func(raw_frame))

  processed_frame.to_csv(save_path, index=False)

In [16]:
# def process_all(dataset_num:int):
#   for step in range(1,dataset_num+1):
#     data_path = f"/home/sebastian/Desktop/AICUP-2024-Power_Prediciton/dataset/36_TrainingData_raw/L{step}_Train.csv"
#     save_path = f"/home/sebastian/Desktop/AICUP-2024-Power_Prediciton/dataset/36_TrainingData_process/L{step}_Train_resampled.csv"
#     process_pipeline(data_path,save_path)

In [17]:
def process_all(dataset_num:int,
                input_dir:str="/home/sebastian/Desktop/AICUP-2024-Power_Prediciton/dataset/36_TrainingData_interpolation",
                output_dir:str="/home/sebastian/Desktop/AICUP-2024-Power_Prediciton/dataset/36_TrainingData_interpolation_5min"):
  """Run ``process_pipeline`` on the files numbered 1..``dataset_num``.

  For each ``step`` the input file is ``{input_dir}/new_L{step}_Train.csv`` and
  the output file is ``{output_dir}/L{step}_Train_resampled.csv``.

  Args:
    dataset_num: highest file number to process (inclusive); values below 1
      process nothing.
    input_dir: directory holding the input CSV files. The default is the
      location originally hard-coded here; pass another directory to run the
      notebook on a different machine.
    output_dir: directory the processed CSV files are written to. The default
      is the location originally hard-coded here.
  """
  # NOTE(review): the default output directory name ends in "_5min" while the
  # resampling step in this notebook uses a 1-minute frequency — confirm the
  # directory name is intended.
  for step in range(1, dataset_num+1):
    data_path = f"{input_dir}/new_L{step}_Train.csv"
    save_path = f"{output_dir}/L{step}_Train_resampled.csv"
    process_pipeline(data_path, save_path)

In [18]:
# Process the files numbered L1 through L17 (process_all iterates 1..dataset_num inclusive).
process_all(17)

In [20]:
# Spot-check a single file: load the L1 pipeline input and summarise its columns.
data_path = f"/home/sebastian/Desktop/AICUP-2024-Power_Prediciton/dataset/36_TrainingData_interpolation/new_L1_Train.csv"
# Alternative: uncomment to inspect the processed L1 output written by the pipeline instead.
# data_path = f"/home/sebastian/Desktop/AICUP-2024-Power_Prediciton/dataset/36_TrainingData_interpolation_5min/L1_Train_resampled.csv"
data = pd.read_csv(data_path)
# Last expression of the cell, so the summary table is rendered as the cell output.
# NOTE(review): the recorded output shows max Pressure(hpa) = 9009.0 and max
# Humidity(%) = 300.0; extremes like these compress the min-max scaled range of
# the remaining values — confirm whether they should be cleaned before scaling.
data.describe()

Unnamed: 0,LocationCode,WindSpeed(m/s),Pressure(hpa),Temperature(°C),Humidity(%),Sunlight(Lux),Power(mW),Hour,Minute,Month,Hour_sin,Hour_cos,Minute_sin,Minute_cos,Month_sin,Month_cos,DayOfYear,DayOfYear_sin,DayOfYear_cos
count,82708.0,82708.0,82708.0,82708.0,82708.0,82708.0,82708.0,82708.0,82708.0,82708.0,82708.0,82708.0,82708.0,82708.0,82708.0,82708.0,82708.0,82708.0,82708.0
mean,1.0,0.366623,1009.37948,31.696232,67.400872,32637.143473,363.00569,11.438059,29.478261,3.713897,0.107976,-0.722637,-0.00077,0.001089048,0.477679,-0.169699,97.986023,0.504215,-0.04946
std,0.0,0.854776,63.845463,10.143307,25.952298,30411.180597,520.429154,2.925557,17.338959,2.051924,0.639223,0.239869,0.706536,0.7076849,0.450497,0.734906,62.34883,0.364328,0.781406
min,1.0,0.0,958.15,10.5,18.9,148.33,0.01,7.0,0.0,1.0,-0.965926,-1.0,-1.0,-1.0,-0.5,-1.0,1.0,-0.39359,-0.999963
25%,1.0,0.0,1003.76,23.0,43.4,9296.46,18.0,9.0,14.0,2.0,-0.5,-0.965926,-0.669131,-0.6691306,0.5,-0.866025,40.0,0.221922,-0.865307
50%,1.0,0.0,1007.98,31.5,62.8,21352.5,95.685,11.0,29.0,4.0,0.258819,-0.707107,0.0,2.832769e-16,0.5,-0.5,93.0,0.566702,-0.03012
75%,1.0,0.17,1013.65,39.7,100.0,47926.2475,496.0675,14.0,44.0,5.0,0.707107,-0.5,0.669131,0.7431448,0.866025,0.5,152.0,0.821477,0.772157
max,1.0,8.88,9009.0,61.8,300.0,125926.640625,2598.45,17.0,59.0,7.0,0.965926,-0.258819,1.0,1.0,1.0,0.866025,206.0,0.999991,0.999852
