In [31]:
# Step 1: Importing libraries and loading the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [32]:
def normalized_func(data):
  """Min-max scale the sensor columns and return them next to the other features.

  The five sensor columns listed in ``normalize_features`` are scaled to the
  range [0, 1] with a ``MinMaxScaler`` that is fitted on ``data`` itself, so
  the scaling is relative to the min/max of this frame only.

  Args:
    data: DataFrame containing a 'DateTime' column (anything accepted by
      ``pd.to_datetime``), the five sensor columns, and the pass-through
      columns listed in ``passthrough_columns``.

  Returns:
    A new DataFrame with the pass-through columns first ('DateTime' converted
    to datetime) followed by the scaled sensor columns. Columns of ``data``
    that appear in neither list are not included. ``data`` is left unmodified.

  Raises:
    KeyError: if any of the expected columns is missing from ``data``.
  """
  # Columns rescaled to [0, 1].
  normalize_features = ['WindSpeed(m/s)', 'Pressure(hpa)', 'Temperature(°C)', 'Humidity(%)', 'Sunlight(Lux)']

  # Columns carried over unchanged (apart from the DateTime conversion).
  passthrough_columns = ['LocationCode', 'DateTime', 'Power(mW)','Hour',
       'Minute', 'Month', 'Hour_sin', 'Hour_cos', 'Minute_sin', 'Minute_cos',
       'Month_sin', 'Month_cos', 'DayOfYear', 'DayOfYear_sin',
       'DayOfYear_cos']

  # Selecting with a list returns a new frame and assign() returns another,
  # so the caller's DataFrame keeps its original 'DateTime' column.
  passthrough = data[passthrough_columns].assign(
      DateTime=pd.to_datetime(data['DateTime'])
  )

  scaler = MinMaxScaler(feature_range=(0, 1))
  scaled_values = scaler.fit_transform(data[normalize_features])

  # Reuse the input index: concat(axis=1) aligns on index labels, so a fresh
  # RangeIndex here would misalign rows for any input not indexed 0..n-1.
  scaled = pd.DataFrame(scaled_values, columns=normalize_features, index=data.index)

  return pd.concat([passthrough, scaled], axis=1)

In [None]:
def resample_filter_func(data:pd.DataFrame):
  """Resample to 10-minute means and keep only complete daytime days.

  Steps:
    1. Average all columns over 10-minute bins keyed on 'DateTime'.
    2. Keep bins between 07:00 and 16:50 (both inclusive).
    3. Drop bins that contain any NaN (e.g. bins with no source rows).
    4. Keep only calendar days that still have at least 60 bins, which is
       every 10-minute bin from 07:00 through 16:50.

  Args:
    data: DataFrame with a datetime-typed 'DateTime' column. It is not
      modified, so the function can be called repeatedly on the same frame.

  Returns:
    A new DataFrame with 'DateTime' as the first regular column followed by
    the averaged columns. The row index is the positional index assigned
    before NaN/day filtering, so it may contain gaps.

  Raises:
    KeyError: if ``data`` has no 'DateTime' column.
  """
  # 07:00 .. 16:50 inclusive in 10-minute steps is 60 bins per day.
  min_rows_per_day = 60

  # set_index() without inplace=True returns a new frame; the caller's
  # DataFrame keeps its 'DateTime' column.
  # NOTE(review): .mean() is applied to every column; if 'LocationCode' (or any
  # other column) is non-numeric this may raise on recent pandas — confirm dtypes.
  data_resampled = (
      data.set_index('DateTime')
          .resample('10min')
          .mean()
          .between_time("07:00", "16:50")
          .reset_index()
          .dropna()
  )

  # Temporary 'Date' column used only to group rows by calendar day.
  with_date = data_resampled.assign(Date=data_resampled['DateTime'].dt.date)
  full_days = with_date.groupby('Date').filter(lambda day: len(day) >= min_rows_per_day)

  return full_days.drop(columns=['Date'])

In [34]:
def process_pipeline(data_path:str, save_path:str):
  """Read one CSV, normalize and resample it, and write the result.

  Args:
    data_path: path of the input CSV file.
    save_path: path of the CSV file to write; the row index is not written.
  """
  raw_frame = pd.read_csv(data_path)
  # Normalize first, then resample/filter the normalized frame.
  processed = resample_filter_func(normalized_func(raw_frame))
  processed.to_csv(save_path, index=False)

In [35]:
# def process_all(dataset_num:int):
#   for step in range(1,dataset_num+1):
#     data_path = f"/home/sebastian/Desktop/AICUP-2024-Power_Prediciton/dataset/36_TrainingData_raw/L{step}_Train.csv"
#     save_path = f"/home/sebastian/Desktop/AICUP-2024-Power_Prediciton/dataset/36_TrainingData_process/L{step}_Train_resampled.csv"
#     process_pipeline(data_path,save_path)

In [36]:
def process_all(dataset_num:int,
                input_dir:str="/home/sebastian/Desktop/AICUP-2024-Power_Prediciton/dataset/36_TrainingData_interpolation",
                output_dir:str="/home/sebastian/Desktop/AICUP-2024-Power_Prediciton/dataset/36_TrainingData_interpolation_process"):
  """Run ``process_pipeline`` for location files 1 through ``dataset_num``.

  For each ``step`` the input file is ``{input_dir}/new_L{step}_Train.csv``
  and the output file is ``{output_dir}/L{step}_Train_resampled.csv``.

  Args:
    dataset_num: number of location files to process (steps 1..dataset_num).
    input_dir: directory holding the input CSV files. Defaults to the
      original hard-coded location.
    output_dir: directory the processed CSV files are written to. Defaults
      to the original hard-coded location.
  """
  for step in range(1, dataset_num + 1):
    data_path = f"{input_dir}/new_L{step}_Train.csv"
    save_path = f"{output_dir}/L{step}_Train_resampled.csv"
    process_pipeline(data_path, save_path)

In [37]:
# Run the pipeline for location files L1 through L17.
process_all(dataset_num=17)

In [30]:
# Load one interpolated training file and display its column names.
data_path = "/home/sebastian/Desktop/AICUP-2024-Power_Prediciton/dataset/36_TrainingData_interpolation/new_L1_Train.csv"
data = pd.read_csv(data_path)
data.columns

Index(['DateTime', 'LocationCode', 'WindSpeed(m/s)', 'Pressure(hpa)',
       'Temperature(°C)', 'Humidity(%)', 'Sunlight(Lux)', 'Power(mW)', 'Hour',
       'Minute', 'Month', 'Hour_sin', 'Hour_cos', 'Minute_sin', 'Minute_cos',
       'Month_sin', 'Month_cos', 'DayOfYear', 'DayOfYear_sin',
       'DayOfYear_cos'],
      dtype='object')