In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
def load_data(filepath):
    """
    Read a CSV file and return its rows ordered by the 'date' column.

    Parameters
    ----------
    filepath : path or file-like accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with 'date' converted via ``pd.to_datetime`` and the
        rows sorted ascending on that column. Row labels are NOT reset, so
        they still reflect each row's position in the file.

    Raises
    ------
    KeyError
        If the file has no 'date' column.
    """
    return (
        pd.read_csv(filepath)
        # Replace the raw 'date' column in place (same column position).
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .sort_values(by='date')
    )

In [3]:
def remove_outliers_iqr(df, column='Appliances'):
    """
    Drop rows whose `column` value lies outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str, default 'Appliances'
        Column on which the quartiles and fences are computed.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` whose value is not strictly below Q1 - 1.5 * IQR and
        not strictly above Q3 + 1.5 * IQR. Original row labels are kept.
    """
    values = df[column]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile

    low_fence = lower_quartile - 1.5 * spread
    high_fence = upper_quartile + 1.5 * spread

    # Expressed as "not an outlier" on purpose: NaN compares False on both
    # sides, so rows with a missing value are retained rather than dropped.
    is_outlier = values.lt(low_fence) | values.gt(high_fence)
    return df.loc[~is_outlier]

In [4]:
def split_and_scale(df, target_col='Appliances', train_split=0.8):
    """
    Split the rows into a leading and a trailing partition and min-max scale both.

    The columns 'date', 'day_of_week', 'cluster', 'outlier_flag', 'rv1' and
    'rv2' are removed when present. Every remaining column is scaled; nothing
    in this function separates a target column from the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; rows are split in their existing order (no shuffling).
    target_col : str, default 'Appliances'
        Accepted for interface compatibility; not read by this function.
    train_split : float, default 0.8
        Fraction of rows (truncated with ``int``) assigned to the first
        partition.

    Returns
    -------
    (train_scaled, test_scaled, scaler)
        Two DataFrames built from the scaler's transformed output with the
        retained column names, and the fitted ``MinMaxScaler``. The frames are
        constructed without an explicit index, so row labels from `df` are not
        carried over.
    """
    excluded = ('date', 'day_of_week', 'cluster', 'outlier_flag', 'rv1', 'rv2')
    present = [name for name in excluded if name in df.columns]
    features = df.drop(columns=present)

    cut = int(len(features) * train_split)
    train_df = features[0:cut]
    test_df = features[cut:]

    # Fit on the first partition only so statistics of the held-out rows
    # do not influence the scaling.
    scaler = MinMaxScaler().fit(train_df)

    def _scaled_frame(part):
        # Wrap the transformed values back into a frame with the kept columns.
        return pd.DataFrame(scaler.transform(part), columns=features.columns)

    return _scaled_frame(train_df), _scaled_frame(test_df), scaler