In [1]:
import os

# Move up to the project root so that the project-local `ml` package imported
# below can be resolved. The guard makes this cell idempotent: re-running it
# no longer climbs one more directory level each time.
if not os.path.isdir("ml"):
    os.chdir("../")

In [176]:
import warnings
from typing import Optional, List, Dict

import dagshub
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    StandardScaler,
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder
)
from sklearn.tree import DecisionTreeRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor

from ml.utils.utils import (
    get_data,
    remove_outliers,
    numerical_categorical_analysis,
    anova_test,
    two_sample_independent_ttest,
)

%matplotlib inline

In [5]:
# Show every column when a DataFrame is displayed
pd.options.display.max_columns = None

# Have scikit-learn transformers return pandas objects
set_config(transform_output="pandas")

# Suppress all warnings raised from here on
warnings.filterwarnings("ignore")

In [7]:
# Load the dataset through the project helper (arguments: "eda_1", "processed")
# and preview the first rows.
df = get_data("eda_1", "processed")
df.head()

Unnamed: 0,timestamp,equipment_energy_consumption,lighting_energy,zone1_temperature,zone1_humidity,zone2_temperature,zone2_humidity,zone3_temperature,zone3_humidity,zone4_temperature,zone4_humidity,zone5_temperature,zone5_humidity,zone6_temperature,zone6_humidity,zone7_temperature,zone7_humidity,zone8_temperature,zone8_humidity,zone9_temperature,zone9_humidity,outdoor_temperature,atmospheric_pressure,outdoor_humidity,wind_speed,visibility_index,dew_point,random_variable1,random_variable2,hour,hour_category
0,2016-01-11 17:00:00,60.0,-77.787786,33.746609,47.596667,19.2,44.79,19.79,,19.0,45.566667,17.166667,55.2,,84.256667,17.2,41.626667,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433,17,Afternoon
1,2016-01-11 17:10:00,60.0,30.0,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,6.833333,84.063333,17.2,,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195,17,Afternoon
2,2016-01-11 17:20:00,50.0,30.0,19.89,46.3,19.2,44.626667,19.79,44.933333,35.921144,45.89,,55.09,6.56,83.156667,17.2,41.433333,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668,17,Afternoon
3,2016-01-11 17:30:00,50.0,40.0,33.746609,46.066667,19.2,44.59,19.79,45.0,,45.723333,17.166667,55.09,6.433333,83.423333,17.133333,41.29,18.1,94.385668,17.0,45.4,6.25,733.8,92.0,6.0,51.5,37.673716,45.410389,45.410389,17,Afternoon
4,2016-01-11 17:40:00,60.0,40.0,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,6.366667,84.893333,17.2,41.23,18.1,48.59,4.476511,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097,17,Afternoon


In [8]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(8608, 31)

In [39]:
# Numeric columns used as model inputs further down
num_cols_fs = [
    'lighting_energy',
    'zone6_temperature',
    'zone6_humidity',
    'outdoor_temperature',
    'wind_speed',
    'visibility_index',
    'dew_point',
    'random_variable1',
    'random_variable2',
]

In [40]:
# Single-element lists so they can be concatenated with `num_cols_fs`.
# NOTE(review): `time` is also the name of a stdlib module; it is not imported
# in this notebook, but the name would clash if it ever were.
time = ["timestamp"]
target = ["equipment_energy_consumption"]

In [41]:
# Inspect the selected feature, timestamp and target columns together
df[num_cols_fs+time+target]

Unnamed: 0,lighting_energy,zone6_temperature,zone6_humidity,outdoor_temperature,wind_speed,visibility_index,dew_point,random_variable1,random_variable2,timestamp,equipment_energy_consumption
0,-77.787786,,84.256667,6.600000,7.000000,63.000000,5.300000,13.275433,13.275433,2016-01-11 17:00:00,60.0
1,30.000000,6.833333,84.063333,6.483333,6.666667,59.166667,5.200000,18.606195,18.606195,2016-01-11 17:10:00,60.0
2,30.000000,6.560000,83.156667,6.366667,6.333333,55.333333,5.100000,28.642668,28.642668,2016-01-11 17:20:00,50.0
3,40.000000,6.433333,83.423333,6.250000,6.000000,51.500000,37.673716,45.410389,45.410389,2016-01-11 17:30:00,50.0
4,40.000000,6.366667,84.893333,6.133333,5.666667,47.666667,4.900000,10.084097,10.084097,2016-01-11 17:40:00,60.0
...,...,...,...,...,...,...,...,...,...,...,...
8603,-0.000000,12.890000,353.393026,11.766667,8.000000,47.666667,6.033333,,22.203528,2016-03-27 00:20:00,30.0
8604,0.000000,12.556667,37.330000,10.900000,8.000000,51.500000,,3.690379,3.690379,2016-03-27 00:30:00,30.0
8605,0.000000,11.056667,52.500000,10.033333,,55.333333,6.266667,31.403596,31.403596,2016-03-27 00:40:00,40.0
8606,0.000000,9.323333,66.400000,8.300000,8.000000,63.000000,6.500000,10.606541,10.606541,2016-03-27 01:00:00,70.0


# Train Test Split

## Missing Value Imputation (Median Strategy)

Since exploratory data analysis indicated that the missing values are Missing Completely At Random (MCAR), a median imputation strategy was chosen for numerical features. The median is a robust measure that is less affected by outliers compared to the mean, making it suitable for skewed or non-normally distributed data.

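All missing values in numerical columns are filled with the respective column's median. Note that the medians are computed on the full dataset, before the train/test split below, so the test rows also contribute to the imputed values.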

In [42]:
# Keep only rows with a known target, filtering features and target together.
# Dropping NaNs from y alone would leave X and y with different lengths and
# break the train/test split.
model_df = df[num_cols_fs + time + target].dropna(subset=target)

X = model_df.drop(columns=target)
# Median imputation of the numeric feature columns.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split, so test rows influence the imputed values.
X = X.fillna(X.median(numeric_only=True))
y = model_df['equipment_energy_consumption']

In [43]:
# Feature matrix dimensions; the row count should match `y`
X.shape

(8608, 10)

In [44]:
# Target dimensions; the row count should match `X`
y.shape

(8608,)

In [45]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the rows are timestamped readings and this split shuffles them;
# confirm that a random (rather than chronological) split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [46]:
# Preview the training features (the row index shows the shuffled order)
X_train.head()

Unnamed: 0,lighting_energy,zone6_temperature,zone6_humidity,outdoor_temperature,wind_speed,visibility_index,dew_point,random_variable1,random_variable2,timestamp
6444,0.0,-0.733333,78.226667,-0.433333,2.0,35.0,-0.45,24.814893,42.913075,2016-03-08 07:50:00
1867,0.0,9.6,99.9,9.6,4.0,43.0,8.55,2.327381,2.327381,2016-01-27 20:30:00
4048,10.0,-1.9,83.133333,-1.6,1.0,23.0,-2.3,47.18639,47.18639,2016-02-16 02:00:00
425,0.0,2.5,91.833333,3.2,4.0,40.0,-0.55,27.098964,24.7404,2016-01-15 07:10:00
188,0.0,4.833333,78.633333,5.033333,7.166667,22.833333,3.433333,8.272697,8.272697,2016-01-13 05:50:00


In [59]:
# ----- 1. Function to categorize hour -----
def categorize_hour(hour):
    """Map an hour value to a coarse part-of-day label.

    Returns "Night" for 0 <= hour < 6, "Morning" for 6 <= hour < 12,
    "Afternoon" for 12 <= hour < 18 and "Evening" for every other value
    (18 and above, negatives, NaN).
    """
    # Anything outside [0, 18) falls through to the catch-all label.
    if not 0 <= hour < 18:
        return "Evening"
    if hour < 6:
        return "Night"
    if hour < 12:
        return "Morning"
    return "Afternoon"

In [132]:
# ----- 2. Transformer to add time_of_day -----
class TimeOfDayExtractor(BaseEstimator, TransformerMixin):
    """Replace a timestamp column with a categorical ``time_of_day`` column.

    Parameters
    ----------
    timestamp_col : str, default='timestamp'
        Name of the column whose values are parsed with ``pd.to_datetime``.
    """

    def __init__(self, timestamp_col='timestamp'):
        self.timestamp_col = timestamp_col

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``time_of_day`` added and the timestamp column removed.

        The input frame is not modified.
        """
        X = X.copy()
        # Keep the hour in a local Series so an existing 'hour' column in X
        # is neither overwritten nor dropped.
        hours = pd.to_datetime(X[self.timestamp_col]).dt.hour
        X['time_of_day'] = hours.apply(categorize_hour)
        # Drop the configured column; a hard-coded 'timestamp' breaks as soon
        # as a different `timestamp_col` is passed.
        return X.drop(columns=[self.timestamp_col])

In [133]:
# ----- 3. List of numeric columns to scale -----
# Only these columns are standardised; every other numeric column passes through as-is.
numeric_cols_to_scale = ['lighting_energy']

In [134]:
# ----- 4. Function to get categorical columns -----
def get_categorical_features(df, exclude_cols=None):
    if exclude_cols is None:
        exclude_cols = []
    return df.select_dtypes(include='object').columns.difference(exclude_cols).tolist()

In [135]:
# ----- 5. Build Preprocessor Pipeline -----
def build_pipeline(df):
    """Build the unfitted preprocessing pipeline.

    The pipeline first derives ``time_of_day`` from the timestamp column, then
    standardises the columns in ``numeric_cols_to_scale``, one-hot encodes
    ``time_of_day`` and passes every remaining column through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Not used. Kept so existing ``build_pipeline(X_train)`` calls keep
        working. The previous body derived a categorical-column list from it
        but never used the result.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two-step pipeline: ``time_feature`` followed by ``column_processing``.
    """
    # Define transformers
    numeric_transformer = Pipeline([
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline([
        # Categories unseen during fit are encoded as all zeros instead of raising.
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Create final preprocessor pipeline
    preprocessor = Pipeline([
        ('time_feature', TimeOfDayExtractor()),  # Extract time_of_day from timestamp
        ('column_processing', ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_cols_to_scale),
                ('cat', categorical_transformer, ['time_of_day'])
            ],
            remainder='passthrough'
        ))
    ])

    return preprocessor

In [136]:
# Build the preprocessing pipeline and fit it on the training split only
preprocessor = build_pipeline(X_train)
X_train_trans = preprocessor.fit_transform(X_train)

In [137]:
# Apply the already-fitted preprocessing to the test split (no refitting)
X_test_trans = preprocessor.transform(X_test)

## Preprocessor Pipeline Overview

This pipeline preprocesses the dataset: it extracts a time-of-day feature from the timestamp, scales the selected numerical columns, and one-hot encodes the categorical feature. The goal is to prepare the dataset for modeling by transforming the features accordingly.

Key Steps:
Time Feature Extraction:

Custom Transformer: The TimeOfDayExtractor transformer extracts the time_of_day feature from the timestamp column in the dataset. It categorizes the time into four parts: Night, Morning, Afternoon, and Evening.

This is done using the categorize_hour function that determines the time of day based on the hour value extracted from the timestamp column.

Numeric Feature Scaling:

Standard Scaling: The numeric features are scaled using the StandardScaler, which normalizes the data by transforming it to have a mean of 0 and a standard deviation of 1.

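This step is applied only to the columns listed in `numeric_cols_to_scale` (currently just 'lighting_energy'); the other numeric columns, such as 'zone6_temperature' and 'zone6_humidity', are passed through unscaled.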

Categorical Feature Encoding:

One-Hot Encoding: Categorical columns are transformed using OneHotEncoder, which converts categorical variables into binary vectors (one-hot encoding). The time_of_day feature, which is extracted earlier, is the only categorical feature encoded in this pipeline.

ColumnTransformer:

Transformer Mapping: The ColumnTransformer is used to apply specific transformations to different types of columns:

Numeric Columns: Processed by the numeric_transformer (scaling).

Categorical Columns: Processed by the categorical_transformer (one-hot encoding for the time_of_day feature).

Passthrough: Any columns not mentioned in the transformers are passed through without transformation.

# Dagshub Integration for MLflow Experiment Tracking

In [91]:
# Check whether an MLflow tracking URI has been configured yet
mlflow.is_tracking_uri_set()

False

In [93]:
# Connect to the DagsHub repository with MLflow integration enabled.
# This may prompt for browser-based authorisation.
dagshub.init(repo_owner='pramitde726', repo_name='DS-Intern-Assignment-Pramit-De', mlflow=True)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=ece0c06c-b00f-45db-85ab-fc7d4ae3c665&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=ddbf9dc931a4310e22200251d123eab49cc6d7a0918c9f30af86bd29c5ea2641




In [94]:
# Show the tracking URI that MLflow is now using
mlflow.get_tracking_uri()

'https://dagshub.com/pramitde726/DS-Intern-Assignment-Pramit-De.mlflow'

# Baseline model

In [172]:
# Baseline: ordinary least squares on the preprocessed training features
lr = LinearRegression()
lr.fit(X_train_trans, y_train)

In [173]:
# Linear-model predictions for both splits.
# `y_pred_train` / `y_pred_test` are reassigned by each later model, so the
# metric cells must be run right after the matching prediction cell.
y_pred_train = lr.predict(X_train_trans)
y_pred_test = lr.predict(X_test_trans)

In [174]:
# 5-fold cross-validated R² of the linear model on the training split
scores = cross_val_score(
    lr,
    X_train_trans,
    y_train,
    cv=5,
    scoring="r2",
    n_jobs=-1,
)

print(scores)
print(f"Mean cross val score: {scores.mean():.2f}")

[0.03642837 0.04055507 0.03375635 0.0410906  0.02390591]
Mean cross val score: 0.04


In [177]:
# Linear model: MAE ("error") and R² for the train and test splits
eval_splits = {"train": (y_train, y_pred_train), "test": (y_test, y_pred_test)}

for metric_name, metric in (("error", mean_absolute_error), ("r2 score", r2_score)):
    for split, (actual, predicted) in eval_splits.items():
        print(f"The {split} {metric_name} is {metric(actual, predicted):.2f}")

# RMSE = square root of the mean squared error
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"The train RMSE is {train_rmse:.2f}")
print(f"The test RMSE is {test_rmse:.2f}")

The train error is 79.55
The test error is 70.09
The train r2 score is 0.04
The test r2 score is 0.05
The train RMSE is 152.72
The test RMSE is 127.70


In [178]:
# Fixed seed (same value as the train/test split) so that the fitted tree and
# its scores are reproducible across runs.
dt = DecisionTreeRegressor(random_state=42)

In [179]:
# Fit the decision tree on the preprocessed training features
dt.fit(X_train_trans, y_train)

In [180]:
# Decision-tree predictions for both splits (overwrites the previous model's predictions)
y_pred_train = dt.predict(X_train_trans)
y_pred_test = dt.predict(X_test_trans)

In [181]:
# 5-fold cross-validated R² of the decision tree on the training split
scores = cross_val_score(
    dt,
    X_train_trans,
    y_train,
    cv=5,
    scoring="r2",
    n_jobs=-1,
)

print(scores)
print(f"Mean cross val score: {scores.mean():.2f}")

[-1.04041627 -0.79118832 -0.9349637  -0.95208529 -1.08678985]
Mean cross val score: -0.96


In [182]:
# Decision tree: MAE ("error") and R² for the train and test splits
eval_splits = {"train": (y_train, y_pred_train), "test": (y_test, y_pred_test)}

for metric_name, metric in (("error", mean_absolute_error), ("r2 score", r2_score)):
    for split, (actual, predicted) in eval_splits.items():
        print(f"The {split} {metric_name} is {metric(actual, predicted):.2f}")

# RMSE = square root of the mean squared error
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"The train RMSE is {train_rmse:.2f}")
print(f"The test RMSE is {test_rmse:.2f}")

The train error is 0.00
The test error is 85.92
The train r2 score is 1.00
The test r2 score is -1.39
The train RMSE is 0.00
The test RMSE is 202.03


In [183]:
# Fixed seed (same value as the train/test split) so that the bootstrap samples,
# and therefore the reported scores, are reproducible across runs.
rf = RandomForestRegressor(random_state=42)

rf.fit(X_train_trans, y_train)

In [184]:
# Random-forest predictions for both splits (overwrites the previous model's predictions)
y_pred_train = rf.predict(X_train_trans)
y_pred_test = rf.predict(X_test_trans)

In [185]:
# 5-fold cross-validated R² of the random forest on the training split
scores = cross_val_score(
    rf,
    X_train_trans,
    y_train,
    cv=5,
    scoring="r2",
    n_jobs=-1,
)

print(scores)
print(f"Mean cross val score: {scores.mean():.2f}")

[ 0.05642578  0.04237002  0.08104699  0.03708861 -0.00289968]
Mean cross val score: 0.04


In [186]:
# Random forest: MAE ("error") and R² for the train and test splits
eval_splits = {"train": (y_train, y_pred_train), "test": (y_test, y_pred_test)}

for metric_name, metric in (("error", mean_absolute_error), ("r2 score", r2_score)):
    for split, (actual, predicted) in eval_splits.items():
        print(f"The {split} {metric_name} is {metric(actual, predicted):.2f}")

# RMSE = square root of the mean squared error
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"The train RMSE is {train_rmse:.2f}")
print(f"The test RMSE is {test_rmse:.2f}")

The train error is 28.56
The test error is 67.09
The train r2 score is 0.87
The test r2 score is 0.10
The train RMSE is 56.52
The test RMSE is 123.78


In [187]:
# Fixed seed (same value as the train/test split) so that the boosted trees,
# and therefore the reported scores, are reproducible across runs.
gbr = GradientBoostingRegressor(random_state=42)

gbr.fit(X_train_trans, y_train)

In [188]:
# Gradient-boosting predictions for both splits (overwrites the previous model's predictions)
y_pred_train = gbr.predict(X_train_trans)
y_pred_test = gbr.predict(X_test_trans)

In [189]:
# 5-fold cross-validated R² of the gradient-boosting model on the training split
scores = cross_val_score(
    gbr,
    X_train_trans,
    y_train,
    cv=5,
    scoring="r2",
    n_jobs=-1,
)

print(scores)
print(f"Mean cross val score: {scores.mean():.2f}")

[0.03271235 0.03538607 0.04495613 0.04962985 0.02768914]
Mean cross val score: 0.04


In [190]:
# Gradient boosting: MAE ("error") and R² for the train and test splits
eval_splits = {"train": (y_train, y_pred_train), "test": (y_test, y_pred_test)}

for metric_name, metric in (("error", mean_absolute_error), ("r2 score", r2_score)):
    for split, (actual, predicted) in eval_splits.items():
        print(f"The {split} {metric_name} is {metric(actual, predicted):.2f}")

# RMSE = square root of the mean squared error
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"The train RMSE is {train_rmse:.2f}")
print(f"The test RMSE is {test_rmse:.2f}")

The train error is 72.32
The test error is 69.00
The train r2 score is 0.20
The test r2 score is 0.07
The train RMSE is 139.14
The test RMSE is 125.91


## Observation

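- None of the four models generalizes well: the best test R² is about 0.10 (random forest), and the decision tree and random forest clearly overfit (train R² of 1.00 and 0.87 respectively). This suggests that the selected features are not sufficiently informative or representative to build a meaningful predictive model for this task.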