In [1]:
# Importing all the necessary libraries

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

In [2]:
# Load the raw dataset from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="Energy Dataset.csv")

In [3]:
# Preview the first five rows to sanity-check the load
data.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Heating Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84


In [4]:
# Summarise column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   X1            768 non-null    float64
 1   X2            768 non-null    float64
 2   X3            768 non-null    float64
 3   X4            768 non-null    float64
 4   X5            768 non-null    float64
 5   X6            768 non-null    int64  
 6   X7            768 non-null    float64
 7   X8            768 non-null    int64  
 8   Heating Load  768 non-null    float64
dtypes: float64(7), int64(2)
memory usage: 54.1 KB


In [5]:
# Count the missing values in every column
data.isna().sum()

X1              0
X2              0
X3              0
X4              0
X5              0
X6              0
X7              0
X8              0
Heating Load    0
dtype: int64

### Checking the variance of each column to identify constant (low-variance) columns

In [6]:
# Per-column variance; note the columns are on very different scales, so these
# numbers are not directly comparable with one another.
data.var()

X1                 0.011189
X2              7759.163842
X3              1903.269883
X4              2039.963060
X5                 3.066493
X6                 1.251630
X7                 0.017748
X8                 2.405476
Heating Load     101.812050
dtype: float64

In [7]:
# Flag low-variance ("near-constant") feature columns.
# The selector is fitted on the features only: 'Heating Load' is the target we
# predict, so it must never be a candidate for removal.
# NOTE(review): variance depends on each column's scale, so a fixed 0.2 cut-off on
# unscaled data flags small-range columns (X1, X7) even though they do vary —
# confirm that dropping them is really intended.
features = data.drop(columns=['Heating Load'])
varT = VarianceThreshold(threshold=0.2)     # Setting the threshold limit to eliminate constant columns
varT.fit(features)
constant_columns = features.columns[~varT.get_support()]    # Names of the columns below the threshold

In [8]:
# Show which columns fell below the variance threshold
constant_columns

Index(['X1', 'X7'], dtype='object')

In [9]:
# Dropping the low-variance columns.
# Rebinding `data` (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution finds the columns already gone and is a
# no-op rather than a KeyError.
data = data.drop(columns=constant_columns, errors='ignore')

### Splitting the data into independent variables (IV) and the dependent variable (DV)

In [10]:
# Display the frame after the drop to confirm which columns remain
data

Unnamed: 0,X2,X3,X4,X5,X6,X8,Heating Load
0,514.5,294.0,110.25,7.0,2,0,15.55
1,514.5,294.0,110.25,7.0,3,0,15.55
2,514.5,294.0,110.25,7.0,4,0,15.55
3,514.5,294.0,110.25,7.0,5,0,15.55
4,563.5,318.5,122.50,7.0,2,0,20.84
...,...,...,...,...,...,...,...
763,784.0,343.0,220.50,3.5,5,5,17.88
764,808.5,367.5,220.50,3.5,2,5,16.54
765,808.5,367.5,220.50,3.5,3,5,16.44
766,808.5,367.5,220.50,3.5,4,5,16.48


In [11]:
# IV: select the six predictor columns, leaving the target out
X = data.loc[:, ['X2', 'X3', 'X4', 'X5', 'X6', 'X8']]

In [12]:
# Scaling the IV

# Fit on the columns taken straight from `data` rather than on `X` itself: this cell
# overwrites `X` with the scaled array, so fitting on `X` would, on a re-run, refit
# the scaler on already-scaled values and lose the original means / standard deviations.
scaler = StandardScaler()
X = scaler.fit_transform(data[['X2', 'X3', 'X4', 'X5', 'X6', 'X8']])

In [13]:
# Inspect the scaled features (a plain NumPy array at this point, without column names)
X

array([[-1.78587489, -0.56195149, -1.47007664,  1.        , -1.34164079,
        -1.81457514],
       [-1.78587489, -0.56195149, -1.47007664,  1.        , -0.4472136 ,
        -1.81457514],
       [-1.78587489, -0.56195149, -1.47007664,  1.        ,  0.4472136 ,
        -1.81457514],
       ...,
       [ 1.55394308,  1.12390297,  0.97251224, -1.        , -0.4472136 ,
         1.41133622],
       [ 1.55394308,  1.12390297,  0.97251224, -1.        ,  0.4472136 ,
         1.41133622],
       [ 1.55394308,  1.12390297,  0.97251224, -1.        ,  1.34164079,
         1.41133622]])

In [14]:
# Wrap the scaled array in a DataFrame so each column is labelled with its feature name
Scaled_X = pd.DataFrame(
    data=X,
    columns=['X2', 'X3', 'X4', 'X5', 'X6', 'X8'],
)

In [15]:
# Display the scaled, labelled feature frame
Scaled_X

Unnamed: 0,X2,X3,X4,X5,X6,X8
0,-1.785875,-0.561951,-1.470077,1.0,-1.341641,-1.814575
1,-1.785875,-0.561951,-1.470077,1.0,-0.447214,-1.814575
2,-1.785875,-0.561951,-1.470077,1.0,0.447214,-1.814575
3,-1.785875,-0.561951,-1.470077,1.0,1.341641,-1.814575
4,-1.229239,0.000000,-1.198678,1.0,-1.341641,-1.814575
...,...,...,...,...,...,...
763,1.275625,0.561951,0.972512,-1.0,1.341641,1.411336
764,1.553943,1.123903,0.972512,-1.0,-1.341641,1.411336
765,1.553943,1.123903,0.972512,-1.0,-0.447214,1.411336
766,1.553943,1.123903,0.972512,-1.0,0.447214,1.411336


In [16]:
# DV: the target column the model will learn to predict
y = data.loc[:, 'Heating Load']

In [17]:
# Display the target series
y

0      15.55
1      15.55
2      15.55
3      15.55
4      20.84
       ...  
763    17.88
764    16.54
765    16.44
766    16.48
767    16.64
Name: Heating Load, Length: 768, dtype: float64

### Training the XGBoost Regressor model 

In [18]:
# Splitting data into training and testing data
x_train,x_test,y_train,y_test = train_test_split(Scaled_X,y, test_size = 0.2)
print('x_train shape -', x_train.shape)
print('x_test shape -', x_test.shape)

# Creating and training the Random forest Regressor model
xg_reg = XGBRegressor(max_depth = 5, n_estimators = 100)
xg_reg.fit(x_train,y_train)
y_pred = xg_reg.predict(x_test)

# Checking r2_score
r2_score = r2_score(y_test,y_pred)
print(f'The r2_score value of XGBoost Regressor Model is - {r2_score:.2f}')

x_train shape - (614, 6)
x_test shape - (154, 6)
The r2_score value of XGBoost Regressor Model is - 0.90


### Predicting the 'Heating Load' value 

In [19]:
# Display the scaled features again as a reference for the prediction input below
Scaled_X

Unnamed: 0,X2,X3,X4,X5,X6,X8
0,-1.785875,-0.561951,-1.470077,1.0,-1.341641,-1.814575
1,-1.785875,-0.561951,-1.470077,1.0,-0.447214,-1.814575
2,-1.785875,-0.561951,-1.470077,1.0,0.447214,-1.814575
3,-1.785875,-0.561951,-1.470077,1.0,1.341641,-1.814575
4,-1.229239,0.000000,-1.198678,1.0,-1.341641,-1.814575
...,...,...,...,...,...,...
763,1.275625,0.561951,0.972512,-1.0,1.341641,1.411336
764,1.553943,1.123903,0.972512,-1.0,-1.341641,1.411336
765,1.553943,1.123903,0.972512,-1.0,-0.447214,1.411336
766,1.553943,1.123903,0.972512,-1.0,0.447214,1.411336


In [20]:
# Predict the heating load for one building described in ORIGINAL (unscaled) units.
# The values are those of row 3 of the dataset. Passing them through the fitted
# `scaler` applies exactly the transformation used for training, instead of relying
# on hand-copied, rounded scaled numbers, and the DataFrame keeps the feature names
# the model was trained with.
new_building = pd.DataFrame(
    [[514.5, 294.0, 110.25, 7.0, 5, 0]],
    columns=['X2', 'X3', 'X4', 'X5', 'X6', 'X8'],
)
new_building_scaled = pd.DataFrame(
    scaler.transform(new_building), columns=new_building.columns
)
xg_reg.predict(new_building_scaled)

array([15.432874], dtype=float32)