In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [95]:
# Load the raw dataset.
# The CSV lives in the notebook's own working directory (the working-directory
# check at the end of this notebook prints the same folder the old absolute
# path pointed at), so a relative path is equivalent here and keeps the
# notebook runnable on other machines.
DATA_PATH = 'Data01.csv'
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,S no,Temperature,Humidity,Wind_Speed,general_diffuse_flows,diffuse_flows,Air_Quality_Index_PM,Cloudiness,Power_Consumption_in_A_Zone
0,1,6.559,73.8,0.083,0.051,0.119,158.0,1,34055.6962
1,2,6.414,74.5,0.083,0.07,0.085,159.0,1,29814.68354
2,3,6.313,74.5,0.08,0.062,0.1,151.0,1,29128.10127
3,4,6.121,75.0,0.083,0.091,0.096,151.0,1,28228.86076
4,5,5.921,75.7,0.081,0.048,0.085,154.0,1,27335.6962


In [96]:
# Count of missing entries per column.
data.isna().sum()

S no                              0
Temperature                     120
Humidity                          9
Wind_Speed                       55
general_diffuse_flows           168
diffuse_flows                     0
Air_Quality_Index_PM             17
Cloudiness                        0
 Power_Consumption_in_A_Zone      0
dtype: int64

In [97]:
# Per-column overview: share and count of missing values, dtype, number of
# distinct values, and the first two rows as sample values (s1, s2).
missing_mask = data.isnull()

sample_rows = data.head(2).T
sample_rows.columns = ['s1', 's2']

info = pd.concat(
    [
        (missing_mask.mean() * 100).round(2).rename('null_percent'),
        missing_mask.sum().rename('null_value_count'),
        data.dtypes.rename('Data_type'),
        data.nunique().rename('Unique_values'),
        sample_rows,
    ],
    axis=1,
)
info

Unnamed: 0,null_percent,null_value_count,Data_type,Unique_values,s1,s2
S no,0.0,0,int64,5000,1.0,2.0
Temperature,2.4,120,float64,1365,6.559,6.414
Humidity,0.18,9,float64,1602,73.8,74.5
Wind_Speed,1.1,55,float64,76,0.083,0.083
general_diffuse_flows,3.36,168,float64,1968,0.051,0.07
diffuse_flows,0.0,0,float64,2045,0.119,0.085
Air_Quality_Index_PM,0.34,17,float64,10,158.0,159.0
Cloudiness,0.0,0,int64,2,1.0,1.0
Power_Consumption_in_A_Zone,0.0,0,float64,2833,34055.6962,29814.68354


In [98]:
# Remove the 'S no' row counter; it is just a serial number, not a feature.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises a KeyError.
data = data.drop(columns='S no', errors='ignore')

In [99]:
# List the remaining column labels. Note that the target label carries a
# leading space (' Power_Consumption_in_A_Zone'), so later lookups must
# include that space.
data.columns

Index(['Temperature', 'Humidity', 'Wind_Speed', 'general_diffuse_flows',
       'diffuse_flows', 'Air_Quality_Index_PM', 'Cloudiness',
       ' Power_Consumption_in_A_Zone'],
      dtype='object')

In [100]:
# Feature columns routed through the numeric preprocessing pipeline.
numeric = [
    'Temperature',
    'Humidity',
    'Wind_Speed',
    'general_diffuse_flows',
    'diffuse_flows',
    'Air_Quality_Index_PM',
    'Cloudiness',
]

In [102]:
# How often each distinct target value occurs (the label has a leading space).
target_counts = data[' Power_Consumption_in_A_Zone'].value_counts()
target_counts

 Power_Consumption_in_A_Zone
13736.17021    8
13732.52280    8
13272.94833    8
21392.09726    8
21198.78419    7
              ..
24568.99696    1
24784.19453    1
25200.00000    1
25247.41641    1
17598.78419    1
Name: count, Length: 2833, dtype: int64

In [103]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [104]:
# Numeric preprocessing: fill missing values with the column median, then
# apply min-max scaling.
median_imputer = SimpleImputer(strategy='median')
minmax_scaler = MinMaxScaler()

numeric_pipeline = Pipeline(steps=[
    ('imputer', median_imputer),
    ('scaler', minmax_scaler),
])

In [105]:
from sklearn.compose import ColumnTransformer

In [106]:
# Apply the numeric pipeline to the columns listed in `numeric`.
numeric_branch = ('numeric', numeric_pipeline, numeric)
preprocessor = ColumnTransformer(transformers=[numeric_branch])

In [107]:
# Separate the features from the target.
# `y` is kept as a 1-D Series rather than a one-column DataFrame: several of
# the regressors fitted below expect a 1-D target and otherwise convert a
# column vector with a DataConversionWarning (hidden in this notebook by the
# global warnings filter). Selecting features with `drop` also states the
# intent more directly than a boolean mask passed through `iloc`.
target_col = ' Power_Consumption_in_A_Zone'  # the leading space is part of the label
x = data.drop(columns=target_col)
y = data[target_col]

In [108]:
from sklearn.model_selection import train_test_split

In [109]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)
xtrain, xtest, ytrain, ytest = split_parts

In [110]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [111]:

from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [112]:
# Candidate regressors to compare, keyed by display name. Estimators that
# accept a seed are pinned to random_state=10.
models = dict(
    LinearRegression=LinearRegression(),
    DecisionTreeRegressor=DecisionTreeRegressor(random_state=10),
    RandomForestRegressor=RandomForestRegressor(random_state=10),
    AdaBoostRegressor=AdaBoostRegressor(n_estimators=50, random_state=10),
    XGBRegressor=XGBRegressor(random_state=10),
    KNeighborsRegressor=KNeighborsRegressor(),
    SVR=SVR(),
)

In [113]:
# Fit every candidate behind the shared preprocessor and report its
# hold-out metrics. The reporting order is fixed by the table below.
metric_table = (
    ('r2 score', r2_score),
    ('mean_absolute_error', mean_absolute_error),
    ('mean_squared_error', mean_squared_error),
)

for name, model in models.items():
    print('\n', f'Model name: {name}')
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])
    # fit() returns the pipeline itself, so predict can be chained.
    ypred = model_pipeline.fit(xtrain, ytrain).predict(xtest)
    for label, score_fn in metric_table:
        print(f'{label} {score_fn(ytest, ypred)}')


 Model name: LinearRegression
r2 score 0.13528578225244114
mean_absolute_error 3403.1327550952624
mean_squared_error 17378823.245851934

 Model name: DecisionTreeRegressor
r2 score 0.274818396596039
mean_absolute_error 2150.959208734
mean_squared_error 14574529.535930615

 Model name: RandomForestRegressor
r2 score 0.5668036155598458
mean_absolute_error 2006.9621061052499
mean_squared_error 8706279.186131509

 Model name: AdaBoostRegressor
r2 score 0.2115529026380687
mean_absolute_error 3247.20993767732
mean_squared_error 15846024.573818447

 Model name: XGBRegressor
r2 score 0.5622143745422363
mean_absolute_error 2083.6495494655624
mean_squared_error 8798512.711053364

 Model name: KNeighborsRegressor
r2 score 0.19351586636079388
mean_absolute_error 2851.8429256817994
mean_squared_error 16208528.692414181

 Model name: SVR
r2 score 0.01231247777064759
mean_absolute_error 3763.6351352888782
mean_squared_error 19850311.835590057


In [114]:
# Final pipeline: shared preprocessing followed by an XGBoost regressor.
# NOTE(review): in the comparison output above, RandomForestRegressor scored
# marginally higher r2 than XGBRegressor - confirm XGBoost is the intended pick.
final_steps = [
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(random_state=10)),
]
mod_pipeline = Pipeline(steps=final_steps)


In [115]:
# Train the final pipeline on the training split.
mod_pipeline.fit(X=xtrain, y=ytrain)

In [116]:
# Predictions of the final pipeline on the held-out rows.
ypred = mod_pipeline.predict(X=xtest)

In [117]:
# Report the final pipeline's hold-out metrics, one per line.
for label, score_fn in (
    ('r2 score', r2_score),
    ('mean_absolute_error', mean_absolute_error),
    ('mean_squared_error', mean_squared_error),
):
    print(f'{label} {score_fn(ytest, ypred)}')

r2 score 0.5622143745422363
mean_absolute_error 2083.6495494655624
mean_squared_error 8798512.711053364


In [118]:
import joblib

In [119]:
# Pull the two fitted stages out of the pipeline so each can be saved separately.
fitted_steps = mod_pipeline.named_steps
preprocessor = fitted_steps['preprocessor']
model = fitted_steps['regressor']

In [120]:
# Persist the fitted preprocessor and regressor as separate files in the
# current working directory. The last call's return value is the cell output.
joblib.dump(value=preprocessor, filename='preprocessor.pkl')
joblib.dump(value=model, filename='model.pkl')

['model.pkl']

In [121]:
import os

# Show the kernel's working directory, i.e. where the relative .pkl paths
# above were written. A bare `pwd` relies on IPython automagic, which only
# works in a single-line cell and stops working if a variable named `pwd`
# exists; os.getcwd() gives the same string without that dependency.
os.getcwd()

'c:\\Users\\91909\\0 Real Project\\Predictive model for power consumption'