<a href="https://colab.research.google.com/github/SourabhGothe/DA224-O-Mini-Projects/blob/main/Seoul_Rental_bike_count_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Common Imports

In [1]:
from google.colab import drive
import os
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso, ElasticNet, SGDRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, GridSearchCV
import xgboost as xg
import numpy as np
import sys
from sklearn import linear_model
pd.options.mode.chained_assignment = None

from IPython.display import Markdown, display
def printmd(string, color=None):
    colorstr = "<span style='color:{}'>{}</span>".format(color, string)
    display(Markdown(colorstr))
np.random.seed(42)

In [2]:
drive.mount('/content/drive')
drivepath = "/content/drive/Shareddrives/DA224-O/mp_1/data/"

Mounted at /content/drive


## Load the dataset from Google Drive


In [101]:
data_file = drivepath+'SeoulBikeData_MP1.csv'
df = pd.read_csv (data_file, index_col=0)

# Common methods required

In [102]:
def prepareData(data, removeRedundant=True, max_days=None):
  X = data.drop(['Rented Bike Count'], axis=1).copy()
  y = data['Rented Bike Count'].copy()

  #If already prepared, just return the feature and target
  if 'Seasons_sin' in data.columns:
      if removeRedundant:
        X = X.drop(['Date'], axis=1)
        X = X.drop(['Hour'], axis=1)
        X = X.drop(['Seasons'], axis=1)
      return X, y
  
  X['Seasons'].replace(['Summer', 'Spring', "Autumn", "Winter"], [0, 1, 2, 3], inplace=True)
  X["Seasons_sin"] = np.sin(2 * np.pi * X['Seasons']/3.0)
  X['Date'] =  pd.to_datetime(X['Date'])
  X["Month"] = np.sin(2 * np.pi * X['Date'].dt.month/12.0)
  if max_days == None:
    max_days = 6.0
  X['Day'] =  np.sin(2 * np.pi * X['Date'].dt.dayofweek/max_days)
  X['Hour_sin'] = np.sin(2 * np.pi * X['Hour']/23.0)
  X['Hour_cos'] = np.cos(2 * np.pi * X['Hour']/23.0)

  if removeRedundant:
    X = X.drop(['Date'], axis=1)
    X = X.drop(['Hour'], axis=1)
    X = X.drop(['Seasons'], axis=1)
  return X, y

**Hyper-parameter tuning with Grid Search**

In [None]:
model_params = {
    'elasticnet' : {
        'model' : ElasticNet(),
        'params' : {
            'alpha' : [0.1,0.15,0.18,0.2,0.22,0.24,0.3],
            'l1_ratio' : [0.9,0.95,0.97,0.98,0.99,1],
            'max_iter' : [20000,15000,25000] 
        }
    }
    ,
    'SGDRegressor' : {
        'model' : SGDRegressor(random_state = 42),
        'params' : {
            'alpha' : [0.1,0.01,0.001,0.0001],
            'max_iter' : [20000,15000,25000],
            'learning_rate' : ['constant','optimal','invscaling']
        }
    }
}

def runPipeline(X1):
  cat_cols = ['Holiday', 'Functioning Day']
  num_cols = [col for col in list(X1) if col not in cat_cols]

  num_pipeline = Pipeline([('imputer', KNNImputer(n_neighbors=2)), ('scaler',StandardScaler())])
  cat_pipeline = Pipeline([('encoder',OneHotEncoder())])
  full_pipeline = ColumnTransformer([('num', num_pipeline, num_cols),
                                  ('cat', cat_pipeline, cat_cols)])

  #Data preparation
  X_train_prepared = full_pipeline.fit_transform(X1)

  return X_train_prepared

def GridSCV(data):
  X_grid,y_grid = prepareData(data)
  X_prepared_grid = runPipeline(X_grid)
  scores = []
  for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'],mp['params'], cv=5, scoring ='neg_mean_squared_error', return_train_score = False)
    clf.fit(X_prepared_grid,y_grid)
    scores.append({
        'model' : model_name,
        'best_score' : clf.best_score_,
        'best_params' : clf.best_params_,
        'best_model'  :clf.best_estimator_
    })
  print(scores)
  return scores

**K-fold CV**

In [11]:
def runCV(splits, regressor, X1, y1, withHoliday=True):
  cat_cols = ['Functioning Day']
  if withHoliday:
    cat_cols.append('Holiday')
  num_cols = [col for col in list(X1) if col not in cat_cols]
  print("Model --> "+ type(regressor()).__name__)
  RMSEscores = []
  R2Score = []
  trainedModels = []
  inputTransformer = []

  kf = KFold(n_splits=splits)
  for train, test in kf.split(X1):
    X_train, X_test, y_train, y_test = X1.loc[train], X1.loc[test], y1.loc[train], y1.loc[test]

    num_pipeline = Pipeline([('scaler', StandardScaler())])
    cat_pipeline = Pipeline([('encoder', OneHotEncoder())])
    full_pipeline = ColumnTransformer([('num', num_pipeline, num_cols),
                                  ('cat', cat_pipeline, cat_cols)])

    #Data preparation
    X_train_prepared = full_pipeline.fit_transform(X_train)
    X_test_prepared = full_pipeline.transform(X_test)

    #Model Fitting
    model = regressor()
    model.fit(X_train_prepared, y_train)

    #Evaluation
    y_test_predict = model.predict(X_test_prepared)
    RMSE = mean_squared_error(y_test, y_test_predict, squared=False)
    R2 = r2_score(y_test,y_test_predict, multioutput='variance_weighted')
    RMSEscores.append(RMSE)
    R2Score.append(R2)
    trainedModels.append(model)
    inputTransformer.append(full_pipeline)
  print("RMSE scores --> " +  str(RMSEscores))
  print("R2 scores  --> " +  str(R2Score))
  printmd("**Min RMSE = " + str(min(RMSEscores)) + "**")
  printmd("**MAX R2 = " + str(max(R2Score)) + "**")
  metric = min(RMSEscores)
  print('=========================================================================================')
  modelIndex = RMSEscores.index(min(RMSEscores))#Based on MSE, could be based on R2 too
  return trainedModels[modelIndex], inputTransformer[modelIndex], metric

In [12]:
def evaluateCVModel(modelPipeline, X1, y1):
  model = modelPipeline[0]
  pipeline = modelPipeline[1]

  #Data preparation
  X_test_prepared = pipeline.transform(X1)

  y_test_predict = model.predict(X_test_prepared)
  RMSE = mean_squared_error(y1, y_test_predict, squared=False)
  R2 = r2_score(y1, y_test_predict, multioutput='variance_weighted')
  printmd("**Model : "+type(model).__name__+",  RMSE = " +str(RMSE)+ ", R2 ="+ str(R2) + "**")

  return y_test_predict, y1

In [13]:
def getFinalMetrics(y_test_predict, y1, model=None):
  RMSE = mean_squared_error(y1, y_test_predict, squared=False)
  R2 = r2_score(y1, y_test_predict, multioutput='variance_weighted')
  if model != None:
    printmd("**Model : "+type(model).__name__+",  RMSE = " +str(RMSE)+ ", R2 ="+ str(R2) + "**")
  else:
    printmd("**RMSE = " +str(RMSE)+ ", R2 ="+ str(R2) + "**")

**Initial Data Split into Train and Test, no changes to be made further**

In [103]:
trainDf = pd.DataFrame()
testDf = pd.DataFrame()
def initData():
  global trainDf, testDf
  X, y = prepareData(df, False)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42)

  trainDf = X_train
  trainDf['Rented Bike Count'] = y_train

  testDf = X_test
  testDf['Rented Bike Count'] = y_test

In [104]:
initData()

In [105]:
LinearRegressionModels = ["LinearRegression", "RidgeCV", "Lasso", "ElasticNet"]
models = [LinearRegression, RidgeCV, Lasso, KNeighborsRegressor, SGDRegressor, ElasticNet]

# Basic pipeline with SimpleImputer and Linear Regression

In [106]:
X_train, y_train = prepareData(trainDf)
X_test, y_test = prepareData(testDf)
cat_cols = ['Holiday', 'Functioning Day']
num_cols = [col for col in list(X_train) if col not in cat_cols]

strategies = ["mean", "median"] #Currently required only for numerical missing (Temperature)

for strategy in strategies:
    num_pipeline = Pipeline([('imputer', SimpleImputer()),
                            ('scaler', StandardScaler())])
    cat_pipeline = Pipeline([('encoder', OneHotEncoder())])
    full_pipeline = ColumnTransformer([('num', num_pipeline, num_cols),
                                    ('cat', cat_pipeline, cat_cols)])

    #Data preparation
    X_train_prepared = full_pipeline.fit_transform(X_train)
    X_test_prepared = full_pipeline.transform(X_test)

    #Model Fitting
    lin_reg = LinearRegression()
    lin_reg.fit(X_train_prepared, y_train)

    #Evaluation
    y_test_predict = lin_reg.predict(X_test_prepared)
    RMSE = mean_squared_error(y_test, y_test_predict, squared=False)
    R2 = r2_score(y_test,y_test_predict, multioutput='variance_weighted')
    print("Impute strategy - SimpleImputer(" + strategy + "), RMSE = " +str(RMSE)+ ", R2 =", str(R2))

Impute strategy - SimpleImputer(mean), RMSE = 463.25885180035255, R2 = 0.4849137125256828
Impute strategy - SimpleImputer(median), RMSE = 463.25885180035255, R2 = 0.4849137125256828


# Fill Training Data with Imputer (Regressor Model)

In [107]:
def performKNN(dfWithTemp, xcols):
  X, y = prepareData(dfWithTemp)
  Xtem = X[xcols]
  ytem = X['Temperature(C)']

  X_trainT, X_testT, y_trainT, y_testT = train_test_split(Xtem, ytem, test_size= 0.2, random_state=42)
  lin_reg = KNeighborsRegressor()
  lin_reg.fit(X_trainT,y_trainT)
  y_test_predict = lin_reg.predict(X_testT)

  RMSE = mean_squared_error(y_testT,y_test_predict,squared=False)

  R2 = r2_score(y_testT,y_test_predict,multioutput='variance_weighted')

  print("Imputer Model KNN \nRMSE = " +str(RMSE)+ "\nR2 =", str(R2))

  return lin_reg, RMSE, R2

In [108]:
# Use only the rows with temperature value for building the model
dfWithTemp = trainDf.loc[trainDf['Temperature(C)'].notnull()]
# Keep the rows without Temperature value for prediction using the model created
dfPrediction = trainDf.loc[trainDf['Temperature(C)'].isna()]
xparams = ["Dew point temperature(C)", "Solar Radiation (MJ/m2)", "Snowfall (cm)", "Humidity(%)", "Rainfall(mm)",  "Hour_sin", "Hour_cos"]

#Training the Imputer
lin_reg, RMSE, R2 = performKNN(dfWithTemp, xparams)
#Predicting Missing Values
trainDf.loc[trainDf['Temperature(C)'].isna(), "Temperature(C)"] = lin_reg.predict(trainDf.loc[trainDf['Temperature(C)'].isna()][xparams])
testDf.loc[testDf['Temperature(C)'].isna(), "Temperature(C)"] = lin_reg.predict(testDf.loc[testDf['Temperature(C)'].isna()][xparams])

Imputer Model KNN 
RMSE = 0.5042309276712973
R2 = 0.9982082398101662


# Training on complete data with imputation

In [110]:
trainDf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7008 entries, 8415 to 7270
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Date                      7008 non-null   datetime64[ns]
 1   Hour                      7008 non-null   int64         
 2   Temperature(C)            7008 non-null   float64       
 3   Humidity(%)               7008 non-null   int64         
 4   Wind speed (m/s)          7008 non-null   float64       
 5   Visibility (10m)          7008 non-null   int64         
 6   Dew point temperature(C)  7008 non-null   float64       
 7   Solar Radiation (MJ/m2)   7008 non-null   float64       
 8   Rainfall(mm)              7008 non-null   float64       
 9   Snowfall (cm)             7008 non-null   float64       
 10  Seasons                   7008 non-null   int64         
 11  Holiday                   7008 non-null   object        
 12  Functioning Day  

In [111]:
#Complete Data
trainDf.reset_index(drop=True, inplace=True)
testDf.reset_index(drop=True, inplace=True)
X, y = prepareData(trainDf)
soFarBestMSE = sys.maxsize
finalModelAllData = []
LRfinalModelAllData = []
LRsoFarBestMSE = sys.maxsize
for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y)
  if metric < soFarBestMSE:
    finalModelAllData = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRsoFarBestMSE = metric
      LRfinalModelAllData = [bestModelMSE, imputer]

Model --> LinearRegression
RMSE scores --> [439.1711789389289, 479.2462481285659, 450.53966912533883, 435.6023093723511, 459.05856631049477, 422.20842898577047, 452.60772026874, 445.71169457979784, 456.9535734542384, 435.64036159480185]
R2 scores  --> [0.5057287279037879, 0.44907574184960297, 0.50720151872786, 0.535288919607892, 0.5314793298295908, 0.5723684073285139, 0.5146284546456898, 0.5288182078842067, 0.4784090226772204, 0.54379753117227]


<span style='color:None'>**Min RMSE = 422.20842898577047**</span>

<span style='color:None'>**MAX R2 = 0.5723684073285139**</span>

Model --> RidgeCV
RMSE scores --> [438.90777553011037, 479.22171019398667, 450.7763249132104, 435.77623966339354, 459.1674372248913, 422.4129320963961, 452.56828294691365, 445.3948403706469, 456.633202877057, 435.68050400706863]
R2 scores  --> [0.5063214521128895, 0.44913215626025416, 0.5066836764582903, 0.5349177394356518, 0.5312570735594215, 0.5719540471961595, 0.5147130352616969, 0.5294878913098675, 0.47914014213589795, 0.5437134530521917]


<span style='color:None'>**Min RMSE = 422.4129320963961**</span>

<span style='color:None'>**MAX R2 = 0.5719540471961595**</span>

Model --> Lasso
RMSE scores --> [438.650477567135, 479.19007248929785, 451.24020059063946, 436.0580032650793, 459.22298545736135, 422.95606571523496, 452.6595849981949, 445.1467955201444, 456.3218356991917, 435.7654960267259]
R2 scores  --> [0.5069000941878118, 0.44920488927087, 0.5056678503166174, 0.5343161205047935, 0.5311436534258829, 0.5708525865462273, 0.5145172099404229, 0.5300118111990413, 0.47985022353218465, 0.54353541204527]


<span style='color:None'>**Min RMSE = 422.95606571523496**</span>

<span style='color:None'>**MAX R2 = 0.5708525865462273**</span>

Model --> KNeighborsRegressor
RMSE scores --> [268.23116619141246, 308.17777036234526, 292.62344169688083, 252.1011528526096, 288.98526565561866, 286.37431407223374, 286.804846989684, 262.437839552724, 296.8746031576295, 256.2042385062578]
R2 scores  --> [0.8156189931846964, 0.77218775187031, 0.7921157834370265, 0.8443488462874729, 0.8143290480141919, 0.8032638773414184, 0.8051038004382537, 0.8366446550975751, 0.779843249115773, 0.842211978016881]


<span style='color:None'>**Min RMSE = 252.1011528526096**</span>

<span style='color:None'>**MAX R2 = 0.8443488462874729**</span>

Model --> SGDRegressor
RMSE scores --> [437.45990150168865, 479.9097769087531, 451.24119025024083, 436.3854426104002, 460.5659890667027, 423.1452842677625, 453.0082550802154, 444.50618625897573, 456.1921517860799, 435.10099325218283]
R2 scores  --> [0.5095731847922225, 0.4475491481122461, 0.505665681976589, 0.533616486630833, 0.5283972904032325, 0.5704685238841181, 0.5137690162738338, 0.5313635547078231, 0.48014582837937714, 0.5449264846231172]


<span style='color:None'>**Min RMSE = 423.1452842677625**</span>

<span style='color:None'>**MAX R2 = 0.5704685238841181**</span>

Model --> ElasticNet
RMSE scores --> [468.00259645088045, 505.47272024492577, 490.36352490306547, 481.01481959767443, 506.38857228048994, 469.8363961690102, 484.4804080936559, 482.9086174684468, 484.4194973659171, 483.2945695993597]
R2 scores  --> [0.43870104770437934, 0.3871278358148367, 0.4162329129738471, 0.43334386611995934, 0.42988777133683637, 0.47044716519191915, 0.4438616604014727, 0.44689147532364093, 0.4138225007916071, 0.4385316989776946]


<span style='color:None'>**Min RMSE = 468.00259645088045**</span>

<span style='color:None'>**MAX R2 = 0.47044716519191915**</span>



**Testing Full models on unseen data**

In [112]:
#Full Data
print("Evaluation for Model trained on complete data")

X, y = prepareData(testDf)
out = evaluateCVModel(LRfinalModelAllData, X, y)
out = evaluateCVModel(finalModelAllData, X, y)

Evaluation for Model trained on complete data


<span style='color:None'>**Model : LinearRegression,  RMSE = 462.18758667855525, R2 =0.48729318556997747**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 304.34946249276453, R2 =0.7776804710175861**</span>

# Experiment with separate models for each seasons

In [117]:
seasonwiseData = {}
seasons = ["Summer", "Spring", "Autumn", "Winter"]
seasontoIdmap = dict()
seasontoIdmap['Summer'] = 0
seasontoIdmap['Spring'] = 1
seasontoIdmap['Autumn'] = 2
seasontoIdmap['Winter'] = 3

mask = (trainDf['Seasons'] == seasontoIdmap['Summer'] )
seasonwiseData["Summer"] = trainDf[mask].copy()

mask = (trainDf['Seasons'] == seasontoIdmap['Spring'])
seasonwiseData["Spring"] = trainDf[mask].copy()

mask = (trainDf['Seasons'] == seasontoIdmap['Autumn'])
seasonwiseData["Autumn"] = trainDf[mask].copy()

mask = (trainDf['Seasons'] == seasontoIdmap['Winter'])
seasonwiseData["Winter"] = trainDf[mask].copy()

finalModelSeasonwise = dict()
LRfinalModelSeasonwise = dict()

for season in seasons:
  seasonwiseData[season].reset_index(drop=True, inplace=True)

In [119]:
for season in seasons:
  printmd("**Season : "+season+"**")
  X, y = prepareData(seasonwiseData[season])
  soFarBestMSE = sys.maxsize
  finalModelWithCurrentSeason = []
  LRsoFarBestMSE = sys.maxsize
  LRfinalModelWithCurrentSeason = []
  for model in models:
    bestModelMSE, imputer, metric = runCV(10, model, X, y)
    if metric < soFarBestMSE:
      finalModelWithCurrentSeason = [bestModelMSE, imputer]
      soFarBestMSE = metric
    if metric < LRsoFarBestMSE:
      if type(model()).__name__ in LinearRegressionModels:
        LRsoFarBestMSE = metric
        LRfinalModelWithCurrentSeason = [bestModelMSE, imputer]
  finalModelSeasonwise[season] = finalModelWithCurrentSeason
  LRfinalModelSeasonwise[season] = LRfinalModelWithCurrentSeason

<span style='color:None'>**Season : Summer**</span>

Model --> LinearRegression
RMSE scores --> [489.1118084504191, 489.01414978996934, 498.3022004864068, 506.764309917095, 452.3585381784103, 483.31120417872756, 492.17006751148637, 504.9842119744287, 488.90413682849083, 464.2920188257495]
R2 scores  --> [0.3586702477691577, 0.4462024968638964, 0.49740921600627475, 0.4775233258030411, 0.6051321169464746, 0.5437766550824896, 0.5280506267018943, 0.4652080534624427, 0.47518311070762836, 0.4860515038722108]


<span style='color:None'>**Min RMSE = 452.3585381784103**</span>

<span style='color:None'>**MAX R2 = 0.6051321169464746**</span>

Model --> RidgeCV
RMSE scores --> [489.09495452302576, 488.96303234330713, 498.420164415114, 506.7564456100511, 452.4774611862437, 483.1928523412486, 492.12202074016676, 504.98271265701163, 488.93531981624307, 464.2091158294641]
R2 scores  --> [0.3587144451844083, 0.4463182695251264, 0.49717122949536086, 0.47753954196122855, 0.6049244717048199, 0.5440000650082342, 0.5281427677656683, 0.4652112290931679, 0.4751161614662503, 0.4862350265887448]


<span style='color:None'>**Min RMSE = 452.4774611862437**</span>

<span style='color:None'>**MAX R2 = 0.6049244717048199**</span>

Model --> Lasso
RMSE scores --> [488.54154894028846, 488.28721931508767, 501.06917509302747, 507.2544977670025, 454.75297912953084, 481.1457685405629, 491.31445032808426, 505.5812265658638, 489.1338757835698, 463.7176174332342]
R2 scores  --> [0.360164839326654, 0.44784773786873694, 0.4918121427619223, 0.4765120644350312, 0.6009407948201648, 0.5478556382705992, 0.5296901290545363, 0.4639427967652905, 0.4746897656808384, 0.4873223855926672]


<span style='color:None'>**Min RMSE = 454.75297912953084**</span>

<span style='color:None'>**MAX R2 = 0.6009407948201648**</span>

Model --> KNeighborsRegressor
RMSE scores --> [322.57555950646525, 345.53784656545355, 368.31729095737796, 325.31432841596035, 361.19357406085885, 363.2378793966447, 340.52268529424, 353.6429130392084, 322.0196384428396, 320.5675082011365]
R2 scores  --> [0.721048943577965, 0.7234972936094337, 0.7254173285907177, 0.784691463775033, 0.7482518672540225, 0.7423049708521263, 0.7740787315939212, 0.7377236226495096, 0.772319888827466, 0.7549943125482091]


<span style='color:None'>**Min RMSE = 320.5675082011365**</span>

<span style='color:None'>**MAX R2 = 0.784691463775033**</span>

Model --> SGDRegressor
RMSE scores --> [491.3465843505966, 488.2846521891595, 504.84852989126244, 507.6354122660067, 462.33389922519854, 481.59775154828833, 492.46395754718816, 507.2294174312523, 493.4174231084109, 463.4066390154204]
R2 scores  --> [0.3527963248960678, 0.4478535436347136, 0.4841171356182887, 0.47572555977027164, 0.5875249314629363, 0.5470057604605623, 0.527486827158893, 0.4604420151664714, 0.4654487611715095, 0.4880097789904249]


<span style='color:None'>**Min RMSE = 462.33389922519854**</span>

<span style='color:None'>**MAX R2 = 0.5875249314629363**</span>

Model --> ElasticNet
RMSE scores --> [506.7715450573611, 526.9413305446088, 561.3339087908273, 560.1597159136005, 527.7460826782744, 533.9853244252408, 553.5714528380388, 542.9166976567033, 519.5213607903328, 500.487646152335]
R2 scores  --> [0.3115228435485722, 0.35696787644073746, 0.36221916320089387, 0.36162097298710805, 0.46255223128475875, 0.4430933280305391, 0.4029476797672162, 0.38184746385206436, 0.40739242797086817, 0.40279437841704624]


<span style='color:None'>**Min RMSE = 500.487646152335**</span>

<span style='color:None'>**MAX R2 = 0.46255223128475875**</span>



<span style='color:None'>**Season : Spring**</span>

Model --> LinearRegression
RMSE scores --> [460.1998847686206, 440.911691888095, 391.962106234209, 372.6971879703557, 481.425429774214, 367.8025596367609, 392.3429571359679, 405.23739371400603, 371.3959328045355, 387.1164730770126]
R2 scores  --> [0.5025976089163429, 0.5295060038140176, 0.5619456006048125, 0.630258995683959, 0.45711551335065403, 0.5403026551683532, 0.5247847935274248, 0.6368563323991452, 0.6381950606646623, 0.5732719483508766]


<span style='color:None'>**Min RMSE = 367.8025596367609**</span>

<span style='color:None'>**MAX R2 = 0.6381950606646623**</span>

Model --> RidgeCV
RMSE scores --> [459.8576114608053, 440.80476779752655, 391.9928943854656, 374.8697072287836, 481.68217514158204, 367.64335205601276, 392.28541656499203, 405.4140404378768, 371.4261855432298, 386.8900136248663]
R2 scores  --> [0.5033372190579272, 0.5297341721464662, 0.5618767806113386, 0.6259358578664045, 0.4565363156805281, 0.5407005396324267, 0.524924172347349, 0.6365396680362079, 0.6381361152906408, 0.5737710660157911]


<span style='color:None'>**Min RMSE = 367.64335205601276**</span>

<span style='color:None'>**MAX R2 = 0.6381361152906408**</span>

Model --> Lasso
RMSE scores --> [458.5004504222032, 440.3546654584427, 392.40436901042347, 373.7534277160128, 482.33059894064263, 367.3045363822849, 392.2123050683305, 405.93016325132794, 371.84091479894806, 386.749653898419]
R2 scores  --> [0.5062644587390501, 0.5306940514327743, 0.5609565026774226, 0.628160302408995, 0.4550721469990112, 0.5415467192109171, 0.5251012386700988, 0.6356136538450723, 0.637327559881572, 0.5740802728831444]


<span style='color:None'>**Min RMSE = 367.3045363822849**</span>

<span style='color:None'>**MAX R2 = 0.637327559881572**</span>

Model --> KNeighborsRegressor
RMSE scores --> [331.7189087366107, 306.7790215198763, 287.7128499430402, 266.40754724491245, 365.7204928393835, 296.67331053594546, 280.77582899040897, 281.15975577624266, 281.0839720254589, 256.0990477034947]
R2 scores  --> [0.7415626295752227, 0.7722270372207318, 0.7639747866878888, 0.811079784285916, 0.6867089627007377, 0.7009116778006117, 0.7566238350090984, 0.8251902051202966, 0.7927605797876199, 0.813240021517036]


<span style='color:None'>**Min RMSE = 256.0990477034947**</span>

<span style='color:None'>**MAX R2 = 0.8251902051202966**</span>

Model --> SGDRegressor
RMSE scores --> [455.81093254427265, 440.0862755226698, 397.44438608836043, 373.5813220210615, 486.59451965665386, 365.8440468829276, 392.4632115061218, 408.49680154354945, 372.88981591447975, 388.2129508530451]
R2 scores  --> [0.5120398767067116, 0.531265947764384, 0.5496059808829932, 0.6285026724875941, 0.44539496935399286, 0.5451853076831418, 0.5244934389248952, 0.6309911608471832, 0.6352785973016057, 0.5708511757926625]


<span style='color:None'>**Min RMSE = 365.8440468829276**</span>

<span style='color:None'>**MAX R2 = 0.6352785973016057**</span>

Model --> ElasticNet
RMSE scores --> [473.73928515512654, 467.08487312461955, 426.6430714400555, 425.80430886320363, 515.5835991913576, 377.42859931117243, 402.3246016645914, 456.2193578154299, 408.3619436632939, 413.73877169860964]
R2 scores  --> [0.47289922075846264, 0.4719896199998557, 0.48099770853576107, 0.5173798076615307, 0.37734487682317963, 0.5159256048937059, 0.5002971953357129, 0.5397361497919142, 0.562587963928244, 0.5125609376711036]


<span style='color:None'>**Min RMSE = 377.42859931117243**</span>

<span style='color:None'>**MAX R2 = 0.562587963928244**</span>



<span style='color:None'>**Season : Autumn**</span>

Model --> LinearRegression
RMSE scores --> [394.5517446686989, 539.149590610007, 368.8119100254639, 393.73535256719435, 363.41779341449814, 378.504368579969, 428.40862578901687, 450.01080198991787, 440.626765886365, 422.9196380068965]
R2 scores  --> [0.6135325746640823, 0.4093218479713052, 0.663227396291648, 0.5924044440523062, 0.6880516282555889, 0.6702811465850778, 0.5394073754341977, 0.4931530709901537, 0.5926230218269481, 0.6185874809704653]


<span style='color:None'>**Min RMSE = 363.41779341449814**</span>

<span style='color:None'>**MAX R2 = 0.6880516282555889**</span>

Model --> RidgeCV
RMSE scores --> [394.52965351468305, 539.147444264104, 368.74261151854046, 393.6999836053217, 363.4429036125631, 378.53530044600046, 428.4193808986767, 450.0340123335204, 440.6386032887561, 422.92975894489757]
R2 scores  --> [0.6135758504702128, 0.40932655092250875, 0.6633539412617262, 0.5924776687911261, 0.6880085188858533, 0.6702272542816918, 0.5393842489793663, 0.49310078604630675, 0.5926011332338621, 0.6185692254992075]


<span style='color:None'>**Min RMSE = 363.4429036125631**</span>

<span style='color:None'>**MAX R2 = 0.6880085188858533**</span>

Model --> Lasso
RMSE scores --> [394.1778056330255, 539.6902993422427, 365.72671969129823, 392.1988103456572, 365.1667756076806, 380.38810852406573, 429.43323391582993, 451.6593720567792, 441.56028553911364, 422.91690750137406]
R2 scores  --> [0.614264781659257, 0.4081364812584949, 0.6688381794229474, 0.5955794992809669, 0.6850418416465844, 0.6669910913977708, 0.5372015781536008, 0.48943270165291114, 0.5908950411688948, 0.6185924059989293]


<span style='color:None'>**Min RMSE = 365.1667756076806**</span>

<span style='color:None'>**MAX R2 = 0.6850418416465844**</span>

Model --> KNeighborsRegressor
RMSE scores --> [304.5386410024783, 417.4477422384747, 302.7790760003676, 281.836507824762, 304.8740476234173, 284.1496567454753, 351.61144821277605, 330.4736465827711, 343.4246198013682, 278.33816349387456]
R2 scores  --> [0.7697552329885244, 0.6458913130204547, 0.7730247000524078, 0.7911593114142748, 0.7804612120484934, 0.8141783723986713, 0.689739434193819, 0.7266591079043623, 0.7525328339402922, 0.8347944036929111]


<span style='color:None'>**Min RMSE = 278.33816349387456**</span>

<span style='color:None'>**MAX R2 = 0.8347944036929111**</span>

Model --> SGDRegressor
RMSE scores --> [395.6289747383059, 541.3429643284184, 367.2844054493341, 393.1308241982902, 365.89734133904835, 383.1314698685301, 430.0622355510326, 452.29679160763084, 441.6796310133077, 423.2270164041024]
R2 scores  --> [0.611419378304126, 0.40450606660442934, 0.6660112348378009, 0.5936551009486004, 0.6837803478663138, 0.6621704460600784, 0.5358448403301298, 0.4879905740400449, 0.5906738644683449, 0.6180328575263512]


<span style='color:None'>**Min RMSE = 365.89734133904835**</span>

<span style='color:None'>**MAX R2 = 0.6837803478663138**</span>

Model --> ElasticNet
RMSE scores --> [464.52995852489715, 599.0901163796451, 446.37879288102226, 458.18762066524704, 482.1545915923495, 467.36637782894593, 490.3667474543221, 498.1485402320546, 523.3279470119066, 510.62946121326127]
R2 scores  --> [0.464286718972695, 0.27068246153188313, 0.5066740204103855, 0.448040362158067, 0.45091072280804323, 0.49729069889938415, 0.3965481946732913, 0.3789183956584464, 0.42535108877680256, 0.4439792791056409]


<span style='color:None'>**Min RMSE = 446.37879288102226**</span>

<span style='color:None'>**MAX R2 = 0.5066740204103855**</span>



<span style='color:None'>**Season : Winter**</span>

Model --> LinearRegression
RMSE scores --> [87.30474050600793, 93.542284735856, 139.6664135008237, 113.91559699724436, 97.33627847617159, 119.25229926993775, 107.86640397754981, 109.47028114468101, 123.40201291049252, 127.44058872014963]
R2 scores  --> [0.5043558905830208, 0.45830914048345495, 0.3512427412083413, 0.42840163345945503, 0.5106758109865142, 0.44634202010099033, 0.45439074027987203, 0.48156249470437884, 0.3508592500882135, 0.3456091455120961]


<span style='color:None'>**Min RMSE = 87.30474050600793**</span>

<span style='color:None'>**MAX R2 = 0.5106758109865142**</span>

Model --> RidgeCV
RMSE scores --> [87.14019080922024, 93.2394457370953, 139.7762491495795, 113.99189644356018, 97.47597597725536, 119.44019678364255, 107.91292032296317, 109.53237352678214, 123.26731854479607, 127.2752915338881]
R2 scores  --> [0.506222484038747, 0.46181086323054243, 0.3502219561642149, 0.42763567610732456, 0.5092702421501334, 0.444595925213795, 0.45392006136158747, 0.48097420446096006, 0.3522755622072442, 0.34730560372232777]


<span style='color:None'>**Min RMSE = 87.14019080922024**</span>

<span style='color:None'>**MAX R2 = 0.5092702421501334**</span>

Model --> Lasso
RMSE scores --> [87.27968363004436, 92.69528711517837, 140.1748950180034, 114.35285488357384, 98.20298482985925, 120.12421602270474, 108.41155322301593, 110.06592440903478, 122.99171636570449, 127.0865845969526]
R2 scores  --> [0.5046403541917932, 0.46807442839214614, 0.346510299619731, 0.42400512267739, 0.5019228868399118, 0.4382162486345027, 0.4488618622999636, 0.47590536242424464, 0.3551687005785591, 0.3492396264258396]


<span style='color:None'>**Min RMSE = 87.27968363004436**</span>

<span style='color:None'>**MAX R2 = 0.5046403541917932**</span>

Model --> KNeighborsRegressor
RMSE scores --> [83.72041789248918, 72.05503257440279, 117.1167438751028, 97.17872604290322, 80.98149813961008, 98.85982396970792, 97.72094773246417, 89.00015375504165, 103.02958752917897, 100.63676563011765]
R2 scores  --> [0.5442181022555159, 0.6785864373420329, 0.5438199935919943, 0.584025219881126, 0.6612971713722127, 0.6195060928213558, 0.5521993994855376, 0.6573226031799733, 0.5475002876631792, 0.5919296654052917]


<span style='color:None'>**Min RMSE = 72.05503257440279**</span>

<span style='color:None'>**MAX R2 = 0.6785864373420329**</span>

Model --> SGDRegressor
RMSE scores --> [87.49253237019431, 93.49040452246372, 139.4717027119038, 114.03850716394012, 97.80692438439834, 119.48411983608946, 107.79654106000989, 109.29260758074066, 123.26151942170955, 127.80031360993294]
R2 scores  --> [0.5022213435763356, 0.4589098366123069, 0.3530503624151522, 0.4271675065513495, 0.5059323544460539, 0.4441873604509893, 0.4550972718639197, 0.48324400839904896, 0.3523365052930364, 0.34190965053036304]


<span style='color:None'>**Min RMSE = 87.49253237019431**</span>

<span style='color:None'>**MAX R2 = 0.5059323544460539**</span>

Model --> ElasticNet
RMSE scores --> [92.76080123851635, 97.03044399144726, 146.7880979094873, 124.91075684566405, 105.41396551711097, 129.46970114980317, 116.62134489235918, 119.85881389918241, 128.76094174547825, 131.0933772558949]
R2 scores  --> [0.4404701247490524, 0.41715698210149843, 0.2833948023504983, 0.31273492049836116, 0.42609036988794013, 0.3474041130562403, 0.362228038975252, 0.37849604508614687, 0.2932551150944807, 0.30755834283285965]


<span style='color:None'>**Min RMSE = 92.76080123851635**</span>

<span style='color:None'>**MAX R2 = 0.4404701247490524**</span>



**Testing 4 models on unseen data**

In [120]:
mask = (testDf['Seasons'] == seasontoIdmap["Summer"])
seasonwiseData["Summer"] = testDf[mask].copy()

mask = (testDf['Seasons'] == seasontoIdmap["Spring"])
seasonwiseData["Spring"] = testDf[mask].copy()

mask = (testDf['Seasons'] == seasontoIdmap["Autumn"])
seasonwiseData["Autumn"] = testDf[mask].copy()

mask = (testDf['Seasons'] == seasontoIdmap["Winter"])
seasonwiseData["Winter"] = testDf[mask].copy()

for season in seasons:
  seasonwiseData[season].reset_index(drop=True, inplace=True)

LRPredicted = []
LRground_truth = []
predicted = []
ground_truth = []

for season in seasons:
  printmd("**Season : "+season+"**")
  X, y = prepareData(seasonwiseData[season])
  LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelSeasonwise[season], X, y)
  LRPredicted.extend(LRy_test_predict)
  LRground_truth.extend(LRy1)
  y_test_predict, y1 = evaluateCVModel(finalModelSeasonwise[season], X, y)
  predicted.extend(y_test_predict)
  ground_truth.extend(y1)

#Metrics On full testset
printmd("**Final Results on complete testset**")

print('Linear Models Best output')
getFinalMetrics(LRPredicted, LRground_truth)

print("Overall Best Model")
getFinalMetrics(predicted, ground_truth)

<span style='color:None'>**Season : Summer**</span>

<span style='color:None'>**Model : LinearRegression,  RMSE = 512.5388134178594, R2 =0.4651888366836193**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 358.4742543240949, R2 =0.7383848526896162**</span>

<span style='color:None'>**Season : Spring**</span>

<span style='color:None'>**Model : Lasso,  RMSE = 440.81664461845736, R2 =0.5241599556354878**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 329.48264201253426, R2 =0.7341662621302161**</span>

<span style='color:None'>**Season : Autumn**</span>

<span style='color:None'>**Model : LinearRegression,  RMSE = 440.90675287526767, R2 =0.5112280788344794**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 348.79284140890485, R2 =0.6941222593545455**</span>

<span style='color:None'>**Season : Winter**</span>

<span style='color:None'>**Model : RidgeCV,  RMSE = 118.52166842587255, R2 =0.4077786348243499**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 104.77834702100594, R2 =0.5371591912391394**</span>

<span style='color:None'>**Final Results on complete testset**</span>

Linear Models Best output


<span style='color:None'>**RMSE = 407.25625620224736, R2 =0.6019221027217457**</span>

Overall Best Model


<span style='color:None'>**RMSE = 303.2186357991382, R2 =0.7793294820047562**</span>

# Experiment with separate models for rain/snowfall or not


In [121]:
threshold = 0.0
mask = (trainDf['Rainfall(mm)'] != threshold) | (trainDf['Snowfall (cm)'] != threshold)
dfWithPrecipitation = trainDf[mask].copy()
dfWithoutPrecipitation = trainDf[~mask].copy()
dfWithPrecipitation.reset_index(drop=True, inplace=True)
dfWithoutPrecipitation.reset_index(drop=True, inplace=True)

**Train Model With Precipitation**

In [122]:
X, y = prepareData(dfWithPrecipitation)
soFarBestMSE = sys.maxsize
finalModelWithPrecipitation = []
LRfinalModelWithPrecipitation = []
LRsoFarBestMSE = sys.maxsize
for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y)
  if metric < soFarBestMSE:
    finalModelWithPrecipitation = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRsoFarBestMSE = metric
      LRfinalModelWithPrecipitation = [bestModelMSE, imputer]

Model --> LinearRegression
RMSE scores --> [147.0402268314847, 181.30817585103537, 139.19182285098768, 154.5470458693383, 247.86026906553093, 167.448787410823, 253.38441119779205, 133.0908722802346, 188.41134618517057, 218.30932191855163]
R2 scores  --> [0.14148281339329205, 0.23862157910898485, 0.2960043119557484, 0.23393961264254134, 0.24159774887313856, 0.32416334768315347, 0.18409610529303844, 0.04236865699208481, 0.2231676684213921, 0.272280273956903]


<span style='color:None'>**Min RMSE = 133.0908722802346**</span>

<span style='color:None'>**MAX R2 = 0.32416334768315347**</span>

Model --> RidgeCV
RMSE scores --> [146.95509009302648, 180.53518175157978, 138.6953807624276, 154.91787701916627, 247.6778907622872, 168.04232244782526, 255.10939956994446, 131.62732278037768, 187.7039087537209, 219.0591160791476]
R2 scores  --> [0.1424766937033909, 0.24509990170321894, 0.30101710409925286, 0.23025892276065718, 0.24271341964200233, 0.3193637470233638, 0.17294928334047022, 0.06331426722566902, 0.2289903383382277, 0.26727291091701166]


<span style='color:None'>**Min RMSE = 131.62732278037768**</span>

<span style='color:None'>**MAX R2 = 0.3193637470233638**</span>

Model --> Lasso
RMSE scores --> [148.10242085468397, 181.21846226578896, 140.7341097742281, 156.46621562639834, 248.81316243099909, 172.11968369259262, 260.76875577673667, 127.5178741855036, 187.87382734085213, 223.32136445009294]
R2 scores  --> [0.12903444340022274, 0.23937487209439193, 0.280316915782444, 0.21479555961806607, 0.23575521805440933, 0.28593325858676455, 0.13584761924367594, 0.12088854851314923, 0.2275937966145747, 0.23848207360754445]


<span style='color:None'>**Min RMSE = 127.5178741855036**</span>

<span style='color:None'>**MAX R2 = 0.28593325858676455**</span>

Model --> KNeighborsRegressor
RMSE scores --> [124.51437245301612, 148.56574656572616, 100.25137615540564, 142.35212641452475, 209.17198036166567, 168.25470041138914, 234.54391015217956, 119.45057995937725, 172.59016865334328, 195.379850008832]
R2 scores  --> [0.3843758703069797, 0.4887854968503633, 0.6348063308471967, 0.3500656249038667, 0.4598768446153203, 0.31764223443844364, 0.3009189282360136, 0.22860230921032254, 0.3481535665988782, 0.41712003250630136]


<span style='color:None'>**Min RMSE = 100.25137615540564**</span>

<span style='color:None'>**MAX R2 = 0.6348063308471967**</span>

Model --> SGDRegressor
RMSE scores --> [148.96388507226163, 180.9952216012773, 141.9089359353121, 156.52870194082058, 250.3401497189739, 172.97778532177813, 261.30983726947716, 129.3516920674838, 188.258227227011, 223.69362379580593]
R2 scores  --> [0.11887272151959427, 0.24124772619718926, 0.26825116175536834, 0.2141682762323025, 0.2263459647842917, 0.2787955596169919, 0.13225775697300246, 0.09542196623313326, 0.224429794362225, 0.23594117531877745]


<span style='color:None'>**Min RMSE = 129.3516920674838**</span>

<span style='color:None'>**MAX R2 = 0.2787955596169919**</span>

Model --> ElasticNet
RMSE scores --> [145.3657876703441, 186.85616732626818, 143.53438893761853, 155.52860351869103, 260.02816305834506, 177.54854893605574, 265.6556266355809, 125.50143242294504, 193.97729185572385, 232.67594609867925]
R2 scores  --> [0.16092442843970656, 0.19131263047896319, 0.2513919656345618, 0.22417792147980842, 0.1653074144689135, 0.24017779423400246, 0.1031552737525796, 0.14847152656471907, 0.17659221090162347, 0.17334830047732197]


<span style='color:None'>**Min RMSE = 125.50143242294504**</span>

<span style='color:None'>**MAX R2 = 0.2513919656345618**</span>



**Train Model Without Precipitation**

In [123]:
X, y = prepareData(dfWithoutPrecipitation)
soFarBestMSE = sys.maxsize
finalModelWithoutPrecipitation = []
LRfinalModelWithoutPrecipitation = []
LRsoFarBestMSE = sys.maxsize
for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y)
  if metric < soFarBestMSE:
    finalModelWithoutPrecipitation = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRsoFarBestMSE = metric
      LRfinalModelWithoutPrecipitation = [bestModelMSE, imputer]

Model --> LinearRegression
RMSE scores --> [437.0040712127694, 485.77530333849285, 458.0541628502339, 435.43919128001096, 463.27911496110124, 426.8138898195972, 445.81042053547577, 450.29241396046064, 456.77710365159965, 435.5154083390599]
R2 scores  --> [0.5187231893847878, 0.4471244835631626, 0.4979936900075952, 0.5428006003341916, 0.5279009777030566, 0.5684326285097613, 0.5385434185009612, 0.5281283193959758, 0.4965780039296772, 0.5424293911645569]


<span style='color:None'>**Min RMSE = 426.8138898195972**</span>

<span style='color:None'>**MAX R2 = 0.5684326285097613**</span>

Model --> RidgeCV
RMSE scores --> [436.76367558232744, 485.72765798508544, 458.15013020490886, 435.51376185030654, 463.4046213615093, 426.90415476205806, 445.7341595060704, 450.2316495270861, 456.54571588767067, 435.7487994205825]
R2 scores  --> [0.5192525438121203, 0.4472329314705257, 0.4977833163521199, 0.542643992752771, 0.5276451514726865, 0.5682500687377572, 0.5387012799702455, 0.5282556636717277, 0.49708790771078015, 0.5419388389949589]


<span style='color:None'>**Min RMSE = 426.90415476205806**</span>

<span style='color:None'>**MAX R2 = 0.5682500687377572**</span>

Model --> Lasso
RMSE scores --> [436.47120713850535, 485.6688904527596, 458.52550614252544, 435.66724487092574, 463.33792401884307, 427.59830930733193, 445.81846635552785, 449.9651265467524, 456.2803015476911, 435.72902758968723]
R2 scores  --> [0.5198961705253897, 0.44736668045883665, 0.4969600172800297, 0.542321574694443, 0.5277811127666086, 0.5668448594706958, 0.538526761958408, 0.5288140139889179, 0.49767247690882044, 0.5419804065327508]


<span style='color:None'>**Min RMSE = 427.59830930733193**</span>

<span style='color:None'>**MAX R2 = 0.5668448594706958**</span>

Model --> KNeighborsRegressor
RMSE scores --> [279.3337024277593, 313.5245132617225, 301.2960716902894, 261.3459128128848, 291.45792105207914, 286.6124017972704, 284.1446042000446, 272.6100070356919, 292.9176188115976, 263.6608569734992]
R2 scores  --> [0.803360552114846, 0.7696970508576011, 0.7827986871558067, 0.8353043808808214, 0.8131472981328676, 0.8053916497213912, 0.8125397292413038, 0.8270509260721839, 0.7929789416327062, 0.8322963983458516]


<span style='color:None'>**Min RMSE = 261.3459128128848**</span>

<span style='color:None'>**MAX R2 = 0.8353043808808214**</span>

Model --> SGDRegressor
RMSE scores --> [436.9948540482674, 484.82108797011807, 457.28548886658746, 436.0087874480962, 462.49519313582266, 428.79607923076236, 447.1128762439376, 452.6981025559988, 458.73492332997245, 436.1565863474124]
R2 scores  --> [0.5187434910755278, 0.44929439282723704, 0.4996771390136461, 0.541603696803993, 0.5294973182194851, 0.5644147897694434, 0.5358431463908655, 0.5230728992877229, 0.49225326119434776, 0.5410811028134787]


<span style='color:None'>**Min RMSE = 428.79607923076236**</span>

<span style='color:None'>**MAX R2 = 0.5644147897694434**</span>

Model --> ElasticNet
RMSE scores --> [472.635954671029, 513.6486856571614, 498.38644492641083, 485.01213341181096, 511.8634343820717, 476.05763557634197, 483.68167403178643, 488.4468809624668, 491.22237064732514, 486.5081103418957]
R2 scores  --> [0.4370400705370343, 0.3818571401298976, 0.4056969610060337, 0.4327743927133533, 0.42369038000993486, 0.46310348541165713, 0.4568126102701896, 0.44477458889420696, 0.41778980251635955, 0.42900640028505954]


<span style='color:None'>**Min RMSE = 472.635954671029**</span>

<span style='color:None'>**MAX R2 = 0.46310348541165713**</span>



**Testing 2 models on unseen data**

In [124]:
threshold = 0.0
mask = (testDf['Rainfall(mm)'] != threshold) | (testDf['Snowfall (cm)'] != threshold)
dfWithPrecipitation = testDf[mask].copy()
dfWithoutPrecipitation = testDf[~mask].copy()
dfWithPrecipitation.reset_index(drop=True, inplace=True)
dfWithoutPrecipitation.reset_index(drop=True, inplace=True)

predicted = []
ground_truth = []

LRpredicted = []
LRground_truth = []


#WithPrecipitation
print("Evaluation for Model With Precipitation")
X, y = prepareData(dfWithPrecipitation)
LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelWithPrecipitation, X, y)
y_test_predict, y1 = evaluateCVModel(finalModelWithPrecipitation, X, y)
predicted.extend(y_test_predict)
ground_truth.extend(y1)
LRpredicted.extend(LRy_test_predict)
LRground_truth.extend(LRy1)

#WithoutPrecipitation
print("Evaluation for Model Without Precipitation")
X, y = prepareData(dfWithoutPrecipitation)
LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelWithoutPrecipitation, X, y)
y_test_predict, y1 = evaluateCVModel(finalModelWithoutPrecipitation, X, y)
predicted.extend(y_test_predict)
ground_truth.extend(y1)
LRpredicted.extend(LRy_test_predict)
LRground_truth.extend(LRy1)

#Metrics On full testset
#Metrics On full testset
printmd("**Final Results on complete testset**")

print('Linear Models Best output')
getFinalMetrics(LRpredicted, LRground_truth)

print("Overall Best Model")
getFinalMetrics(predicted, ground_truth)

Evaluation for Model With Precipitation


<span style='color:None'>**Model : ElasticNet,  RMSE = 241.51041994964865, R2 =0.12915737092146784**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 246.05048040413138, R2 =0.09610836200155148**</span>

Evaluation for Model Without Precipitation


<span style='color:None'>**Model : LinearRegression,  RMSE = 463.7263030265346, R2 =0.4883937140686007**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 317.64975962477126, R2 =0.7599457089757273**</span>

<span style='color:None'>**Final Results on complete testset**</span>

Linear Models Best output


<span style='color:None'>**RMSE = 445.72738430488687, R2 =0.5231616581683753**</span>

Overall Best Model


<span style='color:None'>**RMSE = 310.9431288940032, R2 =0.7679431117915152**</span>

In [90]:
#End

# Experiment with separate models for Weekday and Weekend data 

In [125]:
trainDf['datetime'] = pd.to_datetime(trainDf['Date'])
trainDf['day'] = trainDf['datetime'].dt.dayofweek# 0 - Monday

testDf['datetime'] = pd.to_datetime(testDf['Date'])
testDf['day'] = testDf['datetime'].dt.dayofweek# 0 - Monday

In [126]:
threshold = 0.0
mask = (trainDf['day'] > 4)

dfWithWeekdays = trainDf[mask].copy()
dfWithWeekdays = dfWithWeekdays.drop(['datetime'], axis=1)
dfWithWeekdays = dfWithWeekdays.drop(['day'], axis=1)

dfWithWeekends = trainDf[~mask].copy()
dfWithWeekends = dfWithWeekends.drop(['datetime'], axis=1)
dfWithWeekends = dfWithWeekends.drop(['day'], axis=1)

dfWithWeekdays.reset_index(drop=True, inplace=True)
dfWithWeekends.reset_index(drop=True, inplace=True)

**Train model with Weekdays**

In [127]:
X, y = prepareData(dfWithWeekdays, 4)
soFarBestMSE = sys.maxsize
finalModelWithWeekdays = []
LRfinalModelWithWeekdays = []
LRsoFarBestMSE = sys.maxsize
for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y)
  if metric < soFarBestMSE:
    finalModelWithWeekdays = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRsoFarBestMSE = metric
      LRfinalModelWithWeekdays = [bestModelMSE, imputer]

Model --> LinearRegression
RMSE scores --> [366.9630922513392, 420.9455731493893, 368.21001885634587, 397.56588159415173, 373.7821351532544, 371.86820204177536, 409.9213356980916, 435.8147431639859, 404.18965210750673, 376.5550602419459]
R2 scores  --> [0.583996286477338, 0.54217190401113, 0.6612475518309722, 0.5929043407968502, 0.6151783295982615, 0.6414193597501914, 0.6063298004686608, 0.5945369105218341, 0.5361250099440565, 0.5815857502314202]


<span style='color:None'>**Min RMSE = 366.9630922513392**</span>

<span style='color:None'>**MAX R2 = 0.6612475518309722**</span>

Model --> RidgeCV
RMSE scores --> [365.23496139635535, 421.3561992983337, 368.8521220094676, 398.83230116137554, 373.8535752545069, 371.51891985139747, 409.8410677140666, 436.96694417900187, 403.6876064324556, 375.05695386254035]
R2 scores  --> [0.5879052140932352, 0.5412782593517003, 0.6600650545609965, 0.5903066579285404, 0.6150312154174389, 0.6420926467446357, 0.6064839569702858, 0.592390161032613, 0.5372766564382079, 0.5849084098798176]


<span style='color:None'>**Min RMSE = 365.23496139635535**</span>

<span style='color:None'>**MAX R2 = 0.6600650545609965**</span>

Model --> Lasso
RMSE scores --> [365.1656072574329, 421.4543947018336, 368.09415668244856, 398.6344783226381, 373.4704221092153, 371.4220755220997, 410.1710014233034, 436.62714446283735, 404.529347123995, 376.2335025287327]
R2 scores  --> [0.5880617038518331, 0.5410644278466779, 0.6614607042059883, 0.5907129770752992, 0.615819900746996, 0.6422792147933769, 0.6058501187433881, 0.5930238558974467, 0.5353449690016246, 0.5823000515208224]


<span style='color:None'>**Min RMSE = 365.1656072574329**</span>

<span style='color:None'>**MAX R2 = 0.6614607042059883**</span>

Model --> KNeighborsRegressor
RMSE scores --> [234.66144592348047, 313.53596531481634, 247.50189263903292, 269.089253012744, 240.73901713044364, 239.62482285471876, 257.69260531678333, 277.4580290135387, 273.4802092553846, 232.85430337861862]
R2 scores  --> [0.8298876208245571, 0.7460049350802598, 0.8469447297445276, 0.8135033805595699, 0.8403696721640309, 0.8511076584175962, 0.8444265772666293, 0.8356604539084842, 0.787635272984896, 0.8400007203016931]


<span style='color:None'>**Min RMSE = 232.85430337861862**</span>

<span style='color:None'>**MAX R2 = 0.8511076584175962**</span>

Model --> SGDRegressor
RMSE scores --> [367.7200490723247, 421.4557069082489, 368.59030883097523, 399.05659137465835, 374.96868222809815, 371.08603521768896, 408.88764213671095, 436.73322154175224, 403.6068855057672, 375.0126983181723]
R2 scores  --> [0.5822782846755871, 0.5410615700326553, 0.6605474585223605, 0.5898457321468205, 0.6127312690131298, 0.6429262103853204, 0.6083127237005629, 0.5928260849379157, 0.5374616892314845, 0.5850063631121833]


<span style='color:None'>**Min RMSE = 367.7200490723247**</span>

<span style='color:None'>**MAX R2 = 0.6605474585223605**</span>

Model --> ElasticNet
RMSE scores --> [388.9030784849507, 452.8146073069958, 414.22342467117795, 446.2688520826244, 405.6499742421979, 397.11354460580355, 455.4133652015115, 490.0567875242665, 427.23875232936734, 401.21137460015336]
R2 scores  --> [0.5327651850091191, 0.4702250682682454, 0.5712930246557224, 0.48705428143306306, 0.5467630119105844, 0.591080189865152, 0.5141043201234547, 0.48732716397485065, 0.48171116554699844, 0.5249974243605489]


<span style='color:None'>**Min RMSE = 388.9030784849507**</span>

<span style='color:None'>**MAX R2 = 0.591080189865152**</span>



**Train model with Weekends Data**

In [128]:
X, y = prepareData(dfWithWeekends)
soFarBestMSE = sys.maxsize
finalModelWithWeekends = []
LRfinalModelWithWeekends = []
LRsoFarBestMSE = sys.maxsize
for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y)
  if metric < soFarBestMSE:
    finalModelWithWeekends = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRsoFarBestMSE = metric
      LRfinalModelWithWeekends = [bestModelMSE, imputer]

Model --> LinearRegression
RMSE scores --> [464.2439154617931, 499.8369944321409, 473.52747435809056, 442.10613059986264, 492.275954662797, 445.4422278980246, 457.16531291388407, 448.8152181262841, 480.3775167860583, 452.79264374278375]
R2 scores  --> [0.48045138122385805, 0.41429009209948187, 0.46509248783956814, 0.5113762008988949, 0.5094560111508495, 0.5447987005607539, 0.49170910585974387, 0.49839712430299954, 0.4636869253504916, 0.5332273827289393]


<span style='color:None'>**Min RMSE = 442.10613059986264**</span>

<span style='color:None'>**MAX R2 = 0.5447987005607539**</span>

Model --> RidgeCV
RMSE scores --> [464.31095093943355, 499.85612573987197, 473.82412865818884, 442.0781719048002, 492.68978315754276, 445.6919538352376, 457.1603896966038, 447.9385058736406, 479.5394543900576, 453.1313790390308]
R2 scores  --> [0.4803013277575573, 0.41424525503836873, 0.46442206287892734, 0.5114379998692573, 0.508630919434141, 0.5442881632624431, 0.4917200533790259, 0.5003548648432383, 0.46555658718660503, 0.5325287339982185]


<span style='color:None'>**Min RMSE = 442.0781719048002**</span>

<span style='color:None'>**MAX R2 = 0.5442881632624431**</span>

Model --> Lasso
RMSE scores --> [464.3282831785422, 499.85145336046054, 474.142201729291, 441.9029566708391, 492.9029686751657, 446.2497964253508, 457.6275245303132, 447.77897379551894, 479.4289789845034, 453.1793815833344]
R2 scores  --> [0.48026252742448505, 0.4142562056118566, 0.46370276604698335, 0.5118252008050156, 0.5082055993301544, 0.5431466817465627, 0.49068078321676784, 0.5007106959742637, 0.4658028069757995, 0.5324296854671415]


<span style='color:None'>**Min RMSE = 441.9029566708391**</span>

<span style='color:None'>**MAX R2 = 0.5431466817465627**</span>

Model --> KNeighborsRegressor
RMSE scores --> [282.7003886402948, 306.18814774282464, 285.8999577799652, 246.74272259078919, 295.50148988848736, 291.4169850751383, 293.1749040632053, 256.66261018229176, 308.6000272879205, 264.02415820026835]
R2 scores  --> [0.8073422141530238, 0.7802127328934078, 0.8050081250187969, 0.8478015440356105, 0.8232415999443838, 0.8051724167966438, 0.7909648258595801, 0.8359598784893425, 0.7786676173390888, 0.8412937686777657]


<span style='color:None'>**Min RMSE = 246.74272259078919**</span>

<span style='color:None'>**MAX R2 = 0.8478015440356105**</span>

Model --> SGDRegressor
RMSE scores --> [464.4224452866631, 499.4447104451912, 473.7581991331983, 442.152070684341, 494.1313151799388, 446.27582139371384, 457.65685639118794, 445.98445041286436, 480.0916345879537, 453.8144634052186]
R2 scores  --> [0.4800517087423586, 0.4152090895248477, 0.4645710968358278, 0.5112746479760467, 0.5057513773335065, 0.543093393476425, 0.49061549099382695, 0.5047045902896893, 0.464325076524127, 0.5311182688947983]


<span style='color:None'>**Min RMSE = 442.152070684341**</span>

<span style='color:None'>**MAX R2 = 0.543093393476425**</span>

Model --> ElasticNet
RMSE scores --> [498.1860733074405, 522.4531525658981, 513.4107577896482, 482.74167974511204, 548.6778961270813, 496.4843537156063, 489.5523246809509, 480.3277375409118, 511.1886797401956, 507.7046027813918]
R2 scores  --> [0.40170287047184466, 0.36008765864317616, 0.37119170546465363, 0.4174259493098874, 0.3906095534877567, 0.4345009822489183, 0.4171403092641689, 0.4254865475009672, 0.39268292087398615, 0.41314765134143205]


<span style='color:None'>**Min RMSE = 480.3277375409118**</span>

<span style='color:None'>**MAX R2 = 0.4345009822489183**</span>



**Test 2 models on unseen data**

In [129]:
mask = (testDf['day'] > 4)

dfWithWeekdays = testDf[mask].copy()
dfWithWeekdays = dfWithWeekdays.drop(['datetime'], axis=1)
dfWithWeekdays = dfWithWeekdays.drop(['day'], axis=1)

dfWithWeekends = testDf[~mask].copy()
dfWithWeekends = dfWithWeekends.drop(['datetime'], axis=1)
dfWithWeekends = dfWithWeekends.drop(['day'], axis=1)

testDf = testDf.drop(['datetime'], axis=1)
testDf = testDf.drop(['day'], axis=1)

trainDf = trainDf.drop(['datetime'], axis=1)
trainDf = trainDf.drop(['day'], axis=1)

dfWithWeekdays.reset_index(drop=True, inplace=True)
dfWithWeekends.reset_index(drop=True, inplace=True)

predicted = []
ground_truth = []

LRpredicted = []
LRground_truth = []

#WithPrecipitation
print("Evaluation for Model With Weekdays")
X, y = prepareData(dfWithWeekdays)
LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelWithWeekdays, X, y)
y_test_predict, y1 = evaluateCVModel(finalModelWithWeekdays, X, y)
predicted.extend(y_test_predict)
ground_truth.extend(y1)
LRpredicted.extend(LRy_test_predict)
LRground_truth.extend(LRy1)

#WithoutPrecipitation
print("Evaluation for Model With Weekends")
X, y = prepareData(dfWithWeekends)
LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelWithWeekends, X, y)
y_test_predict, y1 = evaluateCVModel(finalModelWithWeekends, X, y)
predicted.extend(y_test_predict)
ground_truth.extend(y1)
LRpredicted.extend(LRy_test_predict)
LRground_truth.extend(LRy1)

#Metrics On full testset
print("")
printmd("**Final Results on complete testset**")

print('Linear Models Best output')
getFinalMetrics(LRpredicted, LRground_truth)

print("Overall Best Model")
getFinalMetrics(predicted, ground_truth)

Evaluation for Model With Weekdays


<span style='color:None'>**Model : Lasso,  RMSE = 394.2163303586817, R2 =0.5918083312056913**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 251.07267407399527, R2 =0.8344253632664903**</span>

Evaluation for Model With Weekends


<span style='color:None'>**Model : Lasso,  RMSE = 482.19801661349175, R2 =0.45927144029412503**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 312.2418880214827, R2 =0.7732690552901411**</span>




<span style='color:None'>**Final Results on complete testset**</span>

Linear Models Best output


<span style='color:None'>**RMSE = 459.7709591400728, R2 =0.4926407207437466**</span>

Overall Best Model


<span style='color:None'>**RMSE = 296.73992521933764, R2 =0.7886586383127328**</span>

# Experiment with separate models for Holiday & Not Holiday

In [130]:
mask = (trainDf['Holiday'] == 'Holiday')

dfWithHolidays = trainDf[mask].copy()
dfWithHolidays = dfWithHolidays.drop(['Holiday'], axis=1)

dfWithNonHolidays = trainDf[~mask].copy()
dfWithNonHolidays = dfWithNonHolidays.drop(['Holiday'], axis=1)

dfWithHolidays.reset_index(drop=True, inplace=True)
dfWithNonHolidays.reset_index(drop=True, inplace=True)

**Train model with Holidays**

In [131]:
X, y = prepareData(dfWithHolidays)
soFarBestMSE = sys.maxsize
LRsoFarBestMSE = sys.maxsize
finalModelWithHolidays = []
LRfinalModelWithHolidays = []
for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y, withHoliday=False)
  if metric < soFarBestMSE:
    finalModelWithHolidays = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRfinalModelWithHolidays = [bestModelMSE, imputer]
      LRsoFarBestMSE = metric

Model --> LinearRegression
RMSE scores --> [354.08172923927236, 349.0562748935887, 378.11598287771074, 310.89593306962274, 361.90375217236397, 235.57733963390908, 294.8590307456816, 337.4478162036861, 332.01552994244537, 331.0316164817982]
R2 scores  --> [0.6383846367411903, 0.6439383337272293, 0.6085603541457353, 0.7462357155633699, 0.6429565178734433, 0.6544190668330977, 0.7473461961338138, 0.6120015584320712, 0.5231083103641838, 0.5739797116467484]


<span style='color:None'>**Min RMSE = 235.57733963390908**</span>

<span style='color:None'>**MAX R2 = 0.7473461961338138**</span>

Model --> RidgeCV
RMSE scores --> [360.82941898411445, 348.3377646565596, 381.41151093805655, 312.23368848884, 363.72136656801695, 232.8561373768817, 300.9376007892196, 333.74839782097837, 330.8442824952811, 325.3928146084655]
R2 scores  --> [0.6244707964737672, 0.6454026858195321, 0.6017073143707394, 0.7440471703059393, 0.6393611030921476, 0.6623567085882756, 0.736821818015381, 0.6204621296534373, 0.5264670265330644, 0.588369781570576]


<span style='color:None'>**Min RMSE = 232.8561373768817**</span>

<span style='color:None'>**MAX R2 = 0.7440471703059393**</span>

Model --> Lasso
RMSE scores --> [352.5861284794253, 349.04028530906, 377.0132390151583, 313.9397596411751, 363.8307342449017, 235.33631751932685, 293.77636961694446, 332.6015912880352, 329.93928182951146, 329.55785398372944]
R2 scores  --> [0.641433029023351, 0.6439709539556613, 0.6108402271988452, 0.7412424329167753, 0.639144188871898, 0.6551258414195447, 0.7491981744407802, 0.6230659396533929, 0.5290541138626432, 0.5777645774045681]


<span style='color:None'>**Min RMSE = 235.33631751932685**</span>

<span style='color:None'>**MAX R2 = 0.7491981744407802**</span>

Model --> KNeighborsRegressor
RMSE scores --> [165.3850859862118, 130.65052621401875, 253.34915038341848, 160.26184288755164, 274.4742048769923, 164.86763175347673, 172.2488067551454, 215.6610303230512, 138.89633133075495, 193.8317267263393]
R2 scores  --> [0.9211080801397835, 0.9501165310489423, 0.8242668702750973, 0.9325688321692027, 0.7946295243317103, 0.830740308060627, 0.913779722145389, 0.8415252450945746, 0.9165388224641264, 0.8539367156983488]


<span style='color:None'>**Min RMSE = 130.65052621401875**</span>

<span style='color:None'>**MAX R2 = 0.9501165310489423**</span>

Model --> SGDRegressor
RMSE scores --> [353.7101346654514, 346.8847571437704, 376.006795480605, 314.95344424062085, 372.3425489204129, 228.08288053501488, 293.75600275925905, 330.8918037479137, 330.6093053594048, 318.5509486715305]
R2 scores  --> [0.6391432401114516, 0.6483547514816842, 0.6129151917640724, 0.7395687230888907, 0.6220622505576614, 0.6760573538888563, 0.7492329482929483, 0.62693134897021, 0.5271394268979693, 0.6054980647285069]


<span style='color:None'>**Min RMSE = 228.08288053501488**</span>

<span style='color:None'>**MAX R2 = 0.7492329482929483**</span>

Model --> ElasticNet
RMSE scores --> [407.64297780271767, 368.27092490526695, 401.66939107809213, 430.68350202745756, 443.525470852411, 277.1392834058187, 394.14968381861814, 353.29087057675673, 323.7373773927081, 321.13338711140756]
R2 scores  --> [0.5207084537798206, 0.6036588312277885, 0.5582747603102012, 0.5130135384150923, 0.4637441178839396, 0.5217236278159492, 0.5485399357639187, 0.5747135291816845, 0.5465925556667515, 0.5990758174087513]


<span style='color:None'>**Min RMSE = 277.1392834058187**</span>

<span style='color:None'>**MAX R2 = 0.6036588312277885**</span>



**Train model with Non-Holiday Data**

In [132]:
X, y = prepareData(dfWithNonHolidays)
soFarBestMSE = sys.maxsize
finalModelWithNonHolidays = []
LRsoFarBestMSE = sys.maxsize
LRfinalModelWithNonHolidays = []
for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y, withHoliday=False)
  if metric < soFarBestMSE:
    finalModelWithNonHolidays = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRfinalModelWithNonHolidays = [bestModelMSE, imputer]
      LRsoFarBestMSE = metric

Model --> LinearRegression
RMSE scores --> [441.05541605960065, 485.4414403365998, 453.25482494569474, 439.25448164141, 464.1918613662689, 427.17030531068013, 451.4888475769772, 452.3720923736406, 462.25956564381863, 439.6637539555557]
R2 scores  --> [0.49898305872401705, 0.43620143464375793, 0.4968939824748362, 0.5278340585815195, 0.5273873619041881, 0.5670438322934015, 0.510845337964709, 0.5286986355530862, 0.4747862821977905, 0.5416274781932754]


<span style='color:None'>**Min RMSE = 427.17030531068013**</span>

<span style='color:None'>**MAX R2 = 0.5670438322934015**</span>

Model --> RidgeCV
RMSE scores --> [440.7903886191445, 485.4465320067661, 453.5635490133974, 439.4388738110826, 464.14512765618355, 427.4141881017338, 451.46995702632483, 452.09309646823925, 461.87197360957094, 439.6225244706494]
R2 scores  --> [0.4995849937119107, 0.43618960750601743, 0.4962083909519178, 0.5274375596042098, 0.5274825201007001, 0.5665493189834259, 0.5108862701279631, 0.5292797969694917, 0.47566666769996346, 0.5417134419630756]


<span style='color:None'>**Min RMSE = 427.4141881017338**</span>

<span style='color:None'>**MAX R2 = 0.5665493189834259**</span>

Model --> Lasso
RMSE scores --> [440.5951203533117, 485.4654943714337, 454.1996539326249, 439.6027416947663, 464.1310664027287, 427.8401216679247, 451.6455571602349, 451.91332811218075, 461.49517599582504, 439.5603528931921]
R2 scores  --> [0.5000282589431619, 0.436145559865881, 0.4947943044772028, 0.5270850543980917, 0.5275111494539266, 0.5656849902348923, 0.5105057126252341, 0.5296540728592939, 0.4765218265810969, 0.541843054901231]


<span style='color:None'>**Min RMSE = 427.8401216679247**</span>

<span style='color:None'>**MAX R2 = 0.5656849902348923**</span>

Model --> KNeighborsRegressor
RMSE scores --> [272.9156571468847, 315.3248601798587, 296.9350486606846, 255.80632207671178, 293.74294005956096, 291.40555838806415, 288.179596570783, 270.95838339832665, 303.7891325453094, 258.5923713032552]
R2 scores  --> [0.8081671425316533, 0.762115199082261, 0.7840776665335707, 0.8398655392162888, 0.8107459586743961, 0.7985174292990844, 0.8007131691007561, 0.8309122003132036, 0.7731657057985879, 0.8414345132963612]


<span style='color:None'>**Min RMSE = 255.80632207671178**</span>

<span style='color:None'>**MAX R2 = 0.8414345132963612**</span>

Model --> SGDRegressor
RMSE scores --> [442.9698987932647, 485.2919848416159, 454.922452410215, 439.3022228606656, 464.50833584913806, 427.90234340114125, 451.82119548861994, 453.2397756707043, 461.74447936804086, 441.0736679710551]
R2 scores  --> [0.494624105107054, 0.43654854066081306, 0.4931850894203914, 0.5277314164746101, 0.5267427111069954, 0.5655586542679333, 0.5101249243100142, 0.526888919535316, 0.4759560996543982, 0.5386829458780391]


<span style='color:None'>**Min RMSE = 427.90234340114125**</span>

<span style='color:None'>**MAX R2 = 0.5655586542679333**</span>

Model --> ElasticNet
RMSE scores --> [467.29435994489086, 510.2554693380713, 493.7850988122922, 480.55656431116125, 509.44293107379616, 473.88933154270296, 480.2426523207184, 490.2790463642004, 489.47898583107286, 487.899785877513]
R2 scores  --> [0.4375975928113084, 0.3770895622885576, 0.40289514857141917, 0.43486620013271626, 0.4307521902672503, 0.46716139334550033, 0.44655610555154934, 0.44640296801944734, 0.4111124808506492, 0.43553307093950333]


<span style='color:None'>**Min RMSE = 467.29435994489086**</span>

<span style='color:None'>**MAX R2 = 0.46716139334550033**</span>



**Test 2 models on unseen data**

In [133]:
mask = (testDf['Holiday'] == 'Holiday')

dfWithHolidays = testDf[mask].copy()
dfWithHolidays = dfWithHolidays.drop(['Holiday'], axis=1)

dfWithNonHolidays = testDf[~mask].copy()
dfWithNonHolidays = dfWithNonHolidays.drop(['Holiday'], axis=1)

dfWithHolidays.reset_index(drop=True, inplace=True)
dfWithNonHolidays.reset_index(drop=True, inplace=True)

predicted = []
ground_truth = []

LRpredicted = []
LRground_truth = []

#With Holiday
print("Evaluation for Model With Holidays")
X, y = prepareData(dfWithHolidays)
y_test_predict, y1 = evaluateCVModel(finalModelWithHolidays, X, y)
predicted.extend(y_test_predict)
ground_truth.extend(y1)
LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelWithHolidays, X, y)
LRpredicted.extend(LRy_test_predict)
LRground_truth.extend(LRy1)

#With Non-Holiday
print("Evaluation for Model With Non-Holidays")
X, y = prepareData(dfWithNonHolidays)
y_test_predict, y1 = evaluateCVModel(finalModelWithNonHolidays, X, y)
predicted.extend(y_test_predict)
ground_truth.extend(y1)
LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelWithNonHolidays, X, y)
LRpredicted.extend(LRy_test_predict)
LRground_truth.extend(LRy1)

#Metrics On full testset
printmd("**Final Results on complete testset**")

print('Linear Models Best output')
getFinalMetrics(LRpredicted, LRground_truth)

print("Overall Best Model")
getFinalMetrics(predicted, ground_truth)

Evaluation for Model With Holidays


<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 180.69184403287272, R2 =0.9082499549116151**</span>

<span style='color:None'>**Model : RidgeCV,  RMSE = 335.13382924217996, R2 =0.684379121837787**</span>

Evaluation for Model With Non-Holidays


<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 310.66842664775174, R2 =0.7693802176284472**</span>

<span style='color:None'>**Model : LinearRegression,  RMSE = 464.386206911759, R2 =0.484699046696452**</span>

<span style='color:None'>**Final Results on complete testset**</span>

Linear Models Best output


<span style='color:None'>**RMSE = 459.2775128858522, R2 =0.49372917657251547**</span>

Overall Best Model


<span style='color:None'>**RMSE = 305.9389583817461, R2 =0.7753522346187448**</span>

In [134]:
#End

# Experiment with separate models for Day-time & Night-time

In [135]:
day = (trainDf['Hour'] >= 7) & (trainDf['Hour'] <= 18) # 7AM to 6PM
night = ((trainDf['Hour'] >= 0) & (trainDf['Hour'] <= 6)) | ((trainDf['Hour'] >= 19) & (trainDf['Hour'] <= 23)) #7PM to 6AM
trainDay = trainDf[day].copy()
trainNight = trainDf[night].copy()

day = (testDf['Hour'] >= 7) & (testDf['Hour'] <= 18) # 7AM to 6PM
night = ((testDf['Hour'] >= 0) & (testDf['Hour'] <= 6)) | ((testDf['Hour'] >= 19) & (testDf['Hour'] <= 23))
testDay = testDf[day].copy()
testNight = testDf[night].copy()

trainDay.reset_index(drop=True, inplace=True)
trainNight.reset_index(drop=True, inplace=True)
testDay.reset_index(drop=True, inplace=True)
testNight.reset_index(drop=True, inplace=True)

**Train model with Day-time Data**

In [136]:
X, y = prepareData(trainDay)
soFarBestMSE = sys.maxsize
finalModelWithDay = []
LRfinalModelWithDay = []
LRsoFarBestMSE = sys.maxsize

for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y)
  if metric < soFarBestMSE:
    finalModelWithDay = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRsoFarBestMSE = metric
      LRfinalModelWithDay = [bestModelMSE, imputer]

Model --> LinearRegression
RMSE scores --> [451.22339241038645, 501.73208662535285, 418.7611706847837, 425.7470143536772, 460.27179080326, 401.8652311597946, 446.46856773644157, 452.5181210404397, 429.2623074382998, 407.93631672052254]
R2 scores  --> [0.4916488694536732, 0.4481036383030525, 0.5190155175303388, 0.5611666702785131, 0.5421982699375186, 0.6163853100649411, 0.4967851335322701, 0.4877849727062792, 0.5515754158675628, 0.5648720753162658]


<span style='color:None'>**Min RMSE = 401.8652311597946**</span>

<span style='color:None'>**MAX R2 = 0.6163853100649411**</span>

Model --> RidgeCV
RMSE scores --> [451.1926910077157, 501.685813731268, 418.77982633831675, 425.80531335602655, 460.2051498185242, 401.9994133648931, 446.44852343265785, 452.1587396966953, 429.2718697520481, 408.0599396459662]
R2 scores  --> [0.4917180438895, 0.44820543232795534, 0.518972661218142, 0.5610464801370476, 0.5423308270488632, 0.6161290905435585, 0.49683031640049846, 0.48859823263820884, 0.5515554372938495, 0.564608308961354]


<span style='color:None'>**Min RMSE = 401.9994133648931**</span>

<span style='color:None'>**MAX R2 = 0.6161290905435585**</span>

Model --> Lasso
RMSE scores --> [450.51156221542965, 501.8529197389701, 418.66588052000264, 425.7247998326897, 460.8283477372221, 402.4214576701756, 446.16471604856997, 451.29630511149645, 429.8043208624612, 408.5592694585116]
R2 scores  --> [0.49325150913715116, 0.4478377777460798, 0.5192343910987682, 0.5612124637547221, 0.541090460252047, 0.61532264371632, 0.49746984312813974, 0.49054723819235724, 0.5504422831115521, 0.5635421074281745]


<span style='color:None'>**Min RMSE = 402.4214576701756**</span>

<span style='color:None'>**MAX R2 = 0.61532264371632**</span>

Model --> KNeighborsRegressor
RMSE scores --> [319.78595007228876, 394.9255125394477, 337.9521056034689, 293.71689252500516, 375.92312756853505, 317.064000027812, 345.50775711591876, 301.55991357816896, 335.9102372658505, 337.6074797310782]
R2 scores  --> [0.7446714735111623, 0.658064524837634, 0.6867373855351921, 0.791140500654164, 0.6946156318886185, 0.7612033490819615, 0.6986388327162887, 0.7725281887986204, 0.7254064182581124, 0.7019725035426014]


<span style='color:None'>**Min RMSE = 293.71689252500516**</span>

<span style='color:None'>**MAX R2 = 0.791140500654164**</span>

Model --> SGDRegressor
RMSE scores --> [452.2349780835588, 501.19678979293576, 419.5410024972429, 426.03471373243525, 458.71800218298165, 402.93864318585565, 449.614706726305, 449.1252359631078, 430.180382634519, 409.1266981778003]
R2 scores  --> [0.4893669968356748, 0.44928064406440044, 0.5172224370007759, 0.5605733849253562, 0.5452839531414788, 0.6143332461757376, 0.48966811722537773, 0.4954371372875982, 0.5496552485263033, 0.5623289139342975]


<span style='color:None'>**Min RMSE = 402.93864318585565**</span>

<span style='color:None'>**MAX R2 = 0.6143332461757376**</span>

Model --> ElasticNet
RMSE scores --> [486.0120473542857, 545.0443672537904, 464.4473209074055, 480.9175295157063, 531.7001552685736, 466.03832259089603, 485.57269713149697, 488.52844742420973, 493.0719386344177, 464.3403746172496]
R2 scores  --> [0.41024090025391735, 0.3487053794773828, 0.4083414034434447, 0.44006502184783836, 0.3890828928157314, 0.48408563095199786, 0.40477631657053625, 0.40301958782258995, 0.40835053446437286, 0.43622591392364396]


<span style='color:None'>**Min RMSE = 464.3403746172496**</span>

<span style='color:None'>**MAX R2 = 0.48408563095199786**</span>



**Train model with Night-time Data**

In [137]:
X, y = prepareData(trainNight)
soFarBestMSE = sys.maxsize
finalModelWithNight = []
LRfinalModelWithNight = []
LRsoFarBestMSE = sys.maxsize
for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y)
  if metric < soFarBestMSE:
    finalModelWithNight = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRsoFarBestMSE = metric
      LRfinalModelWithNight = [bestModelMSE, imputer]

Model --> LinearRegression
RMSE scores --> [339.05783061470134, 356.30155768497974, 328.256809541209, 343.82855209177586, 345.61123191247236, 342.98591438500847, 340.60946806566847, 331.90965644736036, 369.82235116654164, 347.1848402344603]
R2 scores  --> [0.6845156757933356, 0.6531108689537657, 0.7192760476276208, 0.7018165403856642, 0.703168856320308, 0.6791521227595145, 0.7178083861507212, 0.7401953967585825, 0.6242264682805478, 0.7133367494324162]


<span style='color:None'>**Min RMSE = 328.256809541209**</span>

<span style='color:None'>**MAX R2 = 0.7401953967585825**</span>

Model --> RidgeCV
RMSE scores --> [338.60025547690856, 356.2785308887998, 328.2004714723784, 343.90482925248114, 345.6285611457311, 342.8969161866201, 340.624264383066, 332.1029738113486, 369.7453348799655, 347.6235658699041]
R2 scores  --> [0.6853666242814551, 0.6531557044984455, 0.7193723995551644, 0.7016842237968651, 0.7031390888555331, 0.6793186087702471, 0.7177838684096245, 0.7398926675880595, 0.6243829632486655, 0.7126117985382097]


<span style='color:None'>**Min RMSE = 328.2004714723784**</span>

<span style='color:None'>**MAX R2 = 0.7398926675880595**</span>

Model --> Lasso
RMSE scores --> [338.6301074826654, 355.8381646128085, 328.54463522548775, 345.01888739285204, 345.8357507313858, 342.870307928363, 341.34396550935634, 331.43084714257014, 368.63030448118684, 346.8608268666364]
R2 scores  --> [0.6853111438072149, 0.6540125854150554, 0.7187835369767592, 0.6997483427642179, 0.7027830712611999, 0.6793683755765871, 0.71659002601632, 0.7409444387479109, 0.6266450223279365, 0.7138715618589415]


<span style='color:None'>**Min RMSE = 328.54463522548775**</span>

<span style='color:None'>**MAX R2 = 0.7409444387479109**</span>

Model --> KNeighborsRegressor
RMSE scores --> [230.91195806675645, 263.25094197879343, 211.5925700538791, 223.56714676299794, 235.04581471348652, 238.26113530323937, 212.6212724574676, 216.80581543073177, 247.09224592097044, 230.80644226949875]
R2 scores  --> [0.8536734696219298, 0.8106370031099177, 0.8833585771205287, 0.8739288713183921, 0.862710028713157, 0.8451707485719903, 0.890037628626327, 0.8891466248747162, 0.8322517069385843, 0.8733088239958909]


<span style='color:None'>**Min RMSE = 211.5925700538791**</span>

<span style='color:None'>**MAX R2 = 0.890037628626327**</span>

Model --> SGDRegressor
RMSE scores --> [337.9209845516961, 356.3478311610955, 328.28180391661584, 345.78468380992257, 346.4188890945082, 341.37574977496985, 340.0076502696351, 332.43056817146925, 371.43908183658937, 348.0397193385248]
R2 scores  --> [0.6866277392750576, 0.6530207609656932, 0.719233295813944, 0.6984139994724234, 0.7017799088329054, 0.682157524621126, 0.718804705500532, 0.739379262239158, 0.6209337925858, 0.711923299851369]


<span style='color:None'>**Min RMSE = 328.28180391661584**</span>

<span style='color:None'>**MAX R2 = 0.739379262239158**</span>

Model --> ElasticNet
RMSE scores --> [369.1463190110432, 390.28895538717137, 373.1508378811343, 403.5142904305999, 391.2007053947612, 363.74358718414976, 391.18722670527717, 377.5865382689215, 383.40324490690574, 399.872836903535]
R2 scores  --> [0.6260381399204131, 0.5837753722785022, 0.6372388050277097, 0.5893067999695873, 0.6196941308901069, 0.6391412077757206, 0.627779745159694, 0.6637672048684686, 0.5961208400770991, 0.6197280237426306]


<span style='color:None'>**Min RMSE = 363.74358718414976**</span>

<span style='color:None'>**MAX R2 = 0.6637672048684686**</span>



**Test 2 models on unseen data**

In [138]:
testDay = testDf[day].copy()
testNight = testDf[night].copy()

predicted = []
ground_truth = []

LRpredicted = []
LRground_truth = []

#With Day-time
print("Evaluation for Model With Day-time")
X, y = prepareData(testDay)
LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelWithDay, X, y)
y_test_predict, y1 = evaluateCVModel(finalModelWithDay, X, y)
predicted.extend(y_test_predict)
ground_truth.extend(y1)
LRpredicted.extend(LRy_test_predict)
LRground_truth.extend(LRy1)

#With Night-time
print("Evaluation for Model With Night-time")
X, y = prepareData(testNight)
LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelWithNight, X, y)
y_test_predict, y1 = evaluateCVModel(finalModelWithNight, X, y)
predicted.extend(y_test_predict)
ground_truth.extend(y1)
LRpredicted.extend(LRy_test_predict)
LRground_truth.extend(LRy1)

#Metrics On full testset
printmd("**Final Results on complete testset**")
print('Linear Models Best output')
getFinalMetrics(LRpredicted, LRground_truth)

print("Overall Best Model")
getFinalMetrics(predicted, ground_truth)

Evaluation for Model With Day-time


<span style='color:None'>**Model : LinearRegression,  RMSE = 455.94145042416085, R2 =0.5175410728914348**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 371.91530132395064, R2 =0.6789813332500849**</span>

Evaluation for Model With Night-time


<span style='color:None'>**Model : RidgeCV,  RMSE = 342.1207646961955, R2 =0.6744532163537097**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 240.36059862992806, R2 =0.8393128697203489**</span>

<span style='color:None'>**Final Results on complete testset**</span>

Linear Models Best output


<span style='color:None'>**RMSE = 402.1031043699273, R2 =0.6119323975341436**</span>

Overall Best Model


<span style='color:None'>**RMSE = 312.02160664559534, R2 =0.7663305840725535**</span>

# Experiment with separate models based on 6 Hrs Time-slot

*  Offtime - 12AM to 6AM

*  Peaktime - 7AM to 11PM

In [139]:
day = (trainDf['Hour'] >= 7) & (trainDf['Hour'] <= 23) # 7AM to 11PM
night = ((trainDf['Hour'] >= 0) & (trainDf['Hour'] <= 6)) #12AM to 6AM - Offtime
trainPeaktime = trainDf[day].copy()
trainOfftime = trainDf[night].copy()

day = (testDf['Hour'] >= 7) & (testDf['Hour'] <= 23) # 7AM to 11PM
night = ((testDf['Hour'] >= 0) & (testDf['Hour'] <= 6))
testPeaktime = testDf[day].copy()
testOfftime = testDf[night].copy()

trainPeaktime.reset_index(drop=True, inplace=True)
trainOfftime.reset_index(drop=True, inplace=True)
testPeaktime.reset_index(drop=True, inplace=True)
testOfftime.reset_index(drop=True, inplace=True)

**Training Peaktime**

In [140]:
X, y = prepareData(trainPeaktime)
soFarBestMSE = sys.maxsize
finalModelWithPeakTime = []
LRfinalModelWithPeakTime = []
LRsoFarBestMSE = sys.maxsize
for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y)
  if metric < soFarBestMSE:
    finalModelWithPeakTime = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRsoFarBestMSE = metric
      LRfinalModelWithPeakTime = [bestModelMSE, imputer]

Model --> LinearRegression
RMSE scores --> [459.18396690887334, 505.37203332556845, 446.5769521740289, 441.7642857197718, 479.0900300418251, 441.17116862453105, 468.0074648981901, 478.299693057098, 471.4797307826033, 449.8824284765776]
R2 scores  --> [0.5094970609785205, 0.46946074951966943, 0.538708631245443, 0.5748464120831691, 0.5204937942211656, 0.5725080396084145, 0.5224347236270375, 0.5161797573546967, 0.5051837242688909, 0.5610926308521667]


<span style='color:None'>**Min RMSE = 441.17116862453105**</span>

<span style='color:None'>**MAX R2 = 0.5748464120831691**</span>

Model --> RidgeCV
RMSE scores --> [459.1798896176363, 505.3543353536008, 446.64839209154877, 441.8119569514291, 479.02605726957984, 441.251187423446, 467.9941301788469, 478.12221460564706, 471.4873494342911, 449.85477967852]
R2 scores  --> [0.5095057717114732, 0.46949790750925113, 0.538561031808938, 0.5747546496234908, 0.5206218423596521, 0.5723529501651972, 0.5224619373305917, 0.5165387445654863, 0.5051677326447912, 0.5611465777838273]


<span style='color:None'>**Min RMSE = 441.251187423446**</span>

<span style='color:None'>**MAX R2 = 0.5747546496234908**</span>

Model --> Lasso
RMSE scores --> [458.8748061691531, 505.33527223918134, 447.14716870668616, 442.14217003400074, 478.8054378655309, 441.8671188893016, 467.75273509463983, 477.476678834448, 471.5890400891797, 450.1650423212153]
R2 scores  --> [0.5101573331685023, 0.4695379302443434, 0.5375298695843687, 0.5741187498847912, 0.5210633038104406, 0.5711582333263417, 0.5229544461184201, 0.5178433517975187, 0.5049542582264811, 0.5605410189098633]


<span style='color:None'>**Min RMSE = 441.8671188893016**</span>

<span style='color:None'>**MAX R2 = 0.5741187498847912**</span>

Model --> KNeighborsRegressor
RMSE scores --> [306.7310561439986, 365.13784193706607, 308.79422126124274, 291.3192158821122, 321.71086685268597, 335.4031836665867, 318.39039498975217, 305.6920173916583, 338.5723581306378, 300.7095164434908]
R2 scores  --> [0.7811312877376834, 0.7230452624484388, 0.7794428980648163, 0.8151144762037357, 0.7837822082397292, 0.7529140405975652, 0.7789721003699194, 0.8023703840564906, 0.7448351083676461, 0.8039038100891803]


<span style='color:None'>**Min RMSE = 291.3192158821122**</span>

<span style='color:None'>**MAX R2 = 0.8151144762037357**</span>

Model --> SGDRegressor
RMSE scores --> [458.98380044308595, 506.4657212865142, 448.4752470203694, 443.4930509331197, 478.94589333526085, 442.549852558649, 468.5868116401324, 479.0606156643988, 471.07308485563834, 448.8682485401121]
R2 scores  --> [0.509924605697601, 0.4671619588732354, 0.5347786113363608, 0.5715123767885142, 0.5207822746495869, 0.5698319934102242, 0.5212516350811277, 0.514639122439432, 0.5060369030469016, 0.5630692775723574]


<span style='color:None'>**Min RMSE = 442.549852558649**</span>

<span style='color:None'>**MAX R2 = 0.5715123767885142**</span>

Model --> ElasticNet
RMSE scores --> [496.080671047438, 547.3185311398463, 504.08268797447477, 510.89921960498225, 535.0058519278664, 499.4914017380605, 512.182014105167, 522.5769088763017, 515.7602052108282, 519.0368648099541]
R2 scores  --> [0.4275035548280529, 0.377734941410126, 0.4122586005659949, 0.43136299075645557, 0.40203325702542925, 0.4520136514235167, 0.42802660474963067, 0.42245708322472286, 0.4078747481410322, 0.41578699457090595]


<span style='color:None'>**Min RMSE = 496.080671047438**</span>

<span style='color:None'>**MAX R2 = 0.4520136514235167**</span>



**Training Offtime**

In [141]:
X, y = prepareData(trainOfftime)
soFarBestMSE = sys.maxsize
finalModelWithOffTime = []
LRfinalModelWithOffTime = []
LRsoFarBestMSE = sys.maxsize
for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y)
  if metric < soFarBestMSE:
    finalModelWithOffTime = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRsoFarBestMSE = metric
      LRfinalModelWithOffTime = [bestModelMSE, imputer]

Model --> LinearRegression
RMSE scores --> [160.31183828041372, 177.13518199001993, 153.17075947684205, 135.44700510416865, 150.3091801962329, 140.23224777506113, 139.94750328510784, 147.7669716470439, 144.00507129471936, 173.3814774470394]
R2 scores  --> [0.6348879480099341, 0.601837513100748, 0.6966666395878987, 0.6632626957799543, 0.705971489822049, 0.6837829956635088, 0.6715460847874597, 0.6898977339752178, 0.6647528481826932, 0.6337822663291668]


<span style='color:None'>**Min RMSE = 135.44700510416865**</span>

<span style='color:None'>**MAX R2 = 0.705971489822049**</span>

Model --> RidgeCV
RMSE scores --> [160.41284989346795, 177.11014922537032, 153.21051209280654, 135.31362245879535, 150.39171157732508, 140.2274425116442, 139.77674884049162, 147.76997679186618, 143.91256111916186, 173.36830475029]
R2 scores  --> [0.6344276928368717, 0.6019500418873895, 0.6965091701155308, 0.6639255793515485, 0.7056485123284665, 0.6838046665700628, 0.6723471101580323, 0.6898851207141513, 0.6651834415059044, 0.633837911164657]


<span style='color:None'>**Min RMSE = 135.31362245879535**</span>

<span style='color:None'>**MAX R2 = 0.7056485123284665**</span>

Model --> Lasso
RMSE scores --> [160.42494801956752, 176.52939494992725, 153.52331899140103, 135.35980330356097, 151.21978089278988, 140.50009813792823, 140.24083994972318, 148.0524366638693, 143.40028987857562, 172.77054796679295]
R2 scores  --> [0.634372548792587, 0.6045562189755614, 0.6952686424726302, 0.6636961442245797, 0.702398140772929, 0.6825738625270528, 0.6701677315011283, 0.6886984286780552, 0.667562825579638, 0.6363585396219222]


<span style='color:None'>**Min RMSE = 135.35980330356097**</span>

<span style='color:None'>**MAX R2 = 0.702398140772929**</span>

Model --> KNeighborsRegressor
RMSE scores --> [140.16401806734964, 137.8011297941676, 130.91612342469134, 111.82628159293307, 115.00268234181674, 120.49811150738957, 120.54359389680005, 133.78415157193567, 113.82628714545774, 144.13162429929358]
R2 scores  --> [0.720894716215873, 0.7590337377718085, 0.7784076727678553, 0.77046961472883, 0.8278787976379018, 0.7665198786220991, 0.756312976054237, 0.74580937350932, 0.7905431037441438, 0.7469230838316174]


<span style='color:None'>**Min RMSE = 111.82628159293307**</span>

<span style='color:None'>**MAX R2 = 0.8278787976379018**</span>

Model --> SGDRegressor
RMSE scores --> [161.3704507681272, 177.10758924889473, 153.45034277350751, 135.0661407953113, 151.31155889677254, 141.03042439292383, 138.32197111358462, 147.63454090754016, 143.677585353611, 172.70209481266687]
R2 scores  --> [0.6300500228726159, 0.6019615487509877, 0.6955582773856931, 0.6651537808625922, 0.7020367913128528, 0.6801730511298771, 0.6791319525924081, 0.6904533205115553, 0.6662759041588262, 0.6366466381889055]


<span style='color:None'>**Min RMSE = 135.0661407953113**</span>

<span style='color:None'>**MAX R2 = 0.7020367913128528**</span>

Model --> ElasticNet
RMSE scores --> [191.00904689172052, 199.12305921577942, 185.31466309825467, 157.61745416400447, 186.96401911261785, 167.9429344133299, 163.02219668565732, 179.8755632964749, 168.98833471043463, 201.46629597383367]
R2 scores  --> [0.48167416552904574, 0.4968542641212931, 0.5559948436965205, 0.5440039985848916, 0.5450805536439209, 0.5464628112563932, 0.5543051508712503, 0.5404904646958824, 0.5383392249789838, 0.5055313344708192]


<span style='color:None'>**Min RMSE = 157.61745416400447**</span>

<span style='color:None'>**MAX R2 = 0.5559948436965205**</span>



**Testing both the models on unseen data**

In [142]:
predicted = []
ground_truth = []

LRpredicted = []
LRground_truth = []

#With Peaktime
print("Evaluation for Model With Peak Time")
X, y = prepareData(testPeaktime)
LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelWithPeakTime, X, y)
y_test_predict, y1 = evaluateCVModel(finalModelWithPeakTime, X, y)
predicted.extend(y_test_predict)
ground_truth.extend(y1)
LRpredicted.extend(LRy_test_predict)
LRground_truth.extend(LRy1)

#With Offtime
print("Evaluation for Model With Off-time")
X, y = prepareData(testOfftime)
LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelWithOffTime, X, y)
y_test_predict, y1 = evaluateCVModel(finalModelWithOffTime, X, y)
predicted.extend(y_test_predict)
ground_truth.extend(y1)
LRpredicted.extend(LRy_test_predict)
LRground_truth.extend(LRy1)

#Metrics On full testset
printmd("**Final Results on complete testset**")

print('Linear Models Best output')
getFinalMetrics(LRpredicted, LRground_truth)

print("Overall Best Model")
getFinalMetrics(predicted, ground_truth)

Evaluation for Model With Peak Time


<span style='color:None'>**Model : LinearRegression,  RMSE = 473.1006327032698, R2 =0.5163774483613727**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 342.47009814504514, R2 =0.7465777147642559**</span>

Evaluation for Model With Off-time


<span style='color:None'>**Model : RidgeCV,  RMSE = 158.5815978153015, R2 =0.619842505719126**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 123.82153090765263, R2 =0.7682336847113371**</span>

<span style='color:None'>**Final Results on complete testset**</span>

Linear Models Best output


<span style='color:None'>**RMSE = 405.88547786128794, R2 =0.6045973628736554**</span>

Overall Best Model


<span style='color:None'>**RMSE = 294.9021135484493, R2 =0.791268350210807**</span>

# Experiment with separate models based on Different Time-slots

5-8, 9-10, 11-18, 19-4 

In [151]:
model1 = (trainDf['Hour'] >= 5) & (trainDf['Hour'] <= 8) 
model2 = ((trainDf['Hour'] >= 9) & (trainDf['Hour'] <= 10)) 
model3 = (trainDf['Hour'] >= 11) & (trainDf['Hour'] <= 17) 
model4 = ((trainDf['Hour'] >= 18) | (trainDf['Hour'] <= 4)) 
train1 = trainDf[model1].copy() #.drop("Dew point temperature(C)",axis=1)
train2 = trainDf[model2].copy() #.drop("Dew point temperature(C)",axis=1)
train3 = trainDf[model3].copy() #.drop("Dew point temperature(C)",axis=1)
train4 = trainDf[model4].copy() #.drop("Dew point temperature(C)",axis=1)

m1 = (testDf['Hour'] >= 5) & (testDf['Hour'] <= 8) # 7AM to 11PM
m2 = ((testDf['Hour'] >= 9) & (testDf['Hour'] <= 10))
m3 = (testDf['Hour'] >= 11) & (testDf['Hour'] <= 17) # 7AM to 11PM
m4 = ((testDf['Hour'] >= 18) | (testDf['Hour'] <= 4))
test1 = testDf[m1].copy() #.drop("Dew point temperature(C)",axis=1)
test2 = testDf[m2].copy() #.drop("Dew point temperature(C)",axis=1)
test3 = testDf[m3].copy() #.drop("Dew point temperature(C)",axis=1)
test4 = testDf[m4].copy() #.drop("Dew point temperature(C)",axis=1)

train1.reset_index(drop=True, inplace=True)
train2.reset_index(drop=True, inplace=True)
train3.reset_index(drop=True, inplace=True)
train4.reset_index(drop=True, inplace=True)
test1.reset_index(drop=True, inplace=True)
test2.reset_index(drop=True, inplace=True)
test3.reset_index(drop=True, inplace=True)
test4.reset_index(drop=True, inplace=True)

**Training Peaktime**

In [152]:
X, y = prepareData(train1)
soFarBestMSE = sys.maxsize
finalModelWithTime = []
LRfinalModelWithTime = []
LRsoFarBestMSE = sys.maxsize
for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y)
  if metric < soFarBestMSE:
    finalModelWithTime = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRsoFarBestMSE = metric
      LRfinalModelWithTime = [bestModelMSE, imputer]

X, y = prepareData(train2)
soFarBestMSE = sys.maxsize
finalModelWithTime2 = []
LRfinalModelWithTime2 = []
LRsoFarBestMSE = sys.maxsize
for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y)
  if metric < soFarBestMSE:
    finalModelWithTime2 = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRsoFarBestMSE = metric
      LRfinalModelWithTime2 = [bestModelMSE, imputer]

X, y = prepareData(train3)
soFarBestMSE = sys.maxsize
finalModelWithTime3 = []
LRfinalModelWithTime3 = []
LRsoFarBestMSE = sys.maxsize
for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y)
  if metric < soFarBestMSE:
    finalModelWithTime3 = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRsoFarBestMSE = metric
      LRfinalModelWithTime3 = [bestModelMSE, imputer]

X, y = prepareData(train4)
soFarBestMSE = sys.maxsize
finalModelWithTime4 = []
LRfinalModelWithTime4 = []
LRsoFarBestMSE = sys.maxsize
for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y)
  if metric < soFarBestMSE:
    finalModelWithTime4 = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRsoFarBestMSE = metric
      LRfinalModelWithTime4 = [bestModelMSE, imputer]

Model --> LinearRegression
RMSE scores --> [370.42832802071837, 300.7083417113362, 374.826371453365, 333.3382831179428, 350.33122946358793, 354.44312704150246, 390.20417175539944, 405.9365515300386, 376.55356371129005, 394.43421821258096]
R2 scores  --> [0.5373474375674354, 0.6905873159282168, 0.6025290579462672, 0.7008154505071666, 0.6451616341446302, 0.6052054289123854, 0.39895172902901, 0.49756283637626975, 0.5501118692890203, 0.5349881453600271]


<span style='color:None'>**Min RMSE = 300.7083417113362**</span>

<span style='color:None'>**MAX R2 = 0.7008154505071666**</span>

Model --> RidgeCV
RMSE scores --> [370.33703567762785, 300.6660428072252, 374.8870559258107, 333.35921853524184, 350.0293235232429, 354.3928164758414, 389.1833959880983, 405.7233624654674, 376.49539429895674, 396.13473400014954]
R2 scores  --> [0.5375754516184199, 0.6906743563933131, 0.602400346261016, 0.7007778685647225, 0.6457729507592467, 0.6053174972642337, 0.40209230541742513, 0.498090435980261, 0.5502508545987952, 0.5309699108512174]


<span style='color:None'>**Min RMSE = 300.6660428072252**</span>

<span style='color:None'>**MAX R2 = 0.7007778685647225**</span>

Model --> Lasso
RMSE scores --> [370.90370835837194, 301.12675358099995, 375.22529194347845, 333.9885654237233, 349.98240955766676, 353.88569846870814, 389.62608170478256, 405.019381141342, 376.6275048171656, 395.1047218531673]
R2 scores  --> [0.5361592074054455, 0.6897256703421529, 0.6016825663923169, 0.6996470027249165, 0.6458678975583869, 0.6064462300844089, 0.40073132375588727, 0.4998306779861118, 0.5499351693709355, 0.533405842726455]


<span style='color:None'>**Min RMSE = 301.12675358099995**</span>

<span style='color:None'>**MAX R2 = 0.6996470027249165**</span>

Model --> KNeighborsRegressor
RMSE scores --> [348.76227700551124, 345.84447120182443, 429.00597908066095, 324.95166249231863, 343.43949533581855, 373.91958354239983, 441.12411503970765, 399.1179500936491, 389.7978365657054, 442.6491086530101]
R2 scores  --> [0.589885052850622, 0.5907310131465746, 0.4793189202883662, 0.7156807270620318, 0.6589851128757368, 0.5606258767004133, 0.23184805252857954, 0.5143001605446975, 0.5179080718835384, 0.41435553313276785]


<span style='color:None'>**Min RMSE = 324.95166249231863**</span>

<span style='color:None'>**MAX R2 = 0.7156807270620318**</span>

Model --> SGDRegressor
RMSE scores --> [371.44555626535384, 300.94704993114163, 376.1556630780222, 333.471915478357, 346.3242860556468, 359.3792713106045, 389.74208608900614, 405.1213601052863, 377.01094432031704, 396.5382112742433]
R2 scores  --> [0.5348029807621559, 0.6900958851545628, 0.5997048613659268, 0.7005755215608984, 0.6532322031024194, 0.5941326642757696, 0.4003744269982342, 0.4995787731486094, 0.549018292723332, 0.5300139767160322]


<span style='color:None'>**Min RMSE = 300.94704993114163**</span>

<span style='color:None'>**MAX R2 = 0.7005755215608984**</span>

Model --> ElasticNet
RMSE scores --> [398.25678901104806, 336.43333579330243, 416.25838388955344, 367.87726983784864, 375.589520200788, 375.53468633873456, 382.75564091111414, 420.1674971811373, 398.6877617264265, 417.43463224129766]
R2 scores  --> [0.4652226992181073, 0.6127020437411306, 0.5098024974980995, 0.6356030922100413, 0.592150629801591, 0.556822026895989, 0.42167930253937425, 0.4617173935533725, 0.4956676571593191, 0.4791750139992939]


<span style='color:None'>**Min RMSE = 336.43333579330243**</span>

<span style='color:None'>**MAX R2 = 0.6356030922100413**</span>

Model --> LinearRegression
RMSE scores --> [195.9778830294308, 202.90943143091698, 251.58785589101817, 213.18829652958703, 194.29398507067611, 205.41449058085345, 193.49623787626203, 208.63306887783858, 178.1438998291754, 203.9751089548516]
R2 scores  --> [0.7051000487145893, 0.6968340451187468, 0.5279483347856284, 0.7044997001632298, 0.6197386352376469, 0.6684590289886234, 0.7215525645927108, 0.676542125438516, 0.7490179908032119, 0.5283015046577635]


<span style='color:None'>**Min RMSE = 178.1438998291754**</span>

<span style='color:None'>**MAX R2 = 0.7490179908032119**</span>

Model --> RidgeCV
RMSE scores --> [195.60971171693473, 202.1091764284023, 253.38657731700306, 213.53028702237117, 194.15841087837808, 204.1050169418037, 193.71782352533887, 209.00949054471914, 178.16696506201978, 203.7808484046262]
R2 scores  --> [0.7062070278813521, 0.6992206434651168, 0.5211743614248989, 0.7035508736826077, 0.6202691266047483, 0.6726725620483944, 0.7209144613945768, 0.6753738888663722, 0.748952994659634, 0.5291995433994702]


<span style='color:None'>**Min RMSE = 178.16696506201978**</span>

<span style='color:None'>**MAX R2 = 0.748952994659634**</span>

Model --> Lasso
RMSE scores --> [193.24724915340084, 200.89854256659692, 252.61974198412702, 215.5073415181861, 192.13431170041565, 200.40471547282624, 196.68868422757959, 211.87482682031174, 178.6259663445466, 204.00882435585095]
R2 scores  --> [0.713260701971171, 0.7028131878523615, 0.524068159623857, 0.6980358773119779, 0.6281452371960583, 0.6844334776834132, 0.7122886986462069, 0.666412201740156, 0.7476578125954939, 0.5281455560387218]


<span style='color:None'>**Min RMSE = 178.6259663445466**</span>

<span style='color:None'>**MAX R2 = 0.7476578125954939**</span>

Model --> KNeighborsRegressor
RMSE scores --> [221.72221048124533, 215.99334121281578, 240.84845285236796, 221.16195813054722, 236.58586106353104, 278.7817030061953, 214.01551345638475, 164.15504865827307, 250.35648408381002, 178.4336881102657]
R2 scores  --> [0.6225330116916747, 0.6564763145252919, 0.5673886515780058, 0.6819817378473608, 0.436179238512321, 0.3893342187234312, 0.6593654606185249, 0.7997558099563568, 0.5043006282179885, 0.6390360356830642]


<span style='color:None'>**Min RMSE = 164.15504865827307**</span>

<span style='color:None'>**MAX R2 = 0.7997558099563568**</span>

Model --> SGDRegressor
RMSE scores --> [194.35633465479003, 200.2440324934618, 254.52685767859205, 218.11218573578282, 194.4349544053584, 198.9033215082159, 196.1632800419388, 213.4561330989837, 179.60112665693558, 205.05951936218912]
R2 scores  --> [0.7099599462207395, 0.7047464513636352, 0.5168550797027329, 0.6906920611910352, 0.6191866404025222, 0.6891440943964655, 0.7138237419393794, 0.6614142227361185, 0.744895103691311, 0.5232727100656944]


<span style='color:None'>**Min RMSE = 179.60112665693558**</span>

<span style='color:None'>**MAX R2 = 0.744895103691311**</span>

Model --> ElasticNet
RMSE scores --> [250.19203948233547, 243.75274200868438, 271.2884280393495, 263.9191441916292, 222.36953276061476, 280.9687386673676, 237.6979007903984, 256.9337776158934, 246.91397157278965, 220.40819997239274]
R2 scores  --> [0.5193736783582505, 0.5625030992321567, 0.4511259173553265, 0.5471305803558071, 0.501902848990595, 0.37971531980452145, 0.579806950996287, 0.5094380717380695, 0.5178390757230007, 0.44923588140779835]


<span style='color:None'>**Min RMSE = 220.40819997239274**</span>

<span style='color:None'>**MAX R2 = 0.579806950996287**</span>

Model --> LinearRegression
RMSE scores --> [372.5712308311842, 395.5595496475551, 349.2035523287872, 399.2155583664497, 356.8521972693629, 377.5269392364872, 306.17746014233234, 324.4885065963169, 365.2147479646782, 363.15240613404313]
R2 scores  --> [0.5688475683923486, 0.4927426321145597, 0.6222083820427478, 0.4812541998729738, 0.5966743127323347, 0.5490279247369617, 0.6522544047344633, 0.612626762397371, 0.5586060389621703, 0.6045641038879147]


<span style='color:None'>**Min RMSE = 306.17746014233234**</span>

<span style='color:None'>**MAX R2 = 0.6522544047344633**</span>

Model --> RidgeCV
RMSE scores --> [372.40627720801746, 395.7882651089297, 349.2639650351855, 399.32773484788555, 356.8222254284085, 377.4049585578784, 306.16348705553276, 324.38823595842257, 365.2913307471102, 363.1339549619135]
R2 scores  --> [0.5692292640566446, 0.4921558625919752, 0.6220776537716031, 0.48096263180645193, 0.5967420601403326, 0.5493192999321397, 0.6522861442916934, 0.612866130877384, 0.558420905574161, 0.6046042857526459]


<span style='color:None'>**Min RMSE = 306.16348705553276**</span>

<span style='color:None'>**MAX R2 = 0.6522861442916934**</span>

Model --> Lasso
RMSE scores --> [372.2812218258017, 396.34677154820776, 349.63079751928376, 400.4557811463635, 356.325959701395, 377.3685356386577, 306.7597752965893, 324.0450976688312, 365.01213047934493, 363.66209937411423]
R2 scores  --> [0.5695185242406038, 0.49072158892858175, 0.621283372020245, 0.4780260706463555, 0.597862976471114, 0.5494062850883181, 0.6509304007164101, 0.6136847190971491, 0.5590956650046228, 0.6034533171185472]


<span style='color:None'>**Min RMSE = 306.7597752965893**</span>

<span style='color:None'>**MAX R2 = 0.6509304007164101**</span>

Model --> KNeighborsRegressor
RMSE scores --> [293.1381691516685, 270.63451655950695, 282.5634471380483, 297.1737936120144, 238.25666136107682, 268.1953812737747, 272.802710808361, 274.4019117475891, 236.79729563386528, 280.4659812421933]
R2 scores  --> [0.7330948530062817, 0.762550648829968, 0.7526414238912521, 0.7125510719292949, 0.8202084668905297, 0.7724083390577786, 0.7239342250885813, 0.7229837527586374, 0.8144404759419313, 0.7641376498886914]


<span style='color:None'>**Min RMSE = 236.79729563386528**</span>

<span style='color:None'>**MAX R2 = 0.8202084668905297**</span>

Model --> SGDRegressor
RMSE scores --> [372.0533626331571, 395.86181272918253, 350.0258243358382, 402.7403134153079, 356.3631323740041, 378.0520964416527, 306.3872892899296, 325.9141564952619, 365.65983478577147, 361.8644434172793]
R2 scores  --> [0.5700453256876896, 0.4919671040995147, 0.6204271102788135, 0.472053537596026, 0.5977790685365809, 0.5477724066186466, 0.6517776082734656, 0.6092154132240364, 0.5575295306994162, 0.6073640497202244]


<span style='color:None'>**Min RMSE = 306.3872892899296**</span>

<span style='color:None'>**MAX R2 = 0.6517776082734656**</span>

Model --> ElasticNet
RMSE scores --> [427.274556548881, 426.6558450907356, 420.4839236048315, 439.6048223540782, 398.09549940935926, 422.5074016505756, 367.5518934383308, 368.87790211865246, 402.42025898658596, 428.3553007057687]
R2 scores  --> [0.43294358107295494, 0.4098532553604216, 0.45223559909799127, 0.37097982873933677, 0.49805784875641335, 0.4351639600923566, 0.49886754015004375, 0.49939376472272823, 0.4640929800123712, 0.4498178080436597]


<span style='color:None'>**Min RMSE = 367.5518934383308**</span>

<span style='color:None'>**MAX R2 = 0.49939376472272823**</span>

Model --> LinearRegression
RMSE scores --> [417.03162888829087, 386.61310569198804, 404.871259522881, 366.79156633311703, 405.6751391776213, 354.61227519559804, 368.6349516892332, 403.5997671097828, 380.45592472093364, 380.70072496769023]
R2 scores  --> [0.6905533022377495, 0.6890242006932148, 0.7094451055664039, 0.7150190898964011, 0.678559087823678, 0.7541588553388763, 0.7420475661572826, 0.6859102874642234, 0.6985799387018181, 0.7417943323737677]


<span style='color:None'>**Min RMSE = 354.61227519559804**</span>

<span style='color:None'>**MAX R2 = 0.7541588553388763**</span>

Model --> RidgeCV
RMSE scores --> [417.174290383985, 386.7253497259887, 404.80647618016474, 365.59254466592813, 405.68103727636833, 354.05145627945325, 368.73438467863724, 403.88191458529053, 380.6099221910556, 380.79401786105313]
R2 scores  --> [0.690341550051051, 0.6888436054437208, 0.7095380813502874, 0.7168792187581967, 0.6785497409160646, 0.7549358354460752, 0.7419083908528732, 0.68547098792698, 0.6983358772051811, 0.7416677673121376]


<span style='color:None'>**Min RMSE = 354.05145627945325**</span>

<span style='color:None'>**MAX R2 = 0.7549358354460752**</span>

Model --> Lasso
RMSE scores --> [416.7840985829212, 386.6758424175742, 405.3160162494455, 363.259818349575, 406.2479089288767, 353.8733687526516, 369.0972300079894, 404.7516411328637, 380.7184633507236, 380.7330832706901]
R2 scores  --> [0.6909205391472584, 0.68892326678262, 0.7088063977346667, 0.720480694064541, 0.6776507669213203, 0.755182307500005, 0.7414002016811336, 0.6841149045759647, 0.6981637973902366, 0.7417504372521713]


<span style='color:None'>**Min RMSE = 353.8733687526516**</span>

<span style='color:None'>**MAX R2 = 0.755182307500005**</span>

Model --> KNeighborsRegressor
RMSE scores --> [281.8788238432773, 329.5134072731418, 326.3390123165668, 275.0696956433047, 322.97925342414004, 267.62478516511675, 324.2019435255725, 285.86607840492144, 288.50236120759956, 259.65510186681456]
R2 scores  --> [0.858624952068127, 0.7740982326586964, 0.8112302991007418, 0.839726182914966, 0.7962518768818059, 0.8599768953597757, 0.8004839500237639, 0.8424286898628197, 0.8266746066639883, 0.8798864276375232]


<span style='color:None'>**Min RMSE = 259.65510186681456**</span>

<span style='color:None'>**MAX R2 = 0.8798864276375232**</span>

Model --> SGDRegressor
RMSE scores --> [418.68521334019465, 387.0473572561315, 404.6848728027571, 359.5578956581582, 407.06610285990985, 357.3318757674957, 372.93750240384804, 406.4634396207177, 383.7584175958128, 383.5877686600759]
R2 scores  --> [0.6880944445225838, 0.6883252198979266, 0.7097125639595275, 0.7261487381383707, 0.6763510197756704, 0.7503735746459076, 0.7359910037365998, 0.6814373363095123, 0.6933243602433321, 0.7378632780902841]


<span style='color:None'>**Min RMSE = 357.3318757674957**</span>

<span style='color:None'>**MAX R2 = 0.7503735746459076**</span>

Model --> ElasticNet
RMSE scores --> [461.5835037831147, 429.9364948709761, 462.4878800361987, 404.87736071240977, 456.8954090667193, 406.32118678874025, 431.83267868276795, 431.6168652575557, 411.78003453320457, 427.4001561014445]
R2 scores  --> [0.6209046802728011, 0.6154240940580742, 0.6208640117971396, 0.6527645371270163, 0.5922650378010235, 0.6772353407795701, 0.6460209018659779, 0.6407897583464008, 0.6469030046618417, 0.6745624093203186]


<span style='color:None'>**Min RMSE = 404.87736071240977**</span>

<span style='color:None'>**MAX R2 = 0.6772353407795701**</span>



**Training Offtime**

**Testing both the models on unseen data**

In [153]:
predicted = []
ground_truth = []

LRpredicted = []
LRground_truth = []

#With Peaktime
print("Evaluation for Model With Time 1")
X, y = prepareData(test1)
LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelWithTime, X, y)
y_test_predict, y1 = evaluateCVModel(finalModelWithTime, X, y)
predicted.extend(y_test_predict)
ground_truth.extend(y1)
LRpredicted.extend(LRy_test_predict)
LRground_truth.extend(LRy1)

#With Offtime
print("Evaluation for Model With Time 2")
X, y = prepareData(test2)
LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelWithTime2, X, y)
y_test_predict, y1 = evaluateCVModel(finalModelWithTime2, X, y)
predicted.extend(y_test_predict)
ground_truth.extend(y1)
LRpredicted.extend(LRy_test_predict)
LRground_truth.extend(LRy1)

print("Evaluation for Model With Time 3")
X, y = prepareData(test3)
LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelWithTime3, X, y)
y_test_predict, y1 = evaluateCVModel(finalModelWithTime3, X, y)
predicted.extend(y_test_predict)
ground_truth.extend(y1)
LRpredicted.extend(LRy_test_predict)
LRground_truth.extend(LRy1)

print("Evaluation for Model With Time 4")
X, y = prepareData(test4)
LRy_test_predict, LRy1 = evaluateCVModel(LRfinalModelWithTime4, X, y)
y_test_predict, y1 = evaluateCVModel(finalModelWithTime4, X, y)
predicted.extend(y_test_predict)
ground_truth.extend(y1)
LRpredicted.extend(LRy_test_predict)
LRground_truth.extend(LRy1)

#Metrics On full testset
printmd("**Final Results on complete testset**")

print('Linear Models Best output')
getFinalMetrics(LRpredicted, LRground_truth)

print("Overall Best Model")
getFinalMetrics(predicted, ground_truth)

Evaluation for Model With Time 1


<span style='color:None'>**Model : RidgeCV,  RMSE = 384.68550346136016, R2 =0.5921210321541444**</span>

<span style='color:None'>**Model : RidgeCV,  RMSE = 384.68550346136016, R2 =0.5921210321541444**</span>

Evaluation for Model With Time 2


<span style='color:None'>**Model : LinearRegression,  RMSE = 248.5036411352101, R2 =0.5801771079200388**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 220.8583524390809, R2 =0.6683895108400838**</span>

Evaluation for Model With Time 3


<span style='color:None'>**Model : RidgeCV,  RMSE = 368.68386538746984, R2 =0.5860913441968791**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 272.3673113394526, R2 =0.7741052114043875**</span>

Evaluation for Model With Time 4


<span style='color:None'>**Model : Lasso,  RMSE = 405.2924709527108, R2 =0.7351829006651227**</span>

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 306.7556017219853, R2 =0.8482970742733619**</span>

<span style='color:None'>**Final Results on complete testset**</span>

Linear Models Best output


<span style='color:None'>**RMSE = 379.9509325371697, R2 =0.6891959971474123**</span>

Overall Best Model


<span style='color:None'>**RMSE = 306.5174704336199, R2 =0.7977250792884576**</span>

# Additional Experiments

**Creating a custom Imputer**

In [154]:
class CustomImputerForTemperature(BaseEstimator, TransformerMixin):
    '''
    Class used for imputing missing values in a temperature column using KNNRegressor model, which was found out to be the best*
    
    group_cols : list
        List of columns used for training the regressor model
    target : str
        The name of the column to impute

    Returns
    -------
    X : array-like
        The array with imputed values in the target column
    '''
    def __init__(self, testing=0):       
        self.group_cols = ["Dew point temperature(C)", "Solar Radiation (MJ/m2)", "Snowfall (cm)", "Humidity(%)", "Rainfall(mm)", "Hour_sin", "Hour_cos"]
        self.target = "Temperature(C)"
        self.regressorImputer = KNeighborsRegressor()
        self.testing = testing
    
    def fit(self, X, y=None):
        
        assert pd.isnull(X[self.group_cols]).any(axis=None) == False, 'There are missing values in group_cols'

        for col in self.group_cols:
          if col not in X.columns:
            print("Error: Required Column Not found")
            break

        dfWithTemp = X.loc[X[self.target].notnull()].copy()

        X_train = dfWithTemp[self.group_cols]
        Y_train = dfWithTemp[self.target]

        self.regressorImputer.fit(X_train, Y_train)
        self.impute_regressor_ = self.regressorImputer
        
        return self 
    
    def transform(self, X, y=None):
        
        # make sure that the imputer was fitted
        check_is_fitted(self, 'impute_regressor_')
        
        X = X.copy()
        dfWithTemp = X.loc[X[self.target].notnull()]
        dfPrediction = X.loc[X[self.target].isna()]

        if self.testing == 1:#Required to Check the accuracy of imputer regressor
          X['Temperature(C)'] = self.regressorImputer.predict(X[self.group_cols])
          return X

        if len(dfPrediction) > 0:
          dfPrediction[self.target] = self.regressorImputer.predict(dfPrediction[self.group_cols])
        else:
            return X

        X = pd.concat([dfWithTemp, dfPrediction])
        return X

**Validation and Verification of Custom Imputer**

In [155]:
def customImputerTesting(dataf):
  temperatureImputer = CustomImputerForTemperature()
  data, y = prepareData(dataf)
  A = data['Temperature(C)']
  data = temperatureImputer.fit_transform(data)
  B = data['Temperature(C)']

  #Value to Value comparison
  nanCount = 0
  for i in range(len(A)):
    if np.isnan(A[i]):
      nanCount += 1
      continue
    elif A[i] != B[i]:
      print("Mismatched data, after imputation..")
      break

  uncommon_items_fromA = set(A) - set(B)
  for elem in uncommon_items_fromA:
    if not np.isnan(elem):
      print("Error in Imputation")
      break

  printmd("**No Errors in Custom Imputation, Total : " + str(len(uncommon_items_fromA)) + " Values imputed**")
customImputerTesting(df)

<span style='color:None'>**No Errors in Custom Imputation, Total : 500 Values imputed**</span>

In [156]:
#performance of imputer
X, y = prepareData(df)
dfWithTemp = X.loc[X["Temperature(C)"].notnull()]
dfTempTrain=dfWithTemp.sample(frac=0.8, random_state=42) #random state is a seed value
dfTempTest=dfWithTemp.drop(dfTempTrain.index)

GlobalImputer = CustomImputerForTemperature(True)
GlobalImputer.fit(dfTempTrain)
out = GlobalImputer.transform(dfTempTest)

y_test_predict = out["Temperature(C)"]
y1 = dfTempTest["Temperature(C)"]
print("Evaluation on Test Data (Temperature Imputation )")
getFinalMetrics(y1, y_test_predict, GlobalImputer.regressorImputer)

Evaluation on Test Data (Temperature Imputation )


<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 0.5022672566417514, R2 =0.9982201920189937**</span>

**Stratified Sampling**

In [157]:
stratbikedata = df_new.copy()

stratbikedata["hour_cat"] = pd.cut(stratbikedata["Hour"], bins=[-1, 6, 12, 18, 23], labels=[1,2,3,4]) #when min was given as 0, issue, 0 was classified as Nan

trainDf, testDf = train_test_split(stratbikedata, test_size=0.2, random_state=42, stratify=stratbikedata["hour_cat"])

trainDf.reset_index(drop=True, inplace=True)
testDf.reset_index(drop=True, inplace=True)
X, y = prepareData(trainDf)
soFarBestMSE = sys.maxsize
finalModelAllData = []
LRsoFarBestMSE = sys.maxsize
LRfinalModelAllData = []

for model in models:
  bestModelMSE, imputer, metric = runCV(10, model, X, y)
  if metric < soFarBestMSE:
    finalModelAllData = [bestModelMSE, imputer]
    soFarBestMSE = metric
  if metric < LRsoFarBestMSE:
    if type(model()).__name__ in LinearRegressionModels:
      LRsoFarBestMSE = metric
      LRfinalModelAllData = [bestModelMSE, imputer]

Model --> LinearRegression
RMSE scores --> [439.81112470064807, 418.8871874120276, 460.02970512380494, 429.62504174539623, 427.0845246498865, 431.6634155518132, 404.55270391162185, 432.60638338061835, 425.7176490714148, 450.7261817984651]
R2 scores  --> [0.5476591392710017, 0.5342747364660545, 0.5178854060496993, 0.529654212255495, 0.5475734379963586, 0.5347038652647174, 0.5840627012444755, 0.5219423592305672, 0.5235923788986447, 0.5234914174368948]


<span style='color:None'>**Min RMSE = 404.55270391162185**</span>

<span style='color:None'>**MAX R2 = 0.5840627012444755**</span>

Model --> RidgeCV
RMSE scores --> [439.67543130893915, 418.81337402137837, 460.10341772890126, 429.5335705346812, 427.0197295661073, 431.73710640351896, 404.6442457579303, 432.4174587799555, 425.61246858905594, 450.93301808721844]
R2 scores  --> [0.5479382145081986, 0.5344388557469893, 0.5177308909413101, 0.5298544730153142, 0.5477107072792979, 0.5345449868703487, 0.5838744440617967, 0.5223598155192315, 0.5238277583497923, 0.5230539816500805]


<span style='color:None'>**Min RMSE = 404.6442457579303**</span>

<span style='color:None'>**MAX R2 = 0.5838744440617967**</span>

Model --> Lasso
RMSE scores --> [439.437333125508, 419.09157182626905, 460.59000804552204, 429.0959643273159, 427.0015252262115, 431.6481505148414, 404.74682754123586, 432.6841660755992, 425.8387925823428, 451.0127866667489]
R2 scores  --> [0.5484276935118232, 0.5338201500763343, 0.5167102874954377, 0.5308119478785502, 0.5477492696779007, 0.5347367734837378, 0.5836634324739383, 0.5217704340996243, 0.5233212043134662, 0.5228852263364726]


<span style='color:None'>**Min RMSE = 404.74682754123586**</span>

<span style='color:None'>**MAX R2 = 0.5836634324739383**</span>

Model --> KNeighborsRegressor
RMSE scores --> [276.3389196555656, 266.59346429882214, 294.1197987797435, 276.1720179693672, 274.39124497871654, 287.4778192274821, 286.3755454617527, 291.3744684352425, 283.1589158052418, 269.68170391884473]
R2 scores  --> [0.821426294642963, 0.8113596297591384, 0.8029270368596297, 0.8056442333548365, 0.8132498649160858, 0.793629487526033, 0.7915751443425857, 0.7831310356711798, 0.7892362721649793, 0.8294121396141482]


<span style='color:None'>**Min RMSE = 266.59346429882214**</span>

<span style='color:None'>**MAX R2 = 0.8294121396141482**</span>

Model --> SGDRegressor
RMSE scores --> [438.8174396834148, 418.86652745815667, 459.2521967525967, 430.2786853532342, 427.81426785370155, 432.1457365501422, 406.69967923095254, 432.7407394228206, 426.4509593549304, 450.4102583404332]
R2 scores  --> [0.5497008182586605, 0.5343206754466207, 0.5195136981279503, 0.5282219289361545, 0.5460260287523029, 0.5336634830241207, 0.5796361990057175, 0.521645369130185, 0.5219497150584655, 0.524159173102542]


<span style='color:None'>**Min RMSE = 406.69967923095254**</span>

<span style='color:None'>**MAX R2 = 0.5796361990057175**</span>

Model --> ElasticNet
RMSE scores --> [482.41046360482, 446.8839192817095, 506.69055453994594, 462.064776353934, 457.559892762768, 467.3079773929718, 446.0345809900278, 461.9678105622849, 456.67491717548285, 489.1186596197332]
R2 scores  --> [0.4557895989756679, 0.4699399194966102, 0.41512353883917164, 0.4559437415645181, 0.4807023905889023, 0.4546876304575036, 0.49439109747344995, 0.45484770108313677, 0.4517865236952421, 0.4388569123149494]


<span style='color:None'>**Min RMSE = 446.0345809900278**</span>

<span style='color:None'>**MAX R2 = 0.49439109747344995**</span>



Testing on Stratified Sampling

In [158]:
X, y = prepareData(testDf)
out = evaluateCVModel(finalModelAllData, X, y)
out = evaluateCVModel(LRfinalModelAllData, X, y)

<span style='color:None'>**Model : KNeighborsRegressor,  RMSE = 300.8225141175441, R2 =0.8051716076015484**</span>

<span style='color:None'>**Model : LinearRegression,  RMSE = 462.80920336140315, R2 =0.5388570778036845**</span>

**Best params for ElasticNet and SGDRegressor Using Grid Search CV**

In [159]:
#scores = GridSCV(df) # Takes time, best values are found out and stored in below list
scores = [{'best_model': ElasticNet(alpha=0.1, l1_ratio=0.99, max_iter=20000), 'best_params': {'alpha': 0.1, 'l1_ratio': 0.99, 'max_iter': 20000}, 'best_score': -203336.15897393713, 'model': 'elasticnet'}, {'best_model': SGDRegressor(alpha=0.1, max_iter=20000, random_state=42), 'best_params': {'alpha': 0.1, 'learning_rate': 'invscaling', 'max_iter': 20000}, 'best_score': -214061.26102864108, 'model': 'SGDRegressor'}]

#for model in scores:
#  models.append(model['best_model'])

**Experiment with XGBoosr**

In [160]:
X, y = prepareData(trainDf)
num_pipeline = Pipeline([('scaler', StandardScaler())])
cat_pipeline = Pipeline([('encoder',OneHotEncoder())])

cat_cols = ['Holiday','Functioning Day']
num_cols = [col for col in list(X) if col not in cat_cols]

full_pipeline = ColumnTransformer([('num', num_pipeline, num_cols),
                                ('cat', cat_pipeline, cat_cols)])

#Data preparation
X_train_prepared = full_pipeline.fit_transform(X)
xgb_r = xg.XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
xgb_r.fit(X_train_prepared, y)

X, y = prepareData(testDf)
out = evaluateCVModel([xgb_r, full_pipeline], X, y)



<span style='color:None'>**Model : XGBRegressor,  RMSE = 223.69385632142476, R2 =0.8922692491047886**</span>

In [161]:
#End