In [1]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import SGDRegressor, LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor 

# Pre-processing

In [2]:
df = pd.read_csv(r'./Data/processed.csv')
X = df.drop(['KWH'], axis=1)
Y = df['KWH']

In [3]:
best_df = SelectKBest(mutual_info_regression, k = 100).fit_transform(X, Y)

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(best_df, Y, test_size=0.3, shuffle = True, random_state = 42)

In [5]:
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Model implementation

In [19]:
rr = Ridge(
    random_state=42,
    solver = 'auto'
)
rf = RandomForestRegressor(
    oob_score = True,  
    random_state = 42,
)
sgd = SGDRegressor( 
    random_state=42,
    penalty='l1', 
)
ls = Lasso(
    random_state = 42 
)
estimators = [rr, rf, sgd, ls]

In [20]:
param_rr = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1]} 
param_rf = {'n_estimators': [5, 50, 100, 200, 500], 'max_depth': [1, 5, 10, 15], 'min_samples_leaf': [1, 5, 10]}
param_sgd = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1], 'loss': ['squared_epsilon_insensitive', 'huber'], 'learning_rate': ['optimal', 'adaptive']}
param_ls = {"selection":["cyclic", "random"], 'alpha': [0.0001, 0.001, 0.01, 0.1, 1]}
params = [param_rr, param_rf, param_sgd, param_ls]

In [21]:
def performingGridSearch(estimators, X_train, Y_train, X_test, Y_test, params):
  scores_train = []
  scores_test = []
 
  for index, estimator in enumerate(estimators):
    model = GridSearchCV(estimator, param_grid = params[index], cv = 3, n_jobs=-1, scoring='r2', verbose = 3)
    model.fit(X_train, Y_train)
    best_params = model.best_params_
    scores_train.append(model.score(X_train, Y_train))
 
    print("\n\n Model: {} \n\n Best params: {} ".format(estimator, best_params))
    scores_test.append(model.score(X_test, Y_test))
    
  return scores_train, scores_test

In [15]:
# scores_train, scores_val = performingGridSearch(estimators, X_train, Y_train, X_test, Y_test, params)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


 Model: Ridge(random_state=42) 

 Best params: {'alpha': 0.01} 
Fitting 3 folds for each of 60 candidates, totalling 180 fits


 Model: RandomForestRegressor(oob_score=True, random_state=42) 

 Best params: {'max_depth': 15, 'min_samples_leaf': 1, 'n_estimators': 500} 
Fitting 3 folds for each of 20 candidates, totalling 60 fits


 Model: SGDRegressor(penalty='l1', random_state=42) 

 Best params: {'alpha': 0.001, 'learning_rate': 'adaptive', 'loss': 'squared_epsilon_insensitive'} 
Fitting 3 folds for each of 10 candidates, totalling 30 fits


 Model: Lasso(random_state=42) 

 Best params: {'alpha': 0.1, 'selection': 'cyclic'} 
  model = cd_fast.enet_coordinate_descent(


In [17]:
estimators_names = ['Ridge', 'RandomForest', 'SGDRegressor', 'Lasso']
models_scores = pd.DataFrame([estimators_names, scores_train, scores_val], columns = None, index = ['Estimators', 'Scores Train', 'Scores Val'])
print(models_scores)

                     0                      1             2         3
Estimators       Ridge  RandomForestRegressor  SGDRegressor     Lasso
Scores Train  0.986648               0.991172      0.984034  0.986648
Scores Val    0.985901               0.957601      0.984046  0.985898


In [10]:
lr = LinearRegression()
rr = Ridge(
    random_state=42,
    solver = 'auto',
    alpha = 0.01
)
rf = RandomForestRegressor(
    oob_score = True,  
    random_state = 42,
    max_depth= 15, 
    min_samples_leaf= 1, 
    n_estimators= 500
)
sgd = SGDRegressor(
    random_state = 42,
    shuffle= True, 
    early_stopping= True,
    validation_fraction= 0.2,
    loss = 'squared_epsilon_insensitive',
    penalty = 'l1',
    alpha = 0.001,
    learning_rate = 'adaptive',
    tol = 0.001, 
)

ls = Lasso(
    random_state = 42,
    alpha = 0.1, 
    selection = 'cyclic' 
)

estimators = [lr, rr, rf, sgd, ls]

In [11]:
r2_score_train = []
r2_score_test = []
mape_score_train = []
mape_score_test = []
for estimator in estimators:
    print(estimator)
    estimator.fit(X_train, Y_train)
    r2_score_train.append(estimator.score(X_train, Y_train))
    y_train_pred = estimator.predict(X_train)
    mape_score_train.append(mean_absolute_percentage_error(Y_train, y_train_pred))

    r2_score_test.append(estimator.score(X_test, Y_test))
    y_test_pred = estimator.predict(X_test)
    mape_score_test.append(mean_absolute_percentage_error(Y_test, y_test_pred))



LinearRegression()
Ridge(alpha=0.01, random_state=42)
RandomForestRegressor(max_depth=15, n_estimators=500, oob_score=True,
                      random_state=42)
SGDRegressor(alpha=0.001, early_stopping=True, learning_rate='adaptive',
             loss='squared_epsilon_insensitive', penalty='l1', random_state=42,
             validation_fraction=0.2)
Lasso(alpha=0.1, random_state=42)
  model = cd_fast.enet_coordinate_descent(


In [13]:
estimators_names = ['LinearRegression','Ridge', 'RandomForest', 'SGDRegressor', 'Lasso']
models_scores = pd.DataFrame([estimators_names, r2_score_train, r2_score_test, mape_score_train, mape_score_test], columns = None, index = ['Estimators', 'r2_score_train', 'r2_score_test', 'mape_score_train', 'mape_score_test'])
models_scores

Unnamed: 0,0,1,2,3,4
Estimators,LinearRegression,Ridge,RandomForest,SGDRegressor,Lasso
r2_score_train,0.986686,0.986648,0.991172,0.949876,0.986648
r2_score_test,0.98581,0.985901,0.957601,0.950134,0.985898
mape_score_train,0.075041,0.075634,0.04118,0.174636,0.074374
mape_score_test,0.065387,0.065376,0.096979,0.126128,0.065236
