In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd
df = pd.read_csv("E:/EndToEndBlueBerry/Notebook/data/train.csv")
train_data, test_data = train_test_split(df, test_size=0.2, random_state=50)
y_train = train_data['yield']
y_test = test_data["yield"]
x_train = train_data[['clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia','MaxOfUpperTRange', 'RainingDays', 'fruitset', 'fruitmass', 'seeds']]
x_test = test_data[['clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia','MaxOfUpperTRange', 'RainingDays', 'fruitset', 'fruitmass', 'seeds']]         
num_attribs = ['clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia','MaxOfUpperTRange', 'RainingDays', 'fruitset', 'fruitmass', 'seeds']
num_pipeline = Pipeline(steps=[("standardscale", StandardScaler())])
preprocessor = ColumnTransformer([("num_pipeline", num_pipeline, num_attribs)],remainder='passthrough')
x_train_transformed = preprocessor.fit_transform(x_train)
x_test_transformed = preprocessor.fit_transform(x_test)
train_arr = np.c_[x_train_transformed, np.array(y_train)]
test_arr = np.c_[x_test_transformed, np.array(y_test)]
train_x = train_arr[:,:-1]
train_y = train_arr[:,-1]
test_x =test_arr[:,:-1]
test_y = test_arr[:,-1]

In [42]:
import pickle
import os
from sklearn.linear_model import HuberRegressor
def fit_base_models(x,y,models,best_parameters):
    "fit each base model and return a model list"
    fitted_models = list()
    for i in range(len(list(models))):
        print(f"start fitting model {i}")
        model = list(models.values())[i]
        best_param = best_parameters[list(models.keys())[i]]
        try:
            model.set_params(**best_param)
            model.fit(x,y)
        except Exception:
            model.fit(x,y)
        fitted_models.append(model)
    return fitted_models

def get_out_of_fold_predictions(train_x, train_y,models,best_parameters):
    '''
    input model dict
    '''
    kfold = KFold(n_splits=5, shuffle=True)
    meta_x = list()
    meta_y = list()
    for train_ix, valid_ix in kfold.split(train_x):
        print(train_ix)
        print(valid_ix)
        fold_train_x, fold_valid_x = train_x[train_ix], train_x[valid_ix]
        fold_train_y, fold_valid_y = train_y[train_ix], train_y[valid_ix]
        meta_y.extend(fold_valid_y)
        y_hat = []
        model_list = fit_base_models(fold_train_x,fold_train_y,models,best_parameters)
        print(model_list)
        for model in model_list:
            print(2)
            fold_valid_y_hat = model.predict(fold_valid_x)
            print(fold_valid_y_hat)
            y_hat.append(fold_valid_y_hat.reshape(len(fold_valid_y_hat),1)) 
        meta_x.append(np.hstack(y_hat))
    return meta_x, meta_y

def load_object(file_path):
    with open (file_path, 'rb') as file_obj:
        object = pickle.load(file_obj)
        return object

def fit_meta_model(x_meta, y_meta):
    lr_model =  HuberRegressor()
    lr_model.fit(x_meta, y_meta)
    return lr_model 

def super_learner_prediction(x,fitted_models,meta_model):
    meta_x = []
    for model in fitted_models:
        y_pred = model.predict(x)
        meta_x.append(y_pred.reshape(len(y_pred),1))
    meta_x = np.hstack(meta_x)
    return meta_model.predict(meta_x)

def save_object(file_path, obj):
    dir_path = os.path.dirname(file_path)
    os.makedirs(dir_path,exist_ok=True)
    with open(file_path, 'wb') as file_obj:
        pickle.dump(obj, file_obj)


In [8]:

from sklearn.linear_model import ElasticNet 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
models = {
            "ElasticNet": ElasticNet(max_iter=5000),
            "KNN": KNeighborsRegressor(),
            "DecisionTree": DecisionTreeRegressor(),
            "RandomForest": RandomForestRegressor(),
            "AdaBoost": AdaBoostRegressor(),
            "XGBoost": XGBRegressor(),
            "LGBM": LGBMRegressor(),
            "CatBoost": CatBoostRegressor(),
                            }
best_params = load_object("E:/EndtoEndBlueBerry/artifact/params.pkl")
meta_x, meta_y = get_out_of_fold_predictions(train_x, train_y,models,best_params)

[    2     3     5 ... 12227 12229 12230]
[    0     1     4 ... 12221 12226 12228]
start fitting model 0
start fitting model 1
start fitting model 2
start fitting model 3
start fitting model 4
start fitting model 5
start fitting model 6
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001108 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 817
[LightGBM] [Info] Number of data points in the train set: 9784, number of used features: 10
[LightGBM] [Info] Start training from score 6029.678882
start fitting model 7
0:	learn: 1003.9659540	total: 160ms	remaining: 40.7s
1:	learn: 927.6716439	total: 164ms	remaining: 20.8s
2:	learn: 866.7518637	total: 168ms	remaining: 14.1s
3:	learn: 810.7755856	total: 172ms	remaining: 10.8s
4:	learn: 756.5131499	total: 177ms	remaining: 8.88s
5:	learn: 714.0453317	total: 181ms	remaining: 7.55s
6:	learn: 673.857024

In [9]:
from sklearn.metrics import mean_absolute_error
base_train_mae = []
base_test_mae = []
# evaluate the base model
fitted_models = fit_base_models(train_x, train_y, models, best_params)
for model in fitted_models:
    train_y_prediction = model.predict(train_x)
    print(train_y_prediction)
    train_mae = mean_absolute_error(train_y, train_y_prediction)
    test_y_prediction = model.predict(test_x)
    print(test_y_prediction)
    test_mae = mean_absolute_error(test_y, test_y_prediction)
    base_train_mae.append(train_mae)
    base_test_mae.append(test_mae)

start fitting model 0
start fitting model 1
start fitting model 2
start fitting model 3
start fitting model 4
start fitting model 5
start fitting model 6
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 820
[LightGBM] [Info] Number of data points in the train set: 12231, number of used features: 10
[LightGBM] [Info] Start training from score 6027.780843
start fitting model 7
0:	learn: 1007.2920936	total: 5.82ms	remaining: 1.48s
1:	learn: 929.3616143	total: 11.3ms	remaining: 1.44s
2:	learn: 868.3866795	total: 15.6ms	remaining: 1.32s
3:	learn: 814.2437903	total: 20.1ms	remaining: 1.27s
4:	learn: 759.9780286	total: 25.4ms	remaining: 1.28s
5:	learn: 714.5717722	total: 31.2ms	remaining: 1.3s
6:	learn: 671.7736375	total: 35.5ms	remaining: 1.26s
7:	learn: 635.1448591	total: 40.5ms	remaining: 1.26s
8:	learn: 602.1011976	total: 45.3ms	remaining: 1.24s
9:	

In [10]:
base_train_mae

[406.4451452136159,
 392.32353211138366,
 350.790743295724,
 136.86737800370236,
 397.92608542489114,
 304.5387754514751,
 328.4489547746227,
 319.756319054831]

In [11]:
base_test_mae

[418.8708206846694,
 417.258912823087,
 367.85399792511447,
 387.77998352316826,
 414.35535936580925,
 368.23300790201927,
 366.349925185466,
 354.8277026516179]

In [43]:
x = np.concatenate(meta_x, axis= 0)
meta_model  =  fit_meta_model(x, meta_y) 
train_y_prediction = super_learner_prediction(train_x, fitted_models,meta_model) 
test_y_prediction = super_learner_prediction(test_x, fitted_models, meta_model)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [44]:
test_mae = mean_absolute_error(test_y, test_y_prediction)
train_mae = mean_absolute_error(train_y, train_y_prediction)



In [45]:
test_mae

354.1968327633804

In [46]:
train_mae

290.0695528095724

In [49]:
for i in range(len(list(models.keys()))):
    print(f"{list(models.keys())[i]}: train_MAE = {base_train_mae[i]}, test_MAE = {base_test_mae[i]}")
print(f"Super Linear (Stacking) with Huber Loss :train_MAE = {train_mae}, test_MAE = {test_mae}")

ElasticNet: train_MAE = 406.4451452136159, test_MAE = 418.8708206846694
KNN: train_MAE = 392.32353211138366, test_MAE = 417.258912823087
DecisionTree: train_MAE = 350.790743295724, test_MAE = 367.85399792511447
RandomForest: train_MAE = 136.86737800370236, test_MAE = 387.77998352316826
AdaBoost: train_MAE = 397.92608542489114, test_MAE = 414.35535936580925
XGBoost: train_MAE = 304.5387754514751, test_MAE = 368.23300790201927
LGBM: train_MAE = 328.4489547746227, test_MAE = 366.349925185466
CatBoost: train_MAE = 319.756319054831, test_MAE = 354.8277026516179
Super Linear (Stacking) with Huber Loss :train_MAE = 290.0695528095724, test_MAE = 354.1968327633804


In [48]:
final_model = {"base_models": fitted_models, "meta_model": meta_model}
with open("E:/EndtoEndBlueBerry/artifact/model.pkl", 'wb') as file_object:
    pickle.dump(final_model,file_object)