# Import Libraries

In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

import xgboost
import catboost
import lightgbm

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [33]:
train_df = pd.read_csv('train.csv')

In [34]:
X = train_df.drop(columns=['yield'])
y = train_df['yield']

std_scaler = StandardScaler()

X = std_scaler.fit_transform(X)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=4003)

In [36]:
def train_with_folds(folds, X_data, y_data, model,X_data_test):    
    kfold = KFold(n_splits=folds, random_state=43, shuffle=True)

    fold_preds = []

    i = 0
    for train_idx, test_idx in kfold.split(X_data, y_data):
        i += 1
        X_train_fold = X_data[train_idx,:]
        X_test_fold = X_data[test_idx,:]
        y_train_fold = y_data.iloc[train_idx]
        y_test_fold = y_data.iloc[test_idx]

        train_model = model
        train_model.fit(X_train_fold, y_train_fold)
        
        test_pred = train_model.predict(X_test_fold)
        mse = mean_absolute_error(test_pred, y_test_fold)

        fold_preds.append(train_model.predict(X_data_test))
        print(f'fold {i} => {mse}')
        
    return fold_preds, train_model


# Random Forest Regressor

In [37]:
rf_model = RandomForestRegressor(max_depth=50, n_jobs=1000, max_features=12, min_samples_leaf=8)

rf_preds,_ = train_with_folds(10, X_train,y_train,rf_model, X_test)

fold 1 => 355.2069857492865
fold 2 => 356.51602805843817
fold 3 => 333.58446382692165
fold 4 => 369.68118743425816
fold 5 => 342.7417572920463
fold 6 => 346.3679378139747
fold 7 => 333.8722748822731
fold 8 => 363.78263269539457
fold 9 => 328.15674230166076
fold 10 => 368.5088051764209


In [38]:
rf_test_preds = np.mean(rf_preds, axis=0)
mean_absolute_error(rf_test_preds, y_test)

362.3312045453497

# Catboost Regressor

In [39]:
cat_reg = catboost.CatBoostRegressor(verbose = 0, learning_rate=0.05)

cat_preds, cat_model = train_with_folds(10, X,y, cat_reg, X_test)

fold 1 => 377.20693386259
fold 2 => 346.89132761593453
fold 3 => 353.164533766479
fold 4 => 370.4169796928056
fold 5 => 360.48467301970936
fold 6 => 369.56707316326214
fold 7 => 337.69892039955613
fold 8 => 350.88343295148235
fold 9 => 325.6438185289466
fold 10 => 357.6538180496972


In [40]:
cat_test_preds = np.mean(cat_preds, axis=0)
mean_absolute_error(np.mean(cat_preds, axis=0), y_test)

322.4577475124318

# lgbm regressor

In [41]:

lgbm_reg = lightgbm.LGBMRegressor(learning_rate=0.02368, n_estimators=300, min_child_weight=0.01, max_depth=12, reg_lambda=0.002)
lgbm_preds, lgbm_model = train_with_folds(10, X,y, lgbm_reg, X_test)

fold 1 => 371.24442112821225
fold 2 => 346.07034420735994
fold 3 => 345.5513429495608
fold 4 => 366.40109570225684
fold 5 => 356.7804957007191
fold 6 => 373.2947603003219
fold 7 => 338.63933440177976
fold 8 => 345.0486575832865
fold 9 => 322.5398456787346
fold 10 => 355.79065794861407


In [42]:
lgbm_test_preds = np.mean(lgbm_preds, axis=0)
mean_absolute_error(lgbm_test_preds, y_test)

336.50962981347834

# Voting Regressor

In [46]:

lbmParam1={
        'objective': 'regression_l1',
        'metric': 'mae',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'n_estimators': 1000,
}

lbmParam2={
            'n_estimators': 250,
            "num_leaves": 57,
            'min_child_samples': 9,
            'learning_rate': 0.043,
            'colsample_bytree': 0.87,
            'reg_alpha': 0.0009765625,
            'reg_lambda': 0.0018511523615235433,
            'objective': 'regression_l1',
            'metric': 'mae',
            'boosting_type': 'gbdt',
}


lbmParam3 = {
     'n_jobs': -1,
     'random_state': 43,
     'objective': 'regression_l1',
     'metric': 'mae',
     'num_leaves': 58,
     'learning_rate': 0.023640239702728376,
     'min_child_samples': 14,
     'colsample_bytree': 0.7006991908929474,
     'n_estimators': 663,
     'reg_alpha': 0.6469675138495181,
     'reg_lambda': 0.14870311599361627}

In [53]:
Regressors = {
    'LGBMRegressor_tune1': lightgbm.LGBMRegressor(**lbmParam1, n_jobs=-1,random_state=43),
    'LGBMRegressor_tune2': lightgbm.LGBMRegressor(**lbmParam2, n_jobs=-1,random_state=43),
    'LGBMRegressor_Optunatune': lightgbm.LGBMRegressor(**lbmParam3),
    'catboost' : catboost.CatBoostRegressor(verbose = 0, learning_rate=0.1)
}
# Create the voting regressor
voting = VotingRegressor(estimators=list(Regressors.items()))

In [54]:
voting_preds, voting_model = train_with_folds(10, X,y, voting, X_test)

fold 1 => 362.41011855896767
fold 2 => 331.3080905416069
fold 3 => 340.9800071891104
fold 4 => 355.83730986396364
fold 5 => 351.62536777975066
fold 6 => 360.9551471697812
fold 7 => 325.6118636313448
fold 8 => 339.80786813994894
fold 9 => 314.0594905308837
fold 10 => 346.17151495021375


In [55]:
voting_test_pred = np.mean(voting_preds, axis=0)
mean_absolute_error(voting_test_pred, y_test)

301.4433878793118