In [24]:
#Basic packages
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, KFold

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA, NMF
from sklearn.manifold import TSNE
from umap import UMAP
from scipy.cluster.hierarchy import dendrogram, ward


import optuna
from optuna.integration import CatBoostPruningCallback

import warnings
from sklearn.exceptions import DataConversionWarning
# Silence sklearn's DataConversionWarning. A single registration is enough;
# repeating the identical filter call adds nothing.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# Plot styling: white seaborn theme with the viridis palette.
# `pal` keeps the palette colours available for later plots.
sns.set_theme(style = 'white', palette = 'viridis')
pal = sns.color_palette('viridis')

# Let pandas display up to 100 rows before truncating a frame.
pd.set_option('display.max_rows', 100)

In [25]:
#Models

import xgboost as xgb
import lightgbm as lgb
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor, LassoCV
from sklearn.linear_model import PassiveAggressiveRegressor, ARDRegression, RidgeCV, ElasticNetCV
from sklearn.linear_model import TheilSenRegressor, RANSACRegressor, HuberRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor

from sklearn.cross_decomposition import PLSRegression
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
from catboost import Pool

#Metrics

from sklearn.metrics import mean_absolute_error

In [26]:
# Input locations
#
# Competition files live together in `data_dir`; the original dataset is a
# separate input and is referenced by its full path.
data_dir = "/kaggle/input/playground-series-s3e14"
train_file, test_file = "train.csv", "test.csv"
submission_file = "sample_submission.csv"
orig_file = (
    "/kaggle/input/wild-blueberry-yield-prediction-dataset/"
    "WildBlueberryPollinationSimulationData.csv"
)

In [27]:
# test_data = Path(data_dir)/test_file
# train_data = Path(data_dir)/train_file
# submission_data = Path(data_dir)/submission_file

In [28]:
# train = pd.read_csv(train_data)
# test = pd.read_csv(test_data)
# submission_df = pd.read_csv(submission_data)

In [29]:
def get_data(data_dir = data_dir, train_file = train_file, test_file = test_file,
             submission_file = submission_file, orig_file = orig_file):
    """Read the competition CSVs and the original dataset into DataFrames.

    Parameters
    ----------
    data_dir : str or Path
        Directory that contains the train, test and submission files.
    train_file, test_file, submission_file : str
        File names inside `data_dir`.
    orig_file : str or Path
        Full path of the original dataset CSV. Previously this was read from
        the module-level global only; it is now a parameter (same default) so
        every input location can be overridden in one place.

    Returns
    -------
    tuple of pandas.DataFrame
        `(train, test, submission_df, orig_train)`. `train` and `test` are
        indexed by their "id" column; the other two keep the default index.
    """
    base_dir = Path(data_dir)
    train = pd.read_csv(base_dir / train_file, index_col = "id")
    test = pd.read_csv(base_dir / test_file, index_col = "id")
    submission_df = pd.read_csv(base_dir / submission_file)
    orig_train = pd.read_csv(Path(orig_file))
    return train, test, submission_df, orig_train
    

In [30]:
# Load train, test, sample submission and the original dataset using get_data's default paths.
train, test, submission_df, orig_train = get_data()

In [31]:
# Print column dtypes and non-null counts for the competition training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15289 entries, 0 to 15288
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   clonesize             15289 non-null  float64
 1   honeybee              15289 non-null  float64
 2   bumbles               15289 non-null  float64
 3   andrena               15289 non-null  float64
 4   osmia                 15289 non-null  float64
 5   MaxOfUpperTRange      15289 non-null  float64
 6   MinOfUpperTRange      15289 non-null  float64
 7   AverageOfUpperTRange  15289 non-null  float64
 8   MaxOfLowerTRange      15289 non-null  float64
 9   MinOfLowerTRange      15289 non-null  float64
 10  AverageOfLowerTRange  15289 non-null  float64
 11  RainingDays           15289 non-null  float64
 12  AverageRainingDays    15289 non-null  float64
 13  fruitset              15289 non-null  float64
 14  fruitmass             15289 non-null  float64
 15  seeds              

In [32]:
def create_EDA_summary(df = None):
    """Build a one-row-per-column overview of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Required; the `None` default is kept only for
        signature compatibility.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of `df`, with the columns:
        - "dtype": dtype of each column
        - "NROW": number of rows in `df` (same value on every line)
        - "Unique_values": count of distinct values, with NaN counted as a value
        - "Percent_missing": share of null entries, in percent

    Raises
    ------
    TypeError
        If `df` is None.
    """
    if df is None:
        # Fail with a clear message instead of an AttributeError on NoneType.
        raise TypeError("create_EDA_summary() requires a pandas DataFrame, got None")

    n_rows = df.shape[0]
    return pd.DataFrame(
        {
            "dtype": df.dtypes,
            "NROW": n_rows,
            # dropna=False matches len(Series.unique()), which keeps NaN as a value.
            "Unique_values": df.nunique(dropna = False),
            "Percent_missing": df.isnull().sum() / n_rows * 100,
        }
    )

In [33]:
# Per-column dtype, row count, distinct-value count and missing percentage for the competition training frame.
create_EDA_summary(df = train)

Unnamed: 0,dtype,NROW,Unique_values,Percent_missing
clonesize,float64,15289,6,0.0
honeybee,float64,15289,7,0.0
bumbles,float64,15289,11,0.0
andrena,float64,15289,16,0.0
osmia,float64,15289,14,0.0
MaxOfUpperTRange,float64,15289,6,0.0
MinOfUpperTRange,float64,15289,5,0.0
AverageOfUpperTRange,float64,15289,5,0.0
MaxOfLowerTRange,float64,15289,6,0.0
MinOfLowerTRange,float64,15289,7,0.0


In [34]:
# Same per-column summary for the original dataset, for comparison with the competition frame.
create_EDA_summary(df = orig_train)

Unnamed: 0,dtype,NROW,Unique_values,Percent_missing
Row#,int64,777,777,0.0
clonesize,float64,777,6,0.0
honeybee,float64,777,7,0.0
bumbles,float64,777,10,0.0
andrena,float64,777,12,0.0
osmia,float64,777,12,0.0
MaxOfUpperTRange,float64,777,5,0.0
MinOfUpperTRange,float64,777,5,0.0
AverageOfUpperTRange,float64,777,5,0.0
MaxOfLowerTRange,float64,777,5,0.0


In [35]:
#cat_columns = test.iloc[:,:-3].columns.to_list()
#num_colums = test.iloc[:,-3:].columns.to_list()

In [36]:
# Remove the "Row#" column from the original dataset.
# errors="ignore" keeps the cell re-runnable: a second run finds the column
# already gone and would otherwise raise KeyError.
orig_train = orig_train.drop(columns = ["Row#"], errors = "ignore")


In [37]:
# Column the models are trained to predict
target = "yield"
# Stack the competition rows on top of the original dataset rows and
# renumber the combined frame from 0
train = pd.concat([train, orig_train], ignore_index = True)

In [38]:
# Feature engineering applied to both train and test:
# add "fruit_seed", the element-wise product of "fruitset" and "seeds".
for df in (train, test):
    df["fruit_seed"] = df["fruitset"].mul(df["seeds"])

In [39]:
# Split the combined training frame into the feature matrix and the target vector
train_y = train[target].reset_index(drop = True)
train_X = train.drop(columns = [target]).reset_index(drop = True)

In [40]:
# Every test column is handled as a numeric feature by the preprocessing pipeline.
# This includes "fruit_seed", which the feature engineering loop added to `test`.
num_list = list(test.columns)

In [None]:
# Preprocessing used when fitting models:
# standard-scale every column named in `num_list`.
numeric_transformer = Pipeline(steps = [('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers = [('num', numeric_transformer, num_list)]
)


In [None]:
# Cross-validation strategy: shuffled K-fold with a fixed seed
seed = 42
splits = 5
# Alternative kept for reference:
# RepeatedStratifiedKFold(n_splits = splits, n_repeats = 5, random_state = seed)
cv = KFold(n_splits = splits, shuffle = True, random_state = seed)



In [None]:
#Define hyperparameter search spaces for Optuna
#
# Each search space is a function of an Optuna `trial`. The `trial.suggest_*`
# calls can only run while a study is optimising, so building the dicts at
# module level fails on a fresh kernel (no live `trial` exists there).
# Call these from inside an objective, e.g.
#     model = lgb.LGBMRegressor(**suggest_lgb_params(trial))
#
# Optuna parameter names are kept equal to the model keyword they feed, so the
# sampled values in `trial.params` line up with the estimator arguments.


def suggest_xgb_params(trial):
    """Return an XGBoost regressor parameter dict sampled from `trial`."""
    return {
        #'tree_method':'gpu_hist',  # uncomment to train on the GPU
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 10000,
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        'random_state': trial.suggest_categorical('random_state', [2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }


def suggest_cat_params(trial):
    """Return a CatBoost regressor parameter dict sampled from `trial`."""
    return {
        'loss_function': 'RMSE',
        #'task_type': 'GPU',
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'max_bin': trial.suggest_int('max_bin', 200, 400),
        #'rsm': trial.suggest_float('rsm', 0.3, 1.0),
        # Was registered with Optuna as 'bagging_fraction'; renamed to match the key.
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.006, 0.018),
        'n_estimators':  25000,
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15]),
        'random_state': trial.suggest_categorical('random_state', [2020]),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
    }


def suggest_lgb_params(trial):
    """Return a LightGBM regressor parameter dict sampled from `trial`."""
    return {
        'metric': 'rmse',
        'random_state': 48,
        'n_estimators': 20000,
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        # LightGBM requires num_leaves > 1, so the lower bound is 2 rather than 1.
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # Was registered with Optuna as 'min_data_per_groups'; renamed to match the key.
        'cat_smooth' : trial.suggest_int('cat_smooth', 1, 100),
    }


def suggest_rf_params(trial):
    """Return a random-forest regressor parameter dict sampled from `trial`."""
    return {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 4, 50),
        # scikit-learn rejects an integer min_samples_split below 2.
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60),
    }

In [None]:
#Candidate models, keyed by short name; each uses its library's default settings.
#
# Disabled candidates from earlier experiments (not instantiated here):
# tuned XGBoost / LightGBM / CatBoost variants, RidgeCV, LassoCV,
# RandomForestRegressor, PLSRegression, PassiveAggressiveRegressor,
# GradientBoostingRegressor, HistGradientBoostingRegressor, ARDRegression,
# HuberRegressor and KNeighborsRegressor.
models = dict(
    xgb = xgb.XGBRegressor(),
    lgb = lgb.LGBMRegressor(),
    cat = CatBoostRegressor(),
)

In [None]:
#XGB Optuna
def objective(trial):
    """Optuna objective: hold-out MAE of an XGBoost regressor for one trial.

    Samples a parameter set from `trial`, fits the preprocessing + XGBoost
    pipeline on 75% of `train_X` / `train_y`, and scores it on the other 25%.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean absolute error on the hold-out split (lower is better).
    """
    xgb_params = {
        'tree_method':'gpu_hist',  # trains on the GPU; needs a GPU-enabled runtime
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 10000,
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        'random_state': trial.suggest_categorical('random_state', [2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }

    # Fixed split seed (the notebook-level `seed`) so every trial is scored on
    # the same hold-out rows. Without it, differences between trials mix
    # parameter quality with split luck.
    train_x, valid_x, y_train, y_valid = train_test_split(
        train_X, train_y, test_size=0.25, random_state=seed
    )

    # Fit and predict through the pipeline so the preprocessor is actually
    # applied. It is fitted on the training split only.
    pipe = make_pipeline(preprocessor, xgb.XGBRegressor(**xgb_params))
    pipe.fit(train_x, y_train)
    val_pred = pipe.predict(valid_x)
    return mean_absolute_error(y_valid, val_pred)

In [None]:
# Run the XGBoost search: minimise the objective, capped at 20 trials and a 600-second timeout
study = optuna.create_study(direction = "minimize")
study.optimize(objective, n_trials = 20, timeout = 600)

print(f"Number of finished trials: {len(study.trials)}")

# Report the best trial's score and the parameters that produced it
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
#LGBM Optuna