In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training data and display its (rows, columns) shape.
train_csv = "data/train.csv"
train = pd.read_csv(train_csv)
train.shape

(15289, 18)

In [3]:
# Define the feature matrix X and the target y.
# Columns are selected by name instead of by position (previously iloc[:, 1:-1])
# so the split no longer depends on the column order of the CSV; this mirrors
# how `id` is dropped by name from the test set.
# NOTE(review): assumes the first column of train.csv is named 'id' — confirm.
target_col = 'yield'
X = train.drop(columns=['id', target_col])
y = train[target_col]

In [4]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

# Columns sent through StandardScaler in every preprocessor variant.
num_cols = ['fruitset', 'fruitmass', 'seeds']

# Columns that are ordinal-encoded in every preprocessor variant.
oe_cols = [
    'clonesize',
    'honeybee',
    'bumbles',
    'andrena',
    'osmia',
    'AverageOfLowerTRange',
    'AverageRainingDays',
]

# Temperature-range columns: encoded by `preprocessor`, dropped by
# `preprocessor1` and `preprocessor2`.
oe_cols_drop = [
    'MaxOfUpperTRange',
    'MinOfUpperTRange',
    'AverageOfUpperTRange',
    'MaxOfLowerTRange',
    'MinOfLowerTRange',
]

# Encoded by `preprocessor` and `preprocessor1`, dropped by `preprocessor2`.
oe_cols_drop2 = ['RainingDays']

scaler = StandardScaler()
# Values not seen during fit are encoded as -1 at transform time.
oe = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')


def _build_preprocessor(*trailing_steps):
    """Return a ColumnTransformer with the common steps plus ``trailing_steps``.

    The scaler on ``num_cols`` and the encoder on ``oe_cols`` always come
    first; ``trailing_steps`` are appended in the order given.
    """
    return make_column_transformer(
        (scaler, num_cols),
        (oe, oe_cols),
        *trailing_steps,
    )


# Variant 0: keep everything (all optional columns are ordinal-encoded).
preprocessor = _build_preprocessor(
    (oe, oe_cols_drop),
    (oe, oe_cols_drop2),
)

# Variant 1: keep RainingDays, drop the temperature-range columns.
preprocessor1 = _build_preprocessor(
    (oe, oe_cols_drop2),
    ('drop', oe_cols_drop),
)

# Variant 2: drop both RainingDays and the temperature-range columns.
preprocessor2 = _build_preprocessor(
    ('drop', oe_cols_drop2),
    ('drop', oe_cols_drop),
)

In [5]:
from catboost import CatBoostRegressor
import lightgbm as lgb
from sklearn.ensemble import VotingRegressor,HistGradientBoostingRegressor

# The three regressors share one seed and are each configured with an
# absolute-error (MAE) objective/loss.
seed = 318

# objective must be set to 'mae'
lgbm = lgb.LGBMRegressor(objective='mae', random_state=seed)
# verbose=0: do not show the training progress
cb = CatBoostRegressor(objective='MAE', random_state=seed, verbose=0)
hgbr = HistGradientBoostingRegressor(loss='absolute_error', random_state=seed)

# Voting ensemble over the three regressors (score noted by the author: 342.73).
vc = VotingRegressor(estimators=[('lgbm', lgbm), ('cb', cb), ('hgbr', hgbr)])

In [6]:
#make pipeline
from sklearn.pipeline import make_pipeline
from sklearn.base import clone


def _fresh_pipeline(prep, model):
    """Build a pipeline from independent, unfitted copies of its two steps.

    ``make_pipeline`` stores the objects it is given without copying them, so
    pipelines built from the same preprocessor/estimator instance would share
    fitted state: fitting one would silently refit a step of another.
    Cloning gives every pipeline its own steps with identical parameters.
    """
    return make_pipeline(clone(prep), clone(model))


# Trailing comments record the scores the author noted for each final
# estimator tried with that preprocessor.
pipea = _fresh_pipeline(preprocessor, hgbr)   # lgbm:344.59, cb:344.11 , vc:342.67, hgbr:345.30
pipeb = _fresh_pipeline(preprocessor1, hgbr)  # lgbm:344.05, cb:344.73, vc:342.73, hgbr:344.70
pipec = _fresh_pipeline(preprocessor2, hgbr)  # lgbm:344.56, cb:344.81, vc:343.16, hgbr:345.23

# Pipelines used for the final predictions.
pipe = _fresh_pipeline(preprocessor, cb)      # 343.32
pipe1 = _fresh_pipeline(preprocessor1, lgbm)
pipe2 = _fresh_pipeline(preprocessor1, hgbr)  # same configuration as pipeb
# Alternative considered for pipe2: preprocessor + vc (the voting ensemble).

In [7]:
# Inspect the steps of `pipe` (built from `preprocessor` and the CatBoost model).
pipe.steps

[('columntransformer',
  ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                   ['fruitset', 'fruitmass', 'seeds']),
                                  ('ordinalencoder-1',
                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                  unknown_value=-1),
                                   ['clonesize', 'honeybee', 'bumbles', 'andrena',
                                    'osmia', 'AverageOfLowerTRange',
                                    'AverageRainingDays']),
                                  ('ordinalencoder-2',
                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                  unknown_value=-1),
                                   ['MaxOfUpperTRange', 'MinOfUpperTRange',
                                    'AverageOfUpperTRange', 'MaxOfLowerTRange',
                                    'MinOfLowe

In [8]:
# Inspect the steps of `pipe1` (built from `preprocessor1` and the LightGBM model).
pipe1.steps

[('columntransformer',
  ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                   ['fruitset', 'fruitmass', 'seeds']),
                                  ('ordinalencoder-1',
                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                  unknown_value=-1),
                                   ['clonesize', 'honeybee', 'bumbles', 'andrena',
                                    'osmia', 'AverageOfLowerTRange',
                                    'AverageRainingDays']),
                                  ('ordinalencoder-2',
                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                  unknown_value=-1),
                                   ['RainingDays']),
                                  ('drop', 'drop',
                                   ['MaxOfUpperTRange', 'MinOfUpperTRange',
                      

In [9]:
#cv score baseline
from sklearn.model_selection import cross_val_score
# Mean cross-validated negative MAE for the three preprocessing variants,
# printed in the order pipea, pipeb, pipec.
for candidate in (pipea, pipeb, pipec):
    fold_scores = cross_val_score(candidate, X, y, scoring='neg_mean_absolute_error')
    print(fold_scores.mean())

-345.3072950841423
-344.70799042274683
-345.2302186178372


In [10]:
# Mean cross-validated negative MAE of `pipe` (preprocessor + CatBoost).
cv_scores = cross_val_score(pipe, X, y, scoring='neg_mean_absolute_error')
print(cv_scores.mean())

-343.3207824857909


In [11]:
# Mean cross-validated negative MAE of `pipe1` (preprocessor1 + LightGBM).
cv_scores = cross_val_score(pipe1, X, y, scoring='neg_mean_absolute_error')
print(cv_scores.mean())

-344.05706659966364


In [12]:
# Mean cross-validated negative MAE of `pipe2` (preprocessor1 +
# HistGradientBoosting, i.e. the same configuration as `pipeb`).
cv_scores = cross_val_score(pipe2, X, y, scoring='neg_mean_absolute_error')
print(cv_scores.mean())

-344.70799042274683


In [13]:
# Fit the three final pipelines on the full training set.
for estimator in (pipe, pipe1):
    estimator.fit(X, y)
# Kept as the last expression so the fitted pipeline's repr is displayed.
pipe2.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['fruitset', 'fruitmass',
                                                   'seeds']),
                                                 ('ordinalencoder-1',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=-1),
                                                  ['clonesize', 'honeybee',
                                                   'bumbles', 'andrena',
                                                   'osmia',
                                                   'AverageOfLowerTRange',
                                                   'AverageRainingDays']),
                                                 ('ordinalencoder-2',
    

In [14]:
# Load the test set. `id` is removed from the model input but stays in `test`,
# where it is later used to index the submission file.
test_csv = 'data/test.csv'
test = pd.read_csv(test_csv)
X_test = test.drop('id', axis=1)
X_test.head()

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds
0,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.399367,0.408088,31.394569
1,12.5,0.25,0.25,0.75,0.63,94.6,57.2,79.0,68.2,33.0,55.9,1.0,0.1,0.488048,0.442866,36.846956
2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.583379,0.487057,40.037644
3,25.0,0.5,0.38,0.38,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.433014,0.422847,33.116091
4,37.5,0.75,0.25,0.25,0.25,94.6,57.2,79.0,68.2,33.0,55.9,24.0,0.39,0.360996,0.38886,29.558019


In [15]:
# Test-set predictions from `pipe` (preprocessor + CatBoost).
r1 = pipe.predict(X_test)

In [16]:
# Display the predictions from `pipe`.
r1

array([4236.68473571, 5890.65744798, 7236.79945553, ..., 6498.0249337 ,
       4449.66627663, 7280.9382808 ])

In [17]:
# Test-set predictions from `pipe1` (preprocessor1 + LightGBM).
r2 = pipe1.predict(X_test)

In [18]:
# Display the predictions from `pipe1`.
r2

array([4305.05727836, 5885.87252203, 7213.38327699, ..., 6503.64137909,
       4374.19795967, 7254.88629305])

In [19]:
# Test-set predictions from `pipe2` (preprocessor1 + HistGradientBoosting).
r3 = pipe2.predict(X_test)

In [20]:
# Display the predictions from `pipe2`.
r3

array([4295.0508749 , 5907.22072143, 7252.36339381, ..., 6518.24749303,
       4380.81715391, 7282.02389802])

In [21]:
# Simple model ensembling: weighted average of the three prediction arrays.
# Weights are listed in the order (r1: CatBoost, r2: LightGBM, r3: HistGB).
# np.average divides by the sum of the weights, so the normalisation stays
# correct when a weight is changed (the previous form hard-coded "/5").
ensemble_weights = [2, 2, 1]
result = np.average([r1, r2, r3], axis=0, weights=ensemble_weights)

In [22]:
# Display the ensembled predictions.
result

array([4275.70698061, 5892.05613229, 7230.54577177, ..., 6504.31602372,
       4405.7091253 , 7270.73460914])

In [23]:
# Write the submission file: one row per test id with the ensembled yield.
submission = pd.DataFrame({'id': test['id'], 'yield': result}).set_index('id')
submission.to_csv('catboost_lgbm_hgbr_ver1.csv')

- Score of this submission (presumably the leaderboard MAE; confirm): 341.8444