In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training data; the trailing expression displays its (rows, columns) shape.
train = pd.read_csv("data/train.csv")
train.shape

(15289, 18)

In [3]:
# Define the feature matrix X and the target vector y.
# X is selected positionally: every column except the first and the last one
# (presumably `id` and `yield` -- verify against the CSV column order).
X = train.iloc[:,1:-1]
y = train['yield']

In [4]:
# Build three alternative preprocessing ColumnTransformers. They share the same
# scaling / encoding of the core columns and differ only in whether the two
# "drop" column groups below are ordinal-encoded or removed.
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

# Columns standardised with StandardScaler in every variant.
num_cols = ['fruitset', 'fruitmass', 'seeds']

# Columns ordinal-encoded in every variant.
oe_cols = ['clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia',
        'AverageOfLowerTRange',
        'AverageRainingDays']

# Temperature-range columns: encoded by `preprocessor`,
# dropped by `preprocessor1` and `preprocessor2`.
oe_cols_drop = ['MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange',
       'MaxOfLowerTRange', 'MinOfLowerTRange']

# Encoded by `preprocessor` and `preprocessor1`, dropped by `preprocessor2`.
oe_cols_drop2 = ['RainingDays']

scaler = StandardScaler()
# Categories not seen during fit are encoded as -1 instead of raising an error.
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# NOTE(review): the same `scaler` / `oe` objects are passed to several transformers;
# this relies on ColumnTransformer cloning them when fitting -- confirm.

# Variant 0: keep every column (nothing dropped).
preprocessor = make_column_transformer(
    (scaler, num_cols),
    (oe, oe_cols),
    (oe, oe_cols_drop),
    (oe, oe_cols_drop2))

# Variant 1: drop the temperature-range columns, keep RainingDays.
preprocessor1 = make_column_transformer(
    (scaler, num_cols),
    (oe, oe_cols),
    (oe, oe_cols_drop2),
    ('drop', oe_cols_drop))

# Variant 2: drop both the temperature-range columns and RainingDays.
preprocessor2 = make_column_transformer(
    (scaler, num_cols),
    (oe, oe_cols),
    ('drop', oe_cols_drop2),
    ('drop', oe_cols_drop))

In [5]:
# Define the three base regressors (all trained with an absolute-error objective
# and the same random_state) plus a VotingRegressor that combines them.
from catboost import CatBoostRegressor
import lightgbm as lgb
from sklearn.ensemble import VotingRegressor,HistGradientBoostingRegressor

lgbm = lgb.LGBMRegressor(boosting_type='gbdt',learning_rate=0.02,n_estimators=1000,importance_type='gain',num_leaves=31,random_state = 318, objective = 'mae') # objective must be set to 'mae'
cb = CatBoostRegressor(random_state = 318, objective = 'MAE',verbose=0) # verbose=0: do not print the training progress
hgbr = HistGradientBoostingRegressor(random_state = 318,loss = 'absolute_error',learning_rate=0.12)

# `vc` is only referenced by the commented-out pipelines further down.
vc = VotingRegressor([('lgbm',lgbm),('cb',cb),('hgbr',hgbr)])

In [6]:
# Build one pipeline (preprocessor + regressor) per base model.
from sklearn.pipeline import make_pipeline
# Disabled preprocessor comparison; the trailing numbers are presumably the CV MAE
# obtained when each model was plugged into that preprocessor variant.
# pipea = make_pipeline(preprocessor, hgbr) #lgbm:344.59, cb:344.11 , vc:342.67, hgbr:345.30
# pipeb = make_pipeline(preprocessor1, hgbr) #lgbm:344.05, cb:344.73, vc:342.73, hgbr:344.70
# pipec = make_pipeline(preprocessor2, hgbr) #lgbm:344.56, cb:344.81, vc:343.16, hgbr:345.23
# Trailing numbers below: CV MAE printed by the cross-validation cells of this notebook.
pipe = make_pipeline(preprocessor, cb) #343.32
pipe1 = make_pipeline(preprocessor1, lgbm) #343.36039
# NOTE(review): pipe1 and pipe2 receive the same `preprocessor1` object.
pipe2 = make_pipeline(preprocessor1, hgbr) #344.59532
#pipe2 = make_pipeline(preprocessor, vc)

In [7]:
# Inspect the steps of the CatBoost pipeline.
pipe.steps

[('columntransformer',
  ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                   ['fruitset', 'fruitmass', 'seeds']),
                                  ('ordinalencoder-1',
                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                  unknown_value=-1),
                                   ['clonesize', 'honeybee', 'bumbles', 'andrena',
                                    'osmia', 'AverageOfLowerTRange',
                                    'AverageRainingDays']),
                                  ('ordinalencoder-2',
                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                  unknown_value=-1),
                                   ['MaxOfUpperTRange', 'MinOfUpperTRange',
                                    'AverageOfUpperTRange', 'MaxOfLowerTRange',
                                    'MinOfLowe

In [8]:
# Inspect the steps of the LightGBM pipeline.
pipe1.steps

[('columntransformer',
  ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                   ['fruitset', 'fruitmass', 'seeds']),
                                  ('ordinalencoder-1',
                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                  unknown_value=-1),
                                   ['clonesize', 'honeybee', 'bumbles', 'andrena',
                                    'osmia', 'AverageOfLowerTRange',
                                    'AverageRainingDays']),
                                  ('ordinalencoder-2',
                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                  unknown_value=-1),
                                   ['RainingDays']),
                                  ('drop', 'drop',
                                   ['MaxOfUpperTRange', 'MinOfUpperTRange',
                      

In [9]:
#cv score baseline
# from sklearn.model_selection import cross_val_score
# print(cross_val_score(pipea, X, y, scoring='neg_mean_absolute_error').mean())
# print(cross_val_score(pipeb, X, y, scoring='neg_mean_absolute_error').mean())
# print(cross_val_score(pipec, X, y, scoring='neg_mean_absolute_error').mean())

In [10]:
# Cross-validated baseline score for the CatBoost pipeline.
# The scorer is the *negated* MAE, so the printed mean is negative and
# values closer to zero are better. No `cv` is passed, so the default splitter is used.
from sklearn.model_selection import cross_val_score
print(cross_val_score(pipe, X, y, scoring='neg_mean_absolute_error').mean())

-343.3207824857909


In [11]:
# Cross-validated score (negated MAE) for the LightGBM pipeline.
print(cross_val_score(pipe1, X, y, scoring='neg_mean_absolute_error').mean())

-343.360391104551


In [12]:
# Cross-validated score (negated MAE) for the HistGradientBoostingRegressor pipeline.
print(cross_val_score(pipe2, X, y, scoring='neg_mean_absolute_error').mean())

-344.5953245466311


In [13]:
# Fit all three pipelines on the full training set.
# Only the last expression (pipe2) is rendered as the cell output.
pipe.fit(X,y)
pipe1.fit(X,y)
pipe2.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['fruitset', 'fruitmass',
                                                   'seeds']),
                                                 ('ordinalencoder-1',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=-1),
                                                  ['clonesize', 'honeybee',
                                                   'bumbles', 'andrena',
                                                   'osmia',
                                                   'AverageOfLowerTRange',
                                                   'AverageRainingDays']),
                                                 ('ordinalencoder-2',
    

In [14]:
# Load the test set and remove the `id` column so that only feature columns
# are passed to the pipelines; `test.id` is reused later for the submission file.
test = pd.read_csv('data/test.csv')
X_test = test.drop(columns=['id'])
X_test.head()

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds
0,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.399367,0.408088,31.394569
1,12.5,0.25,0.25,0.75,0.63,94.6,57.2,79.0,68.2,33.0,55.9,1.0,0.1,0.488048,0.442866,36.846956
2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.583379,0.487057,40.037644
3,25.0,0.5,0.38,0.38,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.433014,0.422847,33.116091
4,37.5,0.75,0.25,0.25,0.25,94.6,57.2,79.0,68.2,33.0,55.9,24.0,0.39,0.360996,0.38886,29.558019


In [15]:
# Test-set predictions of the CatBoost pipeline.
r1 = pipe.predict(X_test)

In [16]:
# Preview the CatBoost predictions.
r1

array([4236.68473571, 5890.65744798, 7236.79945553, ..., 6498.0249337 ,
       4449.66627663, 7280.9382808 ])

In [17]:
# Test-set predictions of the LightGBM pipeline.
r2 = pipe1.predict(X_test)

In [18]:
# Preview the LightGBM predictions.
r2

array([4300.7709737 , 5854.52985182, 7231.38009641, ..., 6515.71492293,
       4392.59610627, 7246.83150535])

In [19]:
# Test-set predictions of the HistGradientBoostingRegressor pipeline.
r3 = pipe2.predict(X_test)

In [20]:
# Preview the HistGradientBoostingRegressor predictions.
r3

array([4297.62026728, 5869.53709463, 7241.02035065, ..., 6505.06900906,
       4423.49968852, 7274.1660343 ])

In [21]:
# Simple model ensembling: element-wise weighted average of the three prediction
# arrays with weights 2:2:1 (r1 = CatBoost, r2 = LightGBM, r3 = HistGradientBoosting).
result = ((2*r1 + 2*r2 + r3)/5) # weights sum to 5, so dividing by 5 normalises the average

In [22]:
# Preview the blended predictions.
result

array([4274.50633722, 5871.98233885, 7235.4758909 , ..., 6506.50974447,
       4421.60489086, 7265.94112132])

In [23]:
# Distinct target values observed in the training set (np.unique returns them sorted ascending).
unique_targets = np.unique(train["yield"])

In [24]:
# Display the distinct training targets.
unique_targets

array([1945.53061, 2379.90521, 2384.72892, 2452.68075, 2508.37567,
       2605.69676, 2625.26916, 2688.02883, 2825.00374, 2946.92602,
       2988.05944, 3049.26032, 3049.39771, 3139.43255, 3182.69865,
       3238.02815, 3244.32926, 3269.27176, 3276.36206, 3322.9462 ,
       3373.43684, 3385.148  , 3385.36422, 3387.78516, 3396.72989,
       3436.49354, 3447.0821 , 3471.19214, 3494.94296, 3496.35236,
       3501.17992, 3502.08292, 3511.5379 , 3519.43131, 3626.36653,
       3631.90544, 3662.18313, 3680.56025, 3708.20502, 3712.99786,
       3720.71159, 3723.52338, 3748.46547, 3764.31973, 3784.11935,
       3804.61017, 3813.16579, 3822.98167, 3826.17226, 3828.16556,
       3834.5748 , 3837.51296, 3866.79896, 3895.62684, 3900.34404,
       3901.98955, 3911.42225, 3923.90628, 3937.91539, 3941.25512,
       3943.13168, 3948.98691, 3968.33018, 3983.09785, 3998.05416,
       4010.55215, 4016.3616 , 4042.84264, 4045.47966, 4051.55129,
       4056.83427, 4057.55521, 4062.73953, 4068.35066, 4079.91

In [25]:
def snap_to_nearest(preds, candidates):
    """Replace every prediction with the closest value found in ``candidates``.

    Vectorised equivalent of
    ``[min(candidates, key=lambda c: abs(c - p)) for p in preds]`` for an
    ascending ``candidates`` array, but it runs a binary search per prediction
    instead of scanning every candidate in a Python loop.

    Parameters
    ----------
    preds : array-like of float
        Raw predictions to snap.
    candidates : array-like of float
        Allowed output values; must contain at least one element.

    Returns
    -------
    list of float
        One snapped value per prediction. When a prediction is exactly halfway
        between two candidates, the smaller candidate is returned.

    Raises
    ------
    ValueError
        If ``candidates`` is empty.
    """
    candidates = np.sort(np.asarray(candidates, dtype=float))
    if candidates.size == 0:
        raise ValueError("candidates must contain at least one value")
    preds = np.asarray(preds, dtype=float)

    # Index of the first candidate >= prediction, clipped so that both the
    # left neighbour (hi - 1) and the right neighbour (hi) always exist.
    hi = np.clip(np.searchsorted(candidates, preds), 1, len(candidates) - 1)
    below, above = candidates[hi - 1], candidates[hi]

    # `<=` keeps the lower neighbour on ties, matching min() over a sorted array.
    nearest = np.where(np.abs(below - preds) <= np.abs(above - preds), below, above)
    return nearest.tolist()


# Post-processing "trick": snap each blended prediction to the nearest target
# value that actually occurs in the training set.
test_preds = snap_to_nearest(result, unique_targets)

In [26]:
# Display the snapped predictions.
test_preds

[4278.76321,
 5867.99722,
 7235.77564,
 4673.38052,
 3804.61017,
 5124.8549,
 7206.90196,
 6475.89456,
 8247.52116,
 4234.86859,
 5794.89185,
 5781.85965,
 5503.09583,
 5686.12196,
 4966.77945,
 4705.62653,
 4667.72494,
 6443.2569,
 4888.50303,
 5852.8742,
 6274.93484,
 7411.17457,
 5200.57849,
 5299.66133,
 6482.70572,
 4905.49871,
 4340.82151,
 8064.73881,
 5674.02852,
 7825.87492,
 8212.79289,
 7016.89385,
 6677.62443,
 2825.00374,
 5865.38472,
 2688.02883,
 6503.27204,
 6315.23321,
 4162.76156,
 7778.34916,
 7554.4683,
 5203.87681,
 5852.8742,
 6880.7759,
 6376.66256,
 5556.3746,
 8064.73881,
 4203.02762,
 7667.83619,
 6500.09778,
 5514.66748,
 6801.49205,
 5596.22636,
 5086.60991,
 6717.44627,
 6202.32702,
 4310.6254,
 6190.66597,
 6244.19576,
 5356.87186,
 6539.11085,
 4546.78674,
 6263.80788,
 5807.00693,
 6909.43,
 4340.82151,
 4357.82292,
 5971.39402,
 6347.56076,
 7041.38018,
 4830.95981,
 4179.18592,
 5605.221,
 6157.05484,
 6708.24129,
 4813.45285,
 4387.74185,
 4737.63925,

In [27]:
# Assemble the submission table (one snapped prediction per test id),
# index it by `id`, and write it to disk as CSV.
submission = pd.DataFrame({'id': test.id, 'yield': test_preds})
submission = submission.set_index('id')
submission.to_csv('catboost_lgbm_hgbr_ver3_trick.csv')

- 341.32303