In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training data and show its (rows, columns) shape.
train = pd.read_csv(filepath_or_buffer="data/train.csv")
train.shape

(15289, 18)

In [3]:
# Define the feature matrix X and the target y.
# Columns are selected by name, not by position, so a change in the CSV's
# column order cannot silently leak the target into X or drop a feature.
# This mirrors how the 'id' column is removed from the test set.
# Assumes train.csv carries an 'id' column like test.csv does; a KeyError
# here means the schema differs and should be checked.
y = train['yield']
X = train.drop(columns=['id', 'yield'])

In [4]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

# Transformer instances handed to both column transformers below.
scaler = StandardScaler()
oe = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')

# Columns that are standardised.
num_cols = ['fruitset', 'fruitmass', 'seeds']

# Columns that are ordinal-encoded by both preprocessors.
oe_cols = [
    'clonesize',
    'honeybee',
    'bumbles',
    'andrena',
    'osmia',
    'AverageOfLowerTRange',
    'RainingDays',
    'AverageRainingDays',
]

# Columns that `preprocessor` ordinal-encodes and `preprocessor1` drops.
oe_cols_drop = [
    'MaxOfUpperTRange',
    'MinOfUpperTRange',
    'AverageOfUpperTRange',
    'MaxOfLowerTRange',
    'MinOfLowerTRange',
]

# Variant that keeps every listed column.
preprocessor = make_column_transformer(
    (scaler, num_cols),
    (oe, oe_cols),
    (oe, oe_cols_drop),
)

# Variant that discards the `oe_cols_drop` columns.
preprocessor1 = make_column_transformer(
    (scaler, num_cols),
    (oe, oe_cols),
    ('drop', oe_cols_drop),
)

In [5]:
from catboost import CatBoostRegressor
import lightgbm as lgb
from sklearn.ensemble import VotingRegressor

# Two gradient-boosting regressors sharing the same seed, plus a voting
# ensemble built from both.
lgbm = lgb.LGBMRegressor(
    objective='mae',  # objective='mae' has to be set explicitly
    random_state=318,
)
cb = CatBoostRegressor(
    objective='MAE',
    random_state=318,
    verbose=0,  # verbose: do not show the training progress
)
vc = VotingRegressor(estimators=[('lgbm', lgbm), ('cb', cb)])  # 342.73

In [6]:
# Build the modelling pipelines.
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
# Scores noted by the author for earlier experiments:
#   make_pipeline(preprocessor, ...)  -> lgbm: 344.59, cb: 344.11, vc: 342.67
#   make_pipeline(preprocessor1, ...) -> lgbm: 344.05, cb: 344.73, vc: 342.73
#
# Each pipeline receives its own clone of the column transformer. With caching
# disabled, Pipeline.fit fits its steps in place, so a single ColumnTransformer
# instance shared by several pipelines would be re-fitted (and its fitted state
# overwritten) every time any one of them is fitted.
pipe = make_pipeline(clone(preprocessor), cb)
pipe1 = make_pipeline(clone(preprocessor1), lgbm)
pipe2 = make_pipeline(clone(preprocessor), vc)

In [7]:
# Inspect the (name, estimator) steps of `pipe`.
pipe.steps

[('columntransformer',
  ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                   ['fruitset', 'fruitmass', 'seeds']),
                                  ('ordinalencoder-1',
                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                  unknown_value=-1),
                                   ['clonesize', 'honeybee', 'bumbles', 'andrena',
                                    'osmia', 'AverageOfLowerTRange',
                                    'RainingDays', 'AverageRainingDays']),
                                  ('ordinalencoder-2',
                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                  unknown_value=-1),
                                   ['MaxOfUpperTRange', 'MinOfUpperTRange',
                                    'AverageOfUpperTRange', 'MaxOfLowerTRange',
                               

In [8]:
# Inspect the (name, estimator) steps of `pipe1`; the 'drop' entry lists the discarded columns.
pipe1.steps

[('columntransformer',
  ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                   ['fruitset', 'fruitmass', 'seeds']),
                                  ('ordinalencoder',
                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                  unknown_value=-1),
                                   ['clonesize', 'honeybee', 'bumbles', 'andrena',
                                    'osmia', 'AverageOfLowerTRange',
                                    'RainingDays', 'AverageRainingDays']),
                                  ('drop', 'drop',
                                   ['MaxOfUpperTRange', 'MinOfUpperTRange',
                                    'AverageOfUpperTRange', 'MaxOfLowerTRange',
                                    'MinOfLowerTRange'])])),
 ('lgbmregressor', LGBMRegressor(objective='mae', random_state=318))]

In [9]:
# Cross-validated baseline for `pipe`: mean of the per-fold negative MAE
# (values closer to zero are better).
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(pipe, X, y, scoring='neg_mean_absolute_error')
print(cv_scores.mean())

-344.11342747481063


In [10]:
# Fit all three pipelines on the full training data. pipe2 is fitted last,
# as a bare expression, so its repr stays the cell's displayed output.
for pl in (pipe, pipe1):
    pl.fit(X, y)
pipe2.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['fruitset', 'fruitmass',
                                                   'seeds']),
                                                 ('ordinalencoder-1',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=-1),
                                                  ['clonesize', 'honeybee',
                                                   'bumbles', 'andrena',
                                                   'osmia',
                                                   'AverageOfLowerTRange',
                                                   'RainingDays',
                                                   'AverageRainingDays']),
        

In [11]:
# Load the test set and drop the 'id' column to obtain the feature frame;
# `test` itself keeps 'id' for building the submission files later.
test = pd.read_csv(filepath_or_buffer='data/test.csv')
X_test = test.drop('id', axis=1)
X_test.head(n=5)

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds
0,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.399367,0.408088,31.394569
1,12.5,0.25,0.25,0.75,0.63,94.6,57.2,79.0,68.2,33.0,55.9,1.0,0.1,0.488048,0.442866,36.846956
2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.583379,0.487057,40.037644
3,25.0,0.5,0.38,0.38,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.433014,0.422847,33.116091
4,37.5,0.75,0.25,0.25,0.25,94.6,57.2,79.0,68.2,33.0,55.9,24.0,0.39,0.360996,0.38886,29.558019


In [12]:
# Test-set predictions from `pipe` (the pipeline whose final step is the CatBoost regressor).
r1 = pipe.predict(X_test)

In [13]:
# Show the predictions of `pipe`.
r1

array([4232.13018279, 5888.31486722, 7241.70742716, ..., 6530.81351721,
       4442.526779  , 7268.58051109])

In [14]:
# Test-set predictions from `pipe1` (the pipeline whose final step is the LightGBM regressor).
r2 = pipe1.predict(X_test)

In [15]:
# Show the predictions of `pipe1`.
r2

array([4309.44181395, 5885.31191706, 7213.66165235, ..., 6504.66566258,
       4376.37881906, 7225.19658197])

In [16]:
# Simple model ensembling: element-wise average of the two prediction arrays.
result = 0.5 * (r1 + r2)

In [17]:
# Show the averaged predictions.
result

array([4270.78599837, 5886.81339214, 7227.68453976, ..., 6517.7395899 ,
       4409.45279903, 7246.88854653])

In [18]:
# Write the averaged predictions as a two-column (id, yield) submission file.
submission_avg = pd.DataFrame({'id': test['id'], 'yield': result})
submission_avg.to_csv('catboost_lgbm_ver1.csv', index=False)

- Score recorded for the averaged submission (`catboost_lgbm_ver1.csv`): 342.83692

In [19]:
# Test-set predictions from `pipe2`, the pipeline whose final step is the voting regressor.
r = pipe2.predict(X_test)

In [20]:
# Write the voting-regressor predictions as a two-column (id, yield) submission file.
submission_vc = pd.DataFrame({'id': test['id'], 'yield': r})
submission_vc.to_csv('catboost_lgbm_vc_ver1.5.csv', index=False)

In [21]:
# Show the voting-regressor predictions.
r

array([4271.0008809 , 5882.64610569, 7230.78737409, ..., 6511.39885957,
       4429.34387387, 7256.46188771])

- Score recorded for the voting-regressor submission (`catboost_lgbm_vc_ver1.5.csv`): 342.92801