In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labelled training data from the local data/ folder and preview
# the first rows to check the columns (predictors plus the 'Age' target).
train = pd.read_csv("data/train.csv")
train.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


In [3]:
# Raw predictor columns used as model input; 'Age' is the regression target.
feature_cols = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Weight',
    'Shucked Weight',
    'Viscera Weight',
    'Shell Weight',
]

# Take an explicit copy of the selection. Without it, X is a slice of `train`,
# and adding engineered columns to it later makes pandas emit
# "A value is trying to be set on a copy of a slice from a DataFrame".
X = train[feature_cols].copy()
y = train['Age']

In [4]:
def create_feature(df):
    """Return a copy of ``df`` extended with engineered feature columns.

    The input frame is not modified: every derived column is written to a
    copy. This keeps the caller's data intact and avoids pandas'
    ``SettingWithCopyWarning`` when ``df`` is itself a slice of another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the numeric columns 'Length', 'Diameter', 'Height',
        'Weight', 'Shucked Weight', 'Viscera Weight' and 'Shell Weight'.
        Any additional columns are carried through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by 15 derived
        columns (weight ratios, simple geometry, squares, a log transform and
        two combined indices).

    Raises
    ------
    KeyError
        If one of the required columns is missing.

    Notes
    -----
    NOTE(review): none of the ratio features guard against a zero
    denominator; confirm the data never contains zero weights or diameters.
    """
    # Work on a copy so the caller's frame (possibly a slice) is never mutated.
    df = df.copy()

    # 'Total Weight' is a plain alias of 'Weight'; no summation is performed.
    df['Total Weight'] = df['Weight']

    # Component weights relative to the total weight.
    df['Viscera Ratio'] = df['Viscera Weight'] / df['Total Weight']
    df['Shell Ratio'] = df['Shell Weight'] / df['Total Weight']
    df['Shell-to-Body Ratio'] = df['Shell Weight'] / (df['Total Weight'] + df['Shell Weight'])
    df['Meat Yield'] = df['Shucked Weight'] / (df['Total Weight'] + df['Shell Weight'])
    df['Length-to-Diameter Ratio'] = df['Length'] / df['Diameter']

    # Inverse ratios: total weight relative to each component.
    df['Weight-to-VisceraWeight Ratio'] = df['Total Weight'] / df['Viscera Weight']
    df['Weight-to-ShellWeight Ratio'] = df['Total Weight'] / df['Shell Weight']
    df['Weight-to-ShuckedWeight Ratio'] = df['Total Weight'] / df['Shucked Weight']

    # Surface area of a Length x Diameter x Height box.
    df['Surface Area'] = 2 * (
        df['Length'] * df['Diameter']
        + df['Length'] * df['Height']
        + df['Diameter'] * df['Height']
    )
    # Disabled features, kept for reference:
#    df['Volume'] = df['Length'] * df['Diameter'] * df['Height']
#    df['Density'] = df['Total Weight'] / df['Volume']
#    df['Pseudo BMI'] = df['Total Weight'] / (df['Height'] ** 2)

    # Squared size terms.
    df['Length^2'] = df['Length'] ** 2
    df['Diameter^2'] = df['Diameter'] ** 2

    # log(1 + weight); the +1 keeps the result finite for a weight of zero.
    df['Log Weight'] = np.log(df['Total Weight'] + 1)

    # NOTE(review): despite the name, this is shucked weight minus viscera
    # weight, not total weight minus viscera weight. Confirm the intent.
    df['Weight_wo_Viscera'] = df['Shucked Weight'] - df['Viscera Weight']
    df['Body Condition Index'] = np.sqrt(df['Length'] * df['Total Weight'] * df['Shucked Weight'])

    return df

In [5]:
# Add the engineered columns to the training predictors and preview the result.
X = X.pipe(create_feature)
X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Total Weight'] = df['Weight']


Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Total Weight,Viscera Ratio,...,Length-to-Diameter Ratio,Weight-to-VisceraWeight Ratio,Weight-to-ShellWeight Ratio,Weight-to-ShuckedWeight Ratio,Surface Area,Length^2,Diameter^2,Log Weight,Weight_wo_Viscera,Body Condition Index
0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,28.973189,0.229452,...,1.297872,4.358209,3.470289,2.276169,5.60875,2.325625,1.380625,3.400303,6.080968,23.715318
1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,10.418441,0.223129,...,1.333333,4.481707,3.0625,2.304075,2.87375,1.21,0.680625,2.43523,2.197086,7.198645
2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,24.777463,0.224256,...,1.247191,4.459184,3.719149,2.185,4.962188,1.925156,1.237656,3.249501,5.783298,19.744567
3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,50.660556,0.21697,...,1.20354,4.608925,3.378072,2.488858,7.915,2.89,1.995156,3.944695,9.363103,41.869171
4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,23.289114,0.193548,...,1.234568,5.166667,3.911905,1.944379,4.058438,1.5625,1.025156,3.190028,7.470093,18.673149


In [6]:
from sklearn.compose import make_column_selector, make_column_transformer

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Preprocessing building blocks: a one-hot encoder configured with
# handle_unknown='ignore', and a standard scaler with default settings.
ohe = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()

In [8]:
# Dtype-based column selectors: `cat_cols` picks every non-numeric column,
# `num_cols` picks every numeric one.
cat_cols = make_column_selector(dtype_exclude='number')
num_cols = make_column_selector(dtype_include='number')

In [9]:
# Single preprocessing step: the scaler is applied to the numeric columns and
# the one-hot encoder to the non-numeric ones.
preprocessor = make_column_transformer((scaler, num_cols), (ohe, cat_cols))

In [10]:
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor,HistGradientBoostingRegressor,GradientBoostingRegressor

In [12]:
from sklearn.pipeline import make_pipeline

# Hyperparameters shared by several models, named once instead of repeated.
SEED = 318
LEARNING_RATE = 0.06281895152195545

# LightGBM regressor using DART boosting.
lgbm = lgb.LGBMRegressor(
    boosting_type='dart',
    learning_rate=LEARNING_RATE,
    n_estimators=712,
    num_leaves=56,
    random_state=SEED,
)

# CatBoost regressor with the 'MAE' objective; verbose=0 is passed to silence
# training output.
cbr = CatBoostRegressor(
    learning_rate=LEARNING_RATE,
    l2_leaf_reg=5.119977542926322,
    depth=6,
    random_state=SEED,
    objective='MAE',
    verbose=0,
)

# scikit-learn histogram gradient boosting with absolute-error loss.
hgbr = HistGradientBoostingRegressor(
    random_state=SEED,
    loss='absolute_error',
)

# Combine the three models in a VotingRegressor with weights 1 : 2 : 2
# (lgbm : cbr : hgbr).
vr = VotingRegressor(
    [('lgbm', lgbm), ('cbr', cbr), ('hgbr', hgbr)],
    weights=(1, 2, 2),
)

In [13]:
# Chain the column preprocessor and the voting ensemble into one estimator,
# so preprocessing is fitted together with the model.
pipe = make_pipeline(preprocessor, vr)

In [14]:
# Display the pipeline's steps to check how it was assembled.
pipe.steps

[('columntransformer',
  ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                   <sklearn.compose._column_transformer.make_column_selector object at 0x0000021D40123970>),
                                  ('onehotencoder',
                                   OneHotEncoder(handle_unknown='ignore'),
                                   <sklearn.compose._column_transformer.make_column_selector object at 0x0000021D3A873340>)])),
 ('votingregressor',
  VotingRegressor(estimators=[('lgbm',
                               LGBMRegressor(boosting_type='dart',
                                             learning_rate=0.06281895152195545,
                                             n_estimators=712, num_leaves=56,
                                             random_state=318)),
                              ('cbr',
                               <catboost.core.CatBoostRegressor object at 0x0000021D3A95CBE0>),
                              ('hgbr',
   

In [15]:
# cross-validate the pipeline
# The scorer is the negated mean absolute error, so the mean shown below is
# -MAE (closer to zero is better). No `cv` argument is passed, so
# scikit-learn's default splitting strategy is used. `%time` reports the wall
# time of the whole cross-validation run.
from sklearn.model_selection import cross_val_score
%time cross_val_score(pipe, X, y, scoring='neg_mean_absolute_error').mean()

Wall time: 3min 50s


-1.3591614622646666

In [19]:
# Fit the full pipeline (preprocessing + ensemble) on all of X and y.
pipe.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000021D40123970>),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000021D3A873340>)])),
                ('votingregressor',
                 VotingRegressor(estimators=[('lgbm',
                                              LGBMRegressor(boosting_type='dart',
                                                            learning_rate=0.06281895152195545,
                                                            n_estimators=712,
                                         

In [20]:
# Load the test set and discard its 'id' column in one non-mutating chain.
test = pd.read_csv("data/test.csv").drop(columns=['id'])

In [21]:
# Apply the same feature engineering to the test predictors and preview them.
test = test.pipe(create_feature)
test.head()

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Total Weight,Viscera Ratio,...,Length-to-Diameter Ratio,Weight-to-VisceraWeight Ratio,Weight-to-ShellWeight Ratio,Weight-to-ShuckedWeight Ratio,Surface Area,Length^2,Diameter^2,Log Weight,Weight_wo_Viscera,Body Condition Index
0,I,1.05,0.7625,0.275,8.618248,3.657085,1.729319,2.721552,8.618248,0.200658,...,1.377049,4.983607,3.166667,2.356589,2.598125,1.1025,0.581406,2.263662,1.927766,5.7527
1,I,1.1625,0.8875,0.275,15.507176,7.030676,3.246018,3.96893,15.507176,0.209324,...,1.309859,4.777293,3.907143,2.205645,3.190938,1.351406,0.787656,2.803795,3.784658,11.258004
2,F,1.2875,0.9875,0.325,14.571643,5.556502,3.883882,4.819415,14.571643,0.266537,...,1.303797,3.751825,3.023529,2.622449,4.021562,1.657656,0.975156,2.745452,1.67262,10.210068
3,F,1.55,0.9875,0.3875,28.377849,13.380964,6.548735,7.030676,28.377849,0.230769,...,1.56962,4.333333,4.03629,2.120763,5.027813,2.4025,0.975156,3.380241,6.83223,24.260474
4,I,1.1125,0.85,0.2625,11.765042,5.528153,2.466407,3.331066,11.765042,0.209639,...,1.308824,4.770115,3.531915,2.128205,2.921563,1.237656,0.7225,2.54671,3.061746,8.506223


In [22]:
# Predict with the fitted pipeline on the feature-engineered test frame.
preds = pipe.predict(test)

In [23]:
# Re-read the test file to get the 'id' column back for the submission.
# NOTE: this rebinds `test` to the raw file contents, so the prediction cell
# cannot be re-run afterwards without first rebuilding the engineered frame.
test = pd.read_csv("data/test.csv")

# One row per test id with its predicted Age, written with 'id' as the index.
submission = pd.DataFrame({'id': test['id'], 'Age': preds})
submission = submission.set_index('id')
submission.to_csv('crab_vc_ver4_featureengineering.csv')