In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the synthetic and the original training sets. `id` is dropped so it is
# not part of the training frame; drop() returns a new frame instead of
# mutating in place, which keeps the cell safe to re-run.
train1 = pd.read_csv("data/train_synthetic.csv").drop(columns=['id'])
train2 = pd.read_csv("data/train.csv").drop(columns=['id'])
# ignore_index=True gives every row a unique label. Without it each source
# keeps its own 0..n-1 labels and the combined index contains duplicates.
train = pd.concat([train1, train2], axis=0, ignore_index=True)
train.shape

(197446, 9)

In [8]:
# Predictor columns taken from the training frame; the target is `Age`.
feature_cols = [
    'Sex', 'Length', 'Diameter', 'Height', 'Weight',
    'Shucked Weight', 'Viscera Weight', 'Shell Weight',
]
# .copy() makes X an independent frame, so adding engineered columns to it
# later neither raises SettingWithCopyWarning nor touches `train`.
X = train[feature_cols].copy()
y = train.Age

In [9]:
def create_feature(df):
    """Return a copy of ``df`` with engineered weight/size features appended.

    The input frame is not modified; all new columns are added to a copy, so
    the function can be applied repeatedly to the same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Length', 'Diameter', 'Height', 'Weight',
        'Shucked Weight', 'Viscera Weight' and 'Shell Weight'. Any other
        columns are carried through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by, in order:
        'Viscera Ratio', 'Shell Ratio', 'Shell-to-Body Ratio', 'Meat Yield',
        'Length-to-Diameter Ratio', 'Weight-to-VisceraWeight Ratio',
        'Weight-to-ShellWeight Ratio', 'Weight-to-ShuckedWeight Ratio',
        'Surface Area', 'Length^2', 'Diameter^2', 'Log Weight',
        'Weight_wo_Viscera', 'Body Condition Index'.

    Notes
    -----
    The ratios are plain divisions, so a zero denominator produces inf/NaN
    in the corresponding feature.
    """
    out = df.copy()

    # Local aliases for the raw measurements (no temporary column needed).
    length = out['Length']
    diameter = out['Diameter']
    height = out['Height']
    total_weight = out['Weight']
    shucked = out['Shucked Weight']
    viscera = out['Viscera Weight']
    shell = out['Shell Weight']

    # Weight composition ratios.
    out['Viscera Ratio'] = viscera / total_weight
    out['Shell Ratio'] = shell / total_weight
    out['Shell-to-Body Ratio'] = shell / (total_weight + shell)
    out['Meat Yield'] = shucked / (total_weight + shell)
    out['Length-to-Diameter Ratio'] = length / diameter
    out['Weight-to-VisceraWeight Ratio'] = total_weight / viscera
    out['Weight-to-ShellWeight Ratio'] = total_weight / shell
    out['Weight-to-ShuckedWeight Ratio'] = total_weight / shucked

    # Surface area of a box with sides Length x Diameter x Height.
    out['Surface Area'] = 2 * (length * diameter + length * height + diameter * height)
    # Disabled in the original notebook (kept for reference):
    #    out['Volume'] = length * diameter * height
    #    out['Density'] = total_weight / out['Volume']
    #    out['Pseudo BMI'] = total_weight / (height ** 2)

    out['Length^2'] = length ** 2
    out['Diameter^2'] = diameter ** 2

    # log(1 + w); log1p is the numerically stable form of np.log(w + 1).
    out['Log Weight'] = np.log1p(total_weight)

    # Despite the name, this is shucked weight minus viscera weight.
    out['Weight_wo_Viscera'] = shucked - viscera
    out['Body Condition Index'] = np.sqrt(length * total_weight * shucked)

    return out

In [10]:
# Extend the raw predictors with the engineered features, then preview
# the first rows of the result.
X = X.pipe(create_feature)
X.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Viscera Ratio,Shell Ratio,...,Length-to-Diameter Ratio,Weight-to-VisceraWeight Ratio,Weight-to-ShellWeight Ratio,Weight-to-ShuckedWeight Ratio,Surface Area,Length^2,Diameter^2,Log Weight,Weight_wo_Viscera,Body Condition Index
0,I,0.7,0.6,0.2,4.493396,1.346601,0.86466,1.204854,0.192429,0.268139,...,1.166667,5.196721,3.729412,3.336842,1.36,0.49,0.36,1.703547,0.481942,2.05805
1,I,0.9375,0.7,0.225,8.178831,3.983105,1.545048,2.551455,0.188908,0.311958,...,1.339286,5.293578,3.205556,2.053381,2.049375,0.878906,0.49,2.2169,2.438057,5.526397
2,I,0.825,0.6375,0.2125,5.301356,2.438057,1.445825,1.644271,0.272727,0.31016,...,1.294118,3.666667,3.224138,2.174419,1.673437,0.680625,0.406406,1.840765,0.992233,3.265445
3,M,1.55,1.2125,0.4375,33.027167,14.288148,7.10155,8.363103,0.215021,0.253219,...,1.278351,4.650699,3.949153,2.311508,6.175937,2.4025,1.470156,3.527159,7.186598,27.045156
4,M,1.2,0.9625,0.3125,16.513584,7.10155,3.770484,4.961163,0.228326,0.300429,...,1.246753,4.379699,3.328571,2.325349,3.661563,1.44,0.926406,2.862977,3.331066,11.862818


In [11]:
from sklearn.compose import make_column_selector, make_column_transformer

In [12]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# One-hot encoder for the non-numeric columns; categories not seen during
# fit are ignored at transform time instead of raising.
ohe = OneHotEncoder(handle_unknown="ignore")
# Standardiser for the numeric columns.
scaler = StandardScaler()

In [13]:
# Column selectors based on dtype: numeric columns go to one branch,
# everything that is not numeric goes to the other.
cat_cols = make_column_selector(dtype_exclude="number")
num_cols = make_column_selector(dtype_include="number")

In [14]:
# Single preprocessing step: each transformer is applied to the columns
# chosen by its selector.
preprocessor = make_column_transformer(
    (scaler, num_cols),  # standardise numeric columns
    (ohe, cat_cols),     # one-hot encode the remaining columns
)

In [15]:
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor,HistGradientBoostingRegressor,GradientBoostingRegressor

In [19]:
from sklearn.pipeline import make_pipeline

# Seed shared by all three ensemble members.
SEED = 318

# LightGBM with DART boosting.
# NOTE(review): no objective is passed here, while the other two members are
# configured for absolute error — confirm the library default is intended.
lgbm = lgb.LGBMRegressor(
    boosting_type='dart',
    learning_rate=0.06281895152195545,
    n_estimators=712,
    num_leaves=56,
    random_state=SEED,
)

# CatBoost trained on MAE, with logging silenced.
cbr = CatBoostRegressor(
    learning_rate=0.06281895152195545,
    l2_leaf_reg=5.119977542926322,
    depth=6,
    random_state=SEED,
    objective='MAE',
    verbose=0,
)

# Histogram gradient boosting trained on absolute error.
hgbr = HistGradientBoostingRegressor(random_state=SEED, loss='absolute_error')

# Weighted average of the three regressors: CatBoost and
# HistGradientBoosting each count twice as much as LightGBM.
vr = VotingRegressor(
    [('lgbm', lgbm), ('cbr', cbr), ('hgbr', hgbr)],
    weights=(1, 2, 2),
)

In [20]:
# Chain preprocessing and the voting ensemble into one estimator, so the
# scaler and encoder are fitted together with the models.
pipe = make_pipeline(preprocessor, vr)

In [21]:
# Show the (name, estimator) pairs that make up the pipeline.
pipe.steps

[('columntransformer',
  ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                   <sklearn.compose._column_transformer.make_column_selector object at 0x00000246BAFFC5E0>),
                                  ('onehotencoder',
                                   OneHotEncoder(handle_unknown='ignore'),
                                   <sklearn.compose._column_transformer.make_column_selector object at 0x00000246BAFFC4C0>)])),
 ('votingregressor',
  VotingRegressor(estimators=[('lgbm',
                               LGBMRegressor(boosting_type='dart',
                                             learning_rate=0.06281895152195545,
                                             n_estimators=712, num_leaves=56,
                                             random_state=318)),
                              ('cbr',
                               <catboost.core.CatBoostRegressor object at 0x00000246C9FAE070>),
                              ('hgbr',
   

In [22]:
from sklearn.model_selection import cross_val_score
# Cross-validated score of the whole pipeline: mean of the per-fold negative
# MAE (closer to 0 is better). `cv` is not passed, so the library default
# splitter is used.
# NOTE(review): the default splitter for a regressor does not shuffle, and
# `train` is the synthetic rows followed by the original rows — consider
# cv=KFold(n_splits=5, shuffle=True, random_state=...) and confirm.
%time cross_val_score(pipe, X, y, scoring='neg_mean_absolute_error').mean()

Wall time: 8min 45s


-1.3193496437874979

In [23]:
# Fit the preprocessing + ensemble pipeline on the full training data.
pipe.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x00000246BAFFC5E0>),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x00000246BAFFC4C0>)])),
                ('votingregressor',
                 VotingRegressor(estimators=[('lgbm',
                                              LGBMRegressor(boosting_type='dart',
                                                            learning_rate=0.06281895152195545,
                                                            n_estimators=712,
                                         

In [27]:
# Read the test set, add the same engineered features as for training,
# and discard `id` so it is not fed to the pipeline.
test = (
    pd.read_csv("data/test.csv")
    .pipe(create_feature)
    .drop(columns=['id'])
)

In [28]:
# Preview the engineered test features.
test.head()

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Viscera Ratio,Shell Ratio,...,Length-to-Diameter Ratio,Weight-to-VisceraWeight Ratio,Weight-to-ShellWeight Ratio,Weight-to-ShuckedWeight Ratio,Surface Area,Length^2,Diameter^2,Log Weight,Weight_wo_Viscera,Body Condition Index
0,I,1.05,0.7625,0.275,8.618248,3.657085,1.729319,2.721552,0.200658,0.315789,...,1.377049,4.983607,3.166667,2.356589,2.598125,1.1025,0.581406,2.263662,1.927766,5.7527
1,I,1.1625,0.8875,0.275,15.507176,7.030676,3.246018,3.96893,0.209324,0.255941,...,1.309859,4.777293,3.907143,2.205645,3.190938,1.351406,0.787656,2.803795,3.784658,11.258004
2,F,1.2875,0.9875,0.325,14.571643,5.556502,3.883882,4.819415,0.266537,0.330739,...,1.303797,3.751825,3.023529,2.622449,4.021562,1.657656,0.975156,2.745452,1.67262,10.210068
3,F,1.55,0.9875,0.3875,28.377849,13.380964,6.548735,7.030676,0.230769,0.247752,...,1.56962,4.333333,4.03629,2.120763,5.027813,2.4025,0.975156,3.380241,6.83223,24.260474
4,I,1.1125,0.85,0.2625,11.765042,5.528153,2.466407,3.331066,0.209639,0.283133,...,1.308824,4.770115,3.531915,2.128205,2.921563,1.237656,0.7225,2.54671,3.061746,8.506223


In [29]:
# Predict Age for every test row with the fitted pipeline.
preds = pipe.predict(test)

In [30]:
# Read only the ids needed for the submission file. A separate name is used
# so `test` keeps holding the engineered feature frame; rebinding it to the
# raw CSV would make the prediction cell fail if it were run again.
test_ids = pd.read_csv("data/test.csv", usecols=['id'])['id']

# One row per test id with its predicted Age, written with `id` as the index
# column of the CSV.
submission = pd.DataFrame({'id': test_ids, 'Age': preds}).set_index('id')
submission.to_csv('crab_vc_ver4_extradata.csv')