<a href="https://www.kaggle.com/code/szymnq/regression-with-an-abalone-dataset?scriptVersionId=171321986" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
pip install scikit-lego

Collecting scikit-lego
  Downloading scikit_lego-0.8.1-py2.py3-none-any.whl.metadata (11 kB)
Downloading scikit_lego-0.8.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.6/209.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-lego
Successfully installed scikit-lego-0.8.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import scipy
import os
import optuna
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedKFold, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn.inspection import PartialDependenceDisplay
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklego.linear_model import LADRegression
from scipy.stats import loguniform

In [3]:
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/playground-series-s4e4/sample_submission.csv
/kaggle/input/playground-series-s4e4/train.csv
/kaggle/input/playground-series-s4e4/test.csv


In [4]:
# Read the train and test CSVs; `combine` holds both frames so later cells
# can apply the same in-place transformation to each of them.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/playground-series-s4e4/train.csv',
        '../input/playground-series-s4e4/test.csv',
    )
)
combine = [train_df, test_df]

In [5]:
# Column dtypes, non-null counts and memory usage of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90615 entries, 0 to 90614
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              90615 non-null  int64  
 1   Sex             90615 non-null  object 
 2   Length          90615 non-null  float64
 3   Diameter        90615 non-null  float64
 4   Height          90615 non-null  float64
 5   Whole weight    90615 non-null  float64
 6   Whole weight.1  90615 non-null  float64
 7   Whole weight.2  90615 non-null  float64
 8   Shell weight    90615 non-null  float64
 9   Rings           90615 non-null  int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 6.9+ MB


In [6]:
# Preview the first rows of the raw training data.
train_df.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric training columns.
train_df.describe()

Unnamed: 0,id,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
count,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0
mean,45307.0,0.517098,0.401679,0.135464,0.789035,0.340778,0.169422,0.225898,9.696794
std,26158.441658,0.118217,0.098026,0.038008,0.457671,0.204428,0.100909,0.130203,3.176221
min,0.0,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,22653.5,0.445,0.345,0.11,0.419,0.1775,0.0865,0.12,8.0
50%,45307.0,0.545,0.425,0.14,0.7995,0.33,0.166,0.225,9.0
75%,67960.5,0.6,0.47,0.16,1.0675,0.463,0.2325,0.305,11.0
max,90614.0,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [8]:
# Preview the first rows of the raw test data.
test_df.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
0,90615,M,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005
1,90616,M,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275
2,90617,M,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405
3,90618,M,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235
4,90619,I,0.415,0.325,0.11,0.358,0.1575,0.067,0.105


Preprocessing

In [9]:
# Fit the encoder once, on the training data only, and reuse it for every
# frame. Refitting per frame (fit_transform inside the loop) would give train
# and test different integer codes whenever their category sets differ.
# With a single fit, an unseen test category raises instead of being silently
# mis-encoded.
le = LabelEncoder()
le.fit(train_df['Sex'])

for dataset in combine:
    dataset['Sex'] = le.transform(dataset['Sex'])
    # Replace spaces in column names with underscores (e.g. 'Whole weight' -> 'Whole_weight').
    dataset.columns = dataset.columns.str.replace(' ', '_')

Feature engineering

In [10]:
# Derived features, added in place to both frames:
#   - diameter and height relative to length,
#   - shell weight relative to whole weight,
#   - arithmetic mean of the three 'Whole_weight*' columns.
for frame in combine:
    length = frame['Length']
    frame['Diameter_Length_Ratio'] = frame['Diameter'].div(length)
    frame['Height_Length_Ratio'] = frame['Height'].div(length)
    frame['Shell_Whole_weight_Ratio'] = frame['Shell_weight'].div(frame['Whole_weight'])

    weight_total = frame['Whole_weight'] + frame['Whole_weight.1'] + frame['Whole_weight.2']
    frame['Mean_weight'] = weight_total / 3

In [11]:
# Preview the training rows together with the derived feature columns.
train_df.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole_weight,Whole_weight.1,Whole_weight.2,Shell_weight,Rings,Diameter_Length_Ratio,Height_Length_Ratio,Shell_Whole_weight_Ratio,Mean_weight
0,0,0,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11,0.781818,0.272727,0.311082,0.4155
1,1,0,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11,0.777778,0.230159,0.283186,0.6215
2,2,1,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6,0.6875,0.15625,0.238095,0.009833
3,3,2,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10,0.798319,0.252101,0.273373,0.4985
4,4,1,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9,0.765766,0.234234,0.252558,0.437167


In [12]:
# Preview the test rows together with the derived feature columns.
test_df.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole_weight,Whole_weight.1,Whole_weight.2,Shell_weight,Diameter_Length_Ratio,Height_Length_Ratio,Shell_Whole_weight_Ratio,Mean_weight
0,90615,2,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005,0.736434,0.24031,0.24273,0.723
1,90616,2,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275,0.793103,0.275862,0.279756,0.560333
2,90617,2,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405,0.75,0.25,0.28648,0.458833
3,90618,2,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235,0.859649,0.254386,0.268879,0.471
4,90619,1,0.415,0.325,0.11,0.358,0.1575,0.067,0.105,0.783133,0.26506,0.293296,0.194167


Base model

In [13]:
def rmsle(predictions, targets):
    """Root mean squared logarithmic error between predictions and targets.

    Parameters
    ----------
    predictions, targets : array-like of numbers
        Values to compare element-wise. Lists, NumPy arrays and pandas Series
        are accepted; they are compared positionally (any pandas index is
        ignored). Values must be greater than -1 for the logarithm to be
        defined.

    Returns
    -------
    numpy.float64
        sqrt(mean((log1p(predictions) - log1p(targets)) ** 2))
    """
    # Convert to plain arrays so that Python lists work and so that two Series
    # with different indexes are not silently re-aligned (which yields NaNs).
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    # log1p is the numerically accurate form of log(x + 1) for small x.
    log_diff = np.log1p(predictions) - np.log1p(targets)
    return np.sqrt(np.mean(log_diff ** 2))

In [14]:
# Feature matrix: every column except the row identifier and the target.
X = train_df.drop(columns=['id', 'Rings'])
# Target vector.
y = train_df['Rings']

# One (scores, predictions) pair of lists per model, to be appended to once per fold.
gb_cv_scores, gb_preds = [], []
hist_cv_scores, hist_preds = [], []
lgb_cv_scores, lgb_preds = [], []
xgb_cv_scores, xgb_preds = [], []
ens_cv_scores, ens_preds = [], []

# Shuffled 10-fold splitter with a fixed seed (plain KFold, not stratified).
skf = KFold(n_splits=10, shuffle=True, random_state=42)

In [15]:
# for i, (train_i, test_i) in enumerate(skf.split(X, y)):
#     X_train, X_test = X.iloc[train_i], X.iloc[test_i]
#     y_train, y_test = y.iloc[train_i], y.iloc[test_i]
    
#     print('-------------------------------------------')
#     # GradientBoosting
#     gb_md = GradientBoostingRegressor(loss = 'absolute_error'
#                                  ).fit(X_train, y_train)
    
#     gb_pred = gb_md.predict(X_test).astype(int)
#     gb_score = rmsle(gb_pred, y_test)
    
#     gb_preds.append(gb_pred)
#     gb_cv_scores.append(gb_score)
    
#     print(f'Fold {i} => GradientBoosting of RMSLE =>', gb_score)
    
#     # HistGradientBoosting
#     hist_md = HistGradientBoostingRegressor(loss = 'absolute_error',
#                                             ).fit(X_train, y_train)
                                       
#     hist_pred = hist_md.predict(X_test)
#     hist_score = rmsle(hist_pred, y_test)
    
#     hist_preds.append(hist_pred)
#     hist_cv_scores.append(hist_score)
    
#     print(f'Fold {i} => HistGradientBoosting of RMSLE =>', hist_score)
    
#     # LightGBM
#     lgb_md = LGBMRegressor(objective = 'rmse',
#                            force_col_wise = True,
#                            verbose = -1,
#                           ).fit(X_train, y_train)
                                       
#     lgb_pred = lgb_md.predict(X_test)
#     lgb_score = rmsle(lgb_pred, y_test)
    
#     lgb_preds.append(lgb_pred)
#     lgb_cv_scores.append(lgb_score)
    
#     print(f'Fold {i} => LightGBM of RMSLE =>', lgb_score)
    
#     # XGBoost
#     xgb_md = XGBRegressor(objective = 'reg:pseudohubererror',
#                          ).fit(X_train, y_train)
                                       
#     xgb_pred = xgb_md.predict(X_test)
#     xgb_score = rmsle(xgb_pred, y_test)
    
#     xgb_preds.append(xgb_pred)
#     xgb_cv_scores.append(xgb_score)
    
#     print(f'Fold {i} => XGBoost of RMSLE =>', xgb_score)
    
#     # LAD Ensemble
#     ens_md = LADRegression().fit(X_train, y_train)
                                       
#     ens_pred = ens_md.predict(X_test)
#     ens_score = rmsle(ens_pred, y_test)
    
#     ens_preds.append(ens_pred)
#     ens_cv_scores.append(ens_score)
    
#     print(f'Fold {i} => Ensemble of RMSLE =>', ens_score)

In [16]:
# gb_cv_score = np.mean(gb_cv_scores)
# hist_cv_score = np.mean(hist_cv_scores)
# lgb_cv_score = np.mean(lgb_cv_scores)
# xgb_cv_score = np.mean(xgb_cv_scores)
# ens_cv_score = np.mean(ens_cv_scores)

# model_perf = pd.DataFrame({'Model': ['GradientBoosting', 'HistGradient' ,'LightGBM', 'XGBoost', 'Ensemble'],
#                            'cv-score': [gb_cv_score, hist_cv_score, lgb_cv_score, xgb_cv_score, ens_cv_score]})

# plt.figure(figsize = (8, 8))
# ax = sns.barplot(y = 'Model', x = 'cv-score', data = model_perf)
# ax.bar_label(ax.containers[0]);

Hyperparameter tuning

In [17]:
# def objective(trial):
#     X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42) 
    
#     params = {
#         "objective": "regression",
#         "metric": "rmse",
#         "n_estimators": 1000,
#         "verbosity": -1,
#         "bagging_freq": 1,
#         "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
#         "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
#         "subsample": trial.suggest_float("subsample", 0.05, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
#         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
#         "verbose": -1,
#         "force_col_wise": True,
#     }
    

#     model = LGBMRegressor(**params)
#     model.fit(X_train, y_train)
#     predictions = model.predict(X_val)
#     result = rmsle(predictions, y_val)
#     return result

In [18]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=30)

In [19]:
# print('Best hyperparameters:', study.best_params)
# print('Best RMSLE:', study.best_value)
# lgbmr_clf = LGBMRegressor(**study.best_params)

Final model

In [20]:
# Final estimator: a LightGBM regressor with an RMSE objective; all other
# hyperparameters are left at the library defaults.
final_params = {
    'objective': 'rmse',
    'force_col_wise': True,
    'verbose': -1,
}
lgbmr_clf = LGBMRegressor(**final_params)

In [21]:
# Select features and target by NAME, not by position. The engineered columns
# were appended after 'Rings', so positional slicing (iloc[:, 1:-1] / iloc[:, -1])
# would feed 'Rings' in as a feature and train against 'Mean_weight' instead.
feature_cols = [col for col in train_df.columns if col not in ('id', 'Rings')]

lgbmr_clf.fit(train_df[feature_cols], train_df['Rings'])
# Index the test frame with the same column list so train and test features
# match in both content and order.
predictions = pd.Series(lgbmr_clf.predict(test_df[feature_cols]), name="Rings")

In [22]:
# Put each test id next to its prediction and write the two columns to CSV
# without the DataFrame index.
result_ids = test_df["id"].iloc[:len(predictions)]
results = pd.concat([result_ids, predictions], axis=1)
results.to_csv("/kaggle/working/result.csv", index=False)