### **Regression with an Abalone Dataset**

**Dataset Description**
The dataset for this competition (both train and test) was generated from a deep learning model trained on the Abalone dataset. Feature distributions are close to, but not exactly the same as, the original. Feel free to use the original dataset as part of this competition, both to explore differences and to see whether incorporating the original in training improves model performance.

**Files**
- `train.csv` - the training dataset; `Rings` is the integer target
- `test.csv` - the test dataset; your objective is to predict the value of `Rings` for each row
- `sample_submission.csv` - a sample submission file in the correct format

Link to the competition : https://www.kaggle.com/competitions/playground-series-s4e4/overview

by [Sathya Narayanan](https://github.com/Sathyavrv)


#### Goal of the Competition

The goal of this competition is to predict the age of abalone from various physical measurements.

<p align=center>
<img src="https://www.kaggle.com/competitions/72489/images/header" width="80%" align="center"></p>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'playground-series-s4e4:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F72489%2F8096274%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240409%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240409T064016Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D60f7143d8ab54b0fb6ba9d88edad367afa462b91c1073f818a3187062baf209a659b174252136efed91fe0daaa8df75292205b119febdaaf83f0223c213401a3dccc6b39235fd975012c1b7fb1356d4b58bb173d15e3f98af0f2d4b2669a679d51fcea887b4bfaf71dc3172016dc240be4c0bb5545d7736118b1dff054c5f146467e716e1a424b94d0bc86ab530845bb269e730a1528b3678124b84cadc4217c56c47e2d00e958d383927d4d1b5fcea67eb1b0f738b52d301cd62326b477c727f9401ae4675a49be18f5a06685cfabfa154b586eac4629c2f58b443a7b8948dc8b30991f50f7c0baccddea4b6c6873eaaf36da17ae97afaf8eff731647cd6080,abalone-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F37691%2F57419%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240409%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240409T064016Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Db34acfd6d19f385a76a307988714a55b9f608df8a9441b5140be1093b14cd347dd4289f31da168ced30d013a183d66a12a11c7ef10689c78465afc253e092a61cb6ed528019ba819c3ed9ec92533dc3bfa5a910f89adc2d6954a549324dfa8321c75694e0d18cdf931e4b8f03f39435f597e950615932bc6de9101d626d1923fb1db1f91e618807974935b4993ed6a9e8416f2a6029f60e106e630f39d261f02c811b1227b78a9636d97119907ba6681a9fd8a1e489e8ece9c8ce29c26c24c4f5d218ec4be9c5585db756294fc7831fa6c1e285acf7f84bddda04bca810e1272a4f027c7ee0c168a7d4eae1a013b5c12693e844654227d13cb049fad30ac2f57'

# Locations that mimic Kaggle's directory layout in this environment.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

# Start from a clean input directory: unmount it (errors are discarded), remove
# whatever is left, then recreate the input and working directories.
!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Create ../input and ../working symlinks pointing at the directories above.
# An already-existing link is left untouched.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING
# into KAGGLE_INPUT_PATH/<directory> and unpack it there.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split once only: everything after the first ':' belongs to the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be absent; in that case only the byte counter is
            # shown and the progress bar stays empty.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Make sure every byte is on the file object and rewind it before
            # handing the same handle to the archive readers.
            tfile.flush()
            tfile.seek(0)
            # NOTE(review): extractall() trusts member paths inside the archive;
            # only use this with archives from a trusted source.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the `tarfile` module is
                # not shadowed for the next loop iteration.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from the
# modelling libraries; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
!pip install -U scikit-learn -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spopt 0.6.0 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.[0m[31m
[0m

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, mean_squared_error, root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import VotingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.base import copy
import pandas as pd
import numpy as np
import random
import optuna

In [None]:
# Notebook-wide switches.
# FIND_BEST_PARAMS gates the Optuna searches further down; RANDOM_SEED is
# shared by the CV splitters, the models and the global RNGs seeded here.
FIND_BEST_PARAMS = False
RANDOM_SEED = 42

# Seed both the standard-library and the NumPy global generators.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [None]:
# Competition training data (synthetic, per the dataset description above).
train   = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')
# Original Abalone dataset, appended to the training data in the next cell.
orginal = pd.read_csv('/kaggle/input/abalone-dataset/abalone.csv')
# Competition test data; `Rings` has to be predicted for every row.
test    = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')

In [None]:
# Drop the competition id, adopt the original dataset's column names
# (assigned by position) and stack both datasets into one training frame.
train = train.drop(['id'], axis = 1)
train.columns = orginal.columns
train = pd.concat([train, orginal], axis = 0, ignore_index=True)

y = train['Rings']
# The models are trained on log(1 + Rings), so RMSE on this transformed target
# corresponds to RMSLE on the raw ring counts.
y_log = np.log1p(y)
# Predictions are mapped back to ring counts at the end with the inverse:
# y = np.expm1(y_log)


train = train.drop(['Rings'], axis = 1)
train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24
1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32
2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005
3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25
4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975


In [None]:
# Keep the ids for the submission file, then give the remaining feature
# columns the same names (by position) as the training features.
test_id = test.loc[:, 'id']
test = test.drop(columns='id')
test.columns = train.columns
test.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,M,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005
1,M,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275
2,M,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405
3,M,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235
4,I,0.415,0.325,0.11,0.358,0.1575,0.067,0.105


In [None]:
encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')


def _one_hot_sex(frame, encoded):
    """Replace the 'Sex' column of `frame` with the one-hot columns in `encoded`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature frame that still contains the 'Sex' column.
    encoded : array-like
        Output of `encoder.fit_transform` / `encoder.transform` for
        `frame[['Sex']]`, one row per row of `frame`.

    Returns
    -------
    pandas.DataFrame
        `frame` without 'Sex', followed by one integer column per category
        in `encoder.categories_[0]`.
    """
    dummies = pd.DataFrame(encoded.astype('int'),
                           columns = encoder.categories_[0],
                           # Reuse the frame's index so the column-wise concat
                           # cannot misalign rows if the index is not 0..n-1.
                           index = frame.index)
    # Drop 'Sex' by name rather than assuming it is the first column.
    return pd.concat([frame.drop(columns = 'Sex'), dummies], axis = 1)


# Fit the categories on the training data only, then reuse them for the test data.
train = _one_hot_sex(train, encoder.fit_transform(train[['Sex']]))
test  = _one_hot_sex(test, encoder.transform(test[['Sex']]))

In [None]:
%%time


def objective(trial):
    """Optuna objective for CatBoost: mean 5-fold RMSE on the log target `y_log`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample depth, subsample, min_data_in_leaf,
        colsample_bylevel and learning_rate.

    Returns
    -------
    float
        RMSE averaged over all validation folds (lower is better).
    """

    params = {
        "verbose": False,
        "iterations": 1000,
        "loss_function":'RMSE',
        "random_state": RANDOM_SEED,
        "depth": trial.suggest_int("depth", 3, 15),
        "subsample": trial.suggest_float("subsample", 0.01, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 1.0),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
    }

    # Folds are stratified on the raw integer target `y`; the models are
    # fitted and scored on the log-transformed target `y_log`.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    scores = []
    for train_index, valid_index in cv.split(train, y):
        X_train, y_train = train.iloc[train_index], y_log.iloc[train_index]
        X_valid, y_valid = train.iloc[valid_index], y_log.iloc[valid_index]
        model = CatBoostRegressor(**params)

        # NOTE(review): the validation fold drives early stopping and is also
        # the fold that gets scored, so the reported RMSE may be optimistic.
        model.fit(X_train, y_train,
                  eval_set=(X_valid, y_valid),
                  early_stopping_rounds=100)

        y_pred = model.predict(X_valid)
        # Collect every fold's score; assigning to `scores` would keep only
        # the last fold and make the returned "mean" a single-fold value.
        scores.append(root_mean_squared_error(y_valid, y_pred))
    return np.mean(scores)


study = optuna.create_study(direction='minimize', study_name="optuna_catboost")
# The search itself only runs when explicitly enabled in the config cell.
if FIND_BEST_PARAMS:
    study.optimize(objective, n_trials=50)
    print(f"Best trial average RMSE: {study.best_value:.4f}")
    for key, value in study.best_params.items():
        print(f"{key}: {value}")

[I 2024-04-05 14:21:12,731] A new study created in memory with name: optuna_catboost


CPU times: user 1.8 ms, sys: 1.13 ms, total: 2.94 ms
Wall time: 2.21 ms


In [None]:
%%time


def objective(trial):
    """Optuna objective for LightGBM: mean 5-fold RMSE on the log target `y_log`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the tree, bagging, regularisation and
        learning-rate settings listed in `params`.

    Returns
    -------
    float
        RMSE averaged over all validation folds (lower is better).
    """

    # NOTE(review): both `min_data_in_leaf` and `min_child_samples` are tuned;
    # LightGBM documents them as aliases of one parameter - confirm which one
    # is meant to take effect.
    params = {
        'n_jobs':-1,
        "metric":'rmse',
        "verbosity": -1,
        "bagging_freq": 1,
        "boosting_type": "gbdt",
        "objective":'regression',
        'random_state':RANDOM_SEED,
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "n_estimators": trial.suggest_int('n_estimators', 400, 1000),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.01),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 60),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
    }

    # Folds are stratified on the raw integer target `y`; the models are
    # fitted and scored on the log-transformed target `y_log`.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    scores = []
    for train_index, valid_index in cv.split(train, y):
        X_train, y_train = train.iloc[train_index], y_log.iloc[train_index]
        X_valid, y_valid = train.iloc[valid_index], y_log.iloc[valid_index]
        model = LGBMRegressor(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)
        # Collect every fold's score; assigning to `scores` would keep only
        # the last fold and make the returned "mean" a single-fold value.
        scores.append(root_mean_squared_error(y_valid, y_pred))
    return np.mean(scores)


study = optuna.create_study(direction='minimize', study_name="optuna_lgbm")
# The search itself only runs when explicitly enabled in the config cell.
if FIND_BEST_PARAMS:
    study.optimize(objective, n_trials=50)
    print(f"Best trial average RMSE: {study.best_value:.4f}")
    for key, value in study.best_params.items():
        print(f"{key}: {value}")

[I 2024-04-05 14:21:12,748] A new study created in memory with name: optuna_lgbm


CPU times: user 1.31 ms, sys: 74 µs, total: 1.38 ms
Wall time: 1.02 ms


In [None]:
%%time


def objective(trial):
    """Optuna objective for XGBoost: mean 5-fold RMSE on the log target `y_log`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample gamma, max_depth, subsample, n_estimators,
        min_child_weight, colsample_bytree and learning_rate.

    Returns
    -------
    float
        RMSE averaged over all validation folds (lower is better).
    """

    params = {
        'eval_metric': 'rmse',
        'random_state': RANDOM_SEED,
        'objective': 'reg:squarederror',
        'gamma': trial.suggest_float("gamma", 1e-2, 1.0),
        'max_depth': trial.suggest_int('max_depth',2, 20),
        'subsample': trial.suggest_float("subsample", 0.05, 1.0),
        'n_estimators': trial.suggest_int('n_estimators',100, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight',2, 20),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.05, 1.0),
        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
    }

    # Folds are stratified on the raw integer target `y`; the models are
    # fitted and scored on the log-transformed target `y_log`.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    scores = []
    for train_index, valid_index in cv.split(train, y):
        X_train, y_train = train.iloc[train_index], y_log.iloc[train_index]
        X_valid, y_valid = train.iloc[valid_index], y_log.iloc[valid_index]
        model = XGBRegressor(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)
        # Collect every fold's score; assigning to `scores` would keep only
        # the last fold and make the returned "mean" a single-fold value.
        scores.append(root_mean_squared_error(y_valid, y_pred))
    return np.mean(scores)


study = optuna.create_study(direction='minimize', study_name="optuna_xgboost")
# The search itself only runs when explicitly enabled in the config cell.
if FIND_BEST_PARAMS:
    study.optimize(objective, n_trials=50)
    print(f"Best trial average RMSE: {study.best_value:.4f}")
    for key, value in study.best_params.items():
        print(f"{key}: {value}")

[I 2024-04-05 14:21:12,768] A new study created in memory with name: optuna_xgboost


CPU times: user 1.27 ms, sys: 67 µs, total: 1.34 ms
Wall time: 1.01 ms


In [None]:
# Fixed hyper-parameters for the three base models of the voting ensemble.
# The GPU-targeting entries ('device': 'cuda', 'task_type': 'GPU',
# 'device': 'gpu') presumably require a GPU runtime - TODO confirm.

# NOTE(review): this uses a squared-log-error objective and 'rmsle' metric,
# while the CV cells fit the models on the already log-transformed `y_log`;
# confirm the log transform is not applied twice.
xgboost_params = {
    'verbosity': 0,
    'max_depth': 10,
    'device': 'cuda',
    'booster': 'dart',
    'eval_metric': 'rmsle',
    'random_state':RANDOM_SEED,
    'lambda': 0.456836886068415,
    'alpha': 0.6422509164613671,
    'subsample': 0.8365423486036913,
    'objective': 'reg:squaredlogerror',
    'learning_rate': 0.09884907639400813,
    'colsample_bytree': 0.8111849113860014,
}

# CatBoost settings (RMSE loss and metric).
catboost_params = {
    'depth': 15,
    'max_bin': 464,
    'verbose': False,
    'task_type': 'GPU',
    'eval_metric': 'RMSE',
    'min_data_in_leaf': 78,
    'loss_function': 'RMSE',
    'grow_policy': 'Lossguide',
    'random_state':RANDOM_SEED,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.83862137638162,
    'l2_leaf_reg': 8.365422739510098,
    'random_strength': 3.296124856352495,
    'learning_rate': 0.09992185242598203,
}

# LightGBM settings (regression objective, RMSE metric).
lgbm_params = {
    'verbosity': -1,
    'device': 'gpu',
    'metric': 'rmse',
    'num_leaves': 176,
    'bagging_freq': 7,
    'boosting_type': 'gbdt',
    'min_child_samples': 91,
    'objective': 'regression',
    'random_state':RANDOM_SEED,
    'learning_rate': 0.07351805347801958,
    'bagging_fraction': 0.6502062728410578,
    'feature_fraction': 0.7058843944694884,
}

In [None]:
# Named base learners for the VotingRegressor, in the order the weight lists
# further down expect: lgbm, xgboost, catboost.
cv_estimators = [
    (label, factory(**kwargs))
    for label, factory, kwargs in (
        ('lgbm', LGBMRegressor, lgbm_params),
        ('xgboost', XGBRegressor, xgboost_params),
        ('catboost', CatBoostRegressor, catboost_params),
    )
]

In [None]:
%%time


def objective(trial):
    """Optuna objective for the ensemble weights: mean 5-fold RMSE on `y_log`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample one weight in [0, 5] per base model.

    Returns
    -------
    float
        RMSE of the weighted VotingRegressor averaged over all validation
        folds (lower is better).
    """

    params = {
        'lgbm_weight': trial.suggest_float('lgbm_weight', 0.0, 5.0),
        'xgboost_weight': trial.suggest_float('xgboost_weight', 0.0, 5.0),
        'catboost_weight': trial.suggest_float('catboost_weight', 0.0, 5.0),
    }

    # Folds are stratified on the raw integer target `y`; the ensemble is
    # fitted and scored on the log-transformed target `y_log`.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    scores = []
    for train_index, valid_index in cv.split(train, y):
        X_train, y_train = train.iloc[train_index], y_log.iloc[train_index]
        X_valid, y_valid = train.iloc[valid_index], y_log.iloc[valid_index]
        # Weight order must match the order of `cv_estimators`.
        voting_regressor = VotingRegressor(
            estimators=cv_estimators,
            weights=[params['lgbm_weight'], params['xgboost_weight'], params['catboost_weight']]
        )
        voting_regressor.fit(X_train, y_train)
        y_pred = voting_regressor.predict(X_valid)
        # Collect every fold's score; assigning to `scores` would keep only
        # the last fold and make the returned "mean" a single-fold value.
        scores.append(root_mean_squared_error(y_valid, y_pred))
    return np.mean(scores)


study = optuna.create_study(direction='minimize', study_name="voting_regressor_optuna")
# The search itself only runs when explicitly enabled in the config cell.
if FIND_BEST_PARAMS:
    study.optimize(objective, n_trials=100)
    print(f"Best trial average RMSE: {study.best_value:.4f}")
    for key, value in study.best_params.items():
        print(f"{key}: {value}")


[I 2024-04-05 14:21:12,809] A new study created in memory with name: voting_regressor_optuna


CPU times: user 1.28 ms, sys: 0 ns, total: 1.28 ms
Wall time: 1 ms


In [None]:
%%time

# Ensemble weights used for the final model, one per entry of `cv_estimators`.
weight_best_params = {
    'lgbm_weight': 3.0860711610688636,
    'xgboost_weight': 1.793424750707662,
    'catboost_weight': 4.59273791580418
}


voting_regressor = VotingRegressor(
    estimators=cv_estimators,
    weights=[ weight_best_params['lgbm_weight'],
              weight_best_params['xgboost_weight'],
              weight_best_params['catboost_weight']
    ]
)

# 5-fold CV: refit the ensemble on each training split, score it on the
# held-out split (RMSE on the log target) and predict the test set, so the
# test predictions can be averaged over folds afterwards.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
scores = []
y_pred_test = []
for fold_i, (train_index, valid_index) in enumerate(cv.split(train, y)):
    X_train, y_train = train.iloc[train_index], y_log.iloc[train_index]
    X_valid, y_valid = train.iloc[valid_index], y_log.iloc[valid_index]
    voting_regressor.fit(X_train, y_train)
    y_pred = voting_regressor.predict(X_valid)
    fold_rmse = root_mean_squared_error(y_valid, y_pred)
    # Keep every fold's score so the summary below is a real mean instead of
    # repeating the last fold's value.
    scores.append(fold_rmse)
    y_pred_test.append(voting_regressor.predict(test))
    print(f"FOLD {fold_i} Done. RMSE : {fold_rmse}")
print(f"All FOLD. Mean RMSE : {np.mean(scores)}")



FOLD 0 Done. RMSE : 0.14864280090415702
FOLD 1 Done. RMSE : 0.14720041945471446
FOLD 2 Done. RMSE : 0.14836659926410167
FOLD 3 Done. RMSE : 0.1494904817849813
FOLD 4 Done. RMSE : 0.1482838011215943
All FOLD. Mean RMSE : 0.1482838011215943
CPU times: user 2min 14s, sys: 15.8 s, total: 2min 30s
Wall time: 1min 28s


In [None]:
# Average the per-fold test predictions (still on the log scale).
predictions = np.mean(y_pred_test, axis=0)
# Map back to ring counts with expm1, the inverse of the log(1 + y) target
# transform, and write the submission file.
sub = pd.DataFrame({'id': test_id, 'Rings': np.expm1(predictions)})
sub.to_csv('submission.csv', index = False)