<a href="https://colab.research.google.com/github/Pathway2008/CarPrice/blob/main/Carprice_GRBT_Stacks_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('/content/drive/MyDrive/carprice/train.csv')
test = pd.read_csv('/content/drive/MyDrive/carprice/test.csv')
sub = pd.read_csv('/content/drive/MyDrive/carprice/sample_submission.csv')

In [3]:
smallest_values = train['생산년도'].nsmallest(2).index
train = train.drop(smallest_values)

In [4]:
largest_values = train['주행거리'].nlargest(3).index
train = train.drop(largest_values)

In [5]:
from scipy.stats import boxcox
columns_to_transform = ['주행거리', '배기량']
for column in columns_to_transform:
    transformed_data, lambda_value = boxcox(train[column])
    train[column] = transformed_data

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
ordinal_features = ['브랜드', '차량모델명', '판매도시', '판매구역']

for feature in ordinal_features:
    le = LabelEncoder()
    le = le.fit(train[feature])
    train[feature] = le.transform(train[feature])

    for label in np.unique(test[feature]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test[feature] = le.transform(test[feature])

In [8]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [None]:
!pip install optuna

In [11]:
from catboost import Pool
import optuna
from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [12]:
X = train.drop(['ID', '가격'], axis=1)
y = train['가격']

In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
train_pool = Pool(X_train, y_train)
valid_pool = Pool(X_valid, y_valid)

In [15]:
def objective(trial):
    params = {
        'loss_function': 'MAE',
        'eval_metric': 'MAE',
        'random_seed': 42,
        'task_type': 'GPU',
        'boosting_type': trial.suggest_categorical('boosting_type', ['Plain', 'Ordered']),
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 1e2),
        'random_strength': trial.suggest_loguniform('random_strength', 1e-5, 1e2),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'use_best_model': True,
        'early_stopping_rounds': 100,
        'verbose': False
    }

    # Train the CatBoost model with the current set of parameters
    model = CatBoostRegressor(**params)
    model.fit(train_pool, eval_set=valid_pool)

    # Make predictions on the validation set
    y_pred = model.predict(X_valid)

    # Calculate the mean absolute error (MAE)
    mae = mean_absolute_error(y_valid, y_pred)
    return mae

In [None]:
study = optuna.create_study(direction='minimize')
n_trials = 100
with tqdm(total=n_trials) as pbar:
    def update_pbar(_, __):
        pbar.update(1)

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials, callbacks=[update_pbar])

In [17]:
best_params = study.best_params
best_score = study.best_value
print("Best Parameters:", best_params)
print("Best MAE:", best_score)

Best Parameters: {'boosting_type': 'Plain', 'iterations': 1997, 'learning_rate': 0.08862372339654186, 'depth': 9, 'l2_leaf_reg': 2.1031379341857397, 'random_strength': 15.074833000342615, 'bagging_temperature': 0.30077839937910833, 'border_count': 250}
Best MAE: 5.85601717965721


In [None]:
final_model = CatBoostRegressor(**best_params)
final_model.fit(train_pool)

In [23]:
test_cat_optuna = test.drop(['ID'],axis=1)
pred = final_model.predict(test_cat_optuna)

In [26]:
sub['가격'] = pred
sub.to_csv('cat_opnuta+boxcox.csv', index = False)

GBRTmodel (no scaling)

In [None]:
!pip install ray

In [31]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import ray
from ray import tune
from tqdm import tqdm
import torch

In [12]:
X = train.drop(['ID', '가격'], axis=1)
y = train['가격']

In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
def objective(trial):
    # Define the search space for the hyperparameters
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt']),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0)
    }

    # Initialize the GBRT model with the suggested parameters
    model = GradientBoostingRegressor(**params)

    # Move the model to GPU
    device = torch.device('cuda')
    model.to(device)

    # Move the training and validation data to GPU
    X_train_tensor = torch.tensor(X_train.values, device=device, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, device=device, dtype=torch.float32)
    X_valid_tensor = torch.tensor(X_valid.values, device=device, dtype=torch.float32)

    # Fit the model to the training data
    model.fit(X_train_tensor, y_train_tensor)

    # Make predictions on the validation set
    y_pred_tensor = model.predict(X_valid_tensor)

    # Move the predictions back to CPU for evaluation
    y_pred = y_pred_tensor.detach().cpu().numpy()
    y_valid_cpu = y_valid.to_numpy()

    # Compute the mean absolute error
    mae = mean_absolute_error(y_valid_cpu, y_pred)

    return mae

In [26]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
#study = optuna.create_study(direction='minimize')
n_trials = 100
with tqdm(total=n_trials) as pbar:
    def update_pbar(_, __):
        pbar.update(1)

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials, callbacks=[update_pbar])

Stacked Regression
(xgb,lgb,cat)

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

In [10]:
X = train.drop(['ID', '가격'], axis=1)
y = train['가격']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
xgb_model = XGBRegressor(tree_method='gpu_hist', gpu_id=0)
lgbm_model = LGBMRegressor(device='gpu')
catboost_model = CatBoostRegressor(task_type ='GPU')

In [15]:
stacked_model = StackingRegressor(
    estimators=[('xgb', xgb_model), ('lgbm', lgbm_model), ('catboost', catboost_model)],
    final_estimator=XGBRegressor(tree_method='gpu_hist', gpu_id=0)  # Use XGBoost as the final estimator
)

In [None]:
!git clone --recursive https://github.com/Microsoft/LightGBM

In [23]:
%cd /content/LightGBM

/content/LightGBM


In [24]:
!mkdir build

In [None]:
!cmake -DUSE_GPU=1

In [None]:
!make -j$(nproc)
!sudo apt-get -y install python-pip
!sudo -H pip install setuptools pandas numpy scipy scikit-learn -U

In [None]:
%cd /content/LightGBM/python-package/
!sudo python setup.py install --precompile

In [36]:
!rm -r /opt/conda/lib/python3.6/site-packages/lightgbm
!git clone --recursive https://github.com/Microsoft/LightGBM

rm: cannot remove '/opt/conda/lib/python3.6/site-packages/lightgbm': No such file or directory
Cloning into 'LightGBM'...
remote: Enumerating objects: 29746, done.[K
remote: Counting objects: 100% (3477/3477), done.[K
remote: Compressing objects: 100% (370/370), done.[K
remote: Total 29746 (delta 3222), reused 3269 (delta 3106), pack-reused 26269[K
Receiving objects: 100% (29746/29746), 20.65 MiB | 15.30 MiB/s, done.
Resolving deltas: 100% (22107/22107), done.
Submodule 'include/boost/compute' (https://github.com/boostorg/compute) registered for path 'external_libs/compute'
Submodule 'eigen' (https://gitlab.com/libeigen/eigen.git) registered for path 'external_libs/eigen'
Submodule 'external_libs/fast_double_parser' (https://github.com/lemire/fast_double_parser.git) registered for path 'external_libs/fast_double_parser'
Submodule 'external_libs/fmt' (https://github.com/fmtlib/fmt.git) registered for path 'external_libs/fmt'
Cloning into '/content/LightGBM/python-package/LightGBM/ex

In [37]:
!apt-get install -y libboost-all-dev

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libboost-all-dev is already the newest version (1.71.0.0ubuntu2).
0 upgraded, 0 newly installed, 0 to remove and 46 not upgraded.


In [38]:
!apt-get update --fix-missing

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  Packages [1,012 kB]
Hit:4 http://archive.ubuntu.com/ubuntu focal InRelease
Hit:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease
Get:6 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Get:7 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Hit:8 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Hit:9 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease
Get:10 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [2,781 kB]
Get:11 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu focal InRelease
Get:13 http://archive.ubuntu.com/u

In [39]:
%%bash
cd LightGBM
#rm -r build
mkdir build
cd build
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so.1.0 -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
make -j$(nproc)

-- The C compiler identification is GNU 9.4.0
-- The CXX compiler identification is GNU 9.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found OpenMP_C: -fopenmp (found version "4.5") 
-- Found OpenMP_CXX: -fopenmp (found version "4.5") 
-- Found OpenMP: TRUE (found version "4.5")  
-- Looking for CL_VERSION_3_0
-- Looking for CL_VERSION_3_0 - found
-- Found OpenCL: /usr/local/cuda/lib64/libOpenCL.so.1.0 (found version "3.0") 
-- OpenCL include directory: /usr/local/cuda/include
-- Found Boost: /usr/lib/x86_64-linux-gnu/cmake/Boost-1.71.0/BoostConfig.cmake (found suitable version "1.71.0", minimum required is "1.56.0") foun

In [None]:
#/usr/local/cuda/lib64/libOpenCL.so.1.0

In [40]:
stacked_model.fit(X_train, y_train)

LightGBMError: ignored