# Exploring Models

## Import Dataset & Libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e9/sample_submission.csv
/kaggle/input/playground-series-s4e9/train.csv
/kaggle/input/playground-series-s4e9/test.csv
/kaggle/input/used-car-price-prediction-dataset/used_cars.csv


In [2]:
import warnings

warnings.filterwarnings(action='ignore') # ignore warnings

original_data_path = '/kaggle/input/used-car-price-prediction-dataset/used_cars.csv'
data_path = '/kaggle/input/playground-series-s4e9/'

original = pd.read_csv(original_data_path)
train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')
submission = pd.read_csv(data_path + 'sample_submission.csv')

In [3]:
import polars as pl
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import re
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from matplotlib.colors import LinearSegmentedColormap
import lightgbm as lgb
from xgboost import XGBRegressor
from lightgbm import log_evaluation, early_stopping
from catboost import CatBoostRegressor, Pool

## Feature Engineering

In [4]:
original.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Ford,Utility Police Interceptor Base,2013,"51,000 mi.",E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,"$10,300"
1,Hyundai,Palisade SEL,2021,"34,742 mi.",Gasoline,3.8L V6 24V GDI DOHC,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,"$38,005"
2,Lexus,RX 350 RX 350,2022,"22,372 mi.",Gasoline,3.5 Liter DOHC,Automatic,Blue,Black,None reported,,"$54,598"
3,INFINITI,Q50 Hybrid Sport,2015,"88,900 mi.",Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,None reported,Yes,"$15,500"
4,Audi,Q3 45 S line Premium Plus,2021,"9,835 mi.",Gasoline,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,Glacier White Metallic,Black,None reported,,"$34,999"


In [5]:
# Only extract the numeric value from 'milage', 'price' columns
original[['milage', 'price']] = original[['milage', 'price']].applymap(
    lambda x: int(''.join(re.findall(r'\d+', x))))

In [6]:
original.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Ford,Utility Police Interceptor Base,2013,51000,E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,10300
1,Hyundai,Palisade SEL,2021,34742,Gasoline,3.8L V6 24V GDI DOHC,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,38005
2,Lexus,RX 350 RX 350,2022,22372,Gasoline,3.5 Liter DOHC,Automatic,Blue,Black,None reported,,54598
3,INFINITI,Q50 Hybrid Sport,2015,88900,Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,None reported,Yes,15500
4,Audi,Q3 45 S line Premium Plus,2021,9835,Gasoline,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,Glacier White Metallic,Black,None reported,,34999


In [7]:
train.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [8]:
test.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


In [9]:
# drop id from both train & test set
train.drop(columns=['id'], inplace=True)
test.drop(columns=['id'], inplace=True)

# merge train & original data set
train = pd.concat([train, original], ignore_index=True)

In [10]:
# resumetable to check data types
def resumetable(df):
    print(f'data set shape: {df.shape}')
    summary = pd.DataFrame(df.dtypes, columns = ['data type'])
    summary = summary.reset_index()
    summary = summary.rename(columns = {'index': 'feature'})
    summary['missing value count'] = df.isnull().sum().values
    summary['unique value count'] = df.nunique().values
    summary['first value'] = df.loc[0].values
    summary['second value'] = df.loc[1].values
    
    return summary

resumetable(train)

data set shape: (192542, 12)


Unnamed: 0,feature,data type,missing value count,unique value count,first value,second value
0,brand,object,0,57,MINI,Lincoln
1,model,object,0,1898,Cooper S Base,LS V8
2,model_year,int64,0,34,2007,2002
3,milage,int64,0,6652,213000,143250
4,fuel_type,object,5253,7,Gasoline,Gasoline
5,engine,object,0,1146,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel
6,transmission,object,0,62,A/T,A/T
7,ext_col,object,0,319,Yellow,Silver
8,int_col,object,0,156,Gray,Beige
9,accident,object,2565,2,None reported,At least 1 accident or damage reported


In [11]:
cat_cols = train.select_dtypes(include=['object', 'category']).columns.tolist()

In [12]:
# Organize categorical data: only use categories included in training data / other: NaN
# Transition in datatype: categorical features [type: object, category] -> CategoricalDtype
# Make both training & test data use same range of data

cat_features = test.select_dtypes('object').columns
# Reference: https://www.kaggle.com/code/ambrosm/pss4e8-eda-which-makes-sense
for feature in cat_features:          
    categories = sorted(list(set(train[feature].dropna())))
    dtype = pd.CategoricalDtype(categories=categories, ordered=False)
    
    train.loc[~train[feature].isin(categories), feature] = np.nan
    test.loc[~test[feature].isin(categories), feature] = np.nan
    
    train[feature] = train[feature].astype(dtype)
    test[feature] = test[feature].astype(dtype)

In [13]:
# Extract age features 
def extract_age_features(df):
    cur_year = 2024

    df['vehicle_age'] = cur_year - df['model_year']
    df['vehicle_age'] = df['vehicle_age'].replace(0, 1)  # if vehicle_age is 0 -> replace w/ 1
    
    df['mileage_per_year'] = df['milage'] / df['vehicle_age']
    df['milage_with_age'] =  df.groupby('vehicle_age')['milage'].transform('mean')
    df['mileage_per_year_with_age'] =  df.groupby('vehicle_age')['mileage_per_year'].transform('mean')

    return df

In [14]:
# Extract engine features
def extract_engine_features(df):
    
    def extract_horsepower(engine):
        try:
            return float(engine.split('HP')[0])
        except:
            return None

    def extract_engine_size(engine):
        try:
            return float(engine.split(' ')[1].replace('L', ''))
        except:
            return None

    df['horsepower'] = df['engine'].apply(extract_horsepower)
    df['engine_size'] = df['engine'].apply(extract_engine_size)
    df['power_to_weight_ratio'] = df['horsepower'] / df['engine_size']
    
    return df

In [15]:
# Extract brand value features 
def extract_other_features(df):
    
    luxury_brands =  ['Mercedes-Benz', 'BMW', 'Audi', 'Porsche', 'Land', 
                    'Lexus', 'Jaguar', 'Bentley', 'Maserati', 'Lamborghini', 
                    'Rolls-Royce', 'Ferrari', 'McLaren', 'Aston', 'Maybach']
    df['is_luxury_brand'] = df['brand'].apply(lambda x: 1 if x in luxury_brands else 0)

    return df

In [16]:
train = extract_age_features(train)
test = extract_age_features(test)

train = extract_engine_features(train)
test = extract_engine_features(test)

train = extract_other_features(train)
test = extract_other_features(test)

In [17]:
# deal with nan and inf values
train.replace([np.inf, -np.inf], np.nan, inplace=True)
test.replace([np.inf, -np.inf], np.nan, inplace=True)

train_numeric_cols = train.select_dtypes(include=['number']).columns
test_numeric_cols = test.select_dtypes(include=['number']).columns
train[train_numeric_cols] = train[train_numeric_cols].fillna(train[train_numeric_cols].median())
test[test_numeric_cols] = test[test_numeric_cols].fillna(test[test_numeric_cols].median())

In [18]:
train.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price,vehicle_age,mileage_per_year,milage_with_age,mileage_per_year_with_age,horsepower,engine_size,power_to_weight_ratio,is_luxury_brand
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200,17,12529.411765,118194.99236,6952.646609,172.0,1.6,107.5,0
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999,22,6511.363636,118270.783105,5375.944687,252.0,3.9,64.615385,0
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900,22,6215.045455,118270.783105,5375.944687,320.0,5.3,60.377358,0
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000,7,2785.714286,67981.161419,9711.594488,420.0,5.0,84.0,0
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500,3,2462.666667,29327.437406,9775.812469,208.0,2.0,104.0,1


In [19]:
test.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,vehicle_age,mileage_per_year,milage_with_age,mileage_per_year_with_age,horsepower,engine_size,power_to_weight_ratio,is_luxury_brand
0,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes,9,10888.888889,81078.503981,9008.722665,240.0,2.0,120.0,1
1,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes,4,2285.5,34258.886442,8564.721611,395.0,3.0,131.666667,1
2,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,,2,14060.5,17877.043403,8938.521702,328.0,3.5,87.428571,0
3,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,,8,7657.25,75999.679762,9499.95997,328.0,3.5,87.428571,1
4,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes,6,9833.333333,52105.532436,8684.255406,252.0,2.0,126.0,1


## XBoost

In [20]:
X_train = train.drop(columns=["price"])
y_train = train["price"]

X_test = test

In [21]:
#I have used 5 splits here.
def cross_validate_model_x(model, X_train, y_train, params, n_splits=5):

    # Initialize variables
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    val_scores = []
    test_preds = np.zeros((len(X_test), n_splits), dtype=np.float32)
    # Cross-validation loop
    for fold, (train_ind, valid_ind) in enumerate(cv.split(X_train)):
        # Data splitting
        X_fold_train = X_train.iloc[train_ind]
        y_fold_train = y_train.iloc[train_ind]
        X_val = X_train.iloc[valid_ind]
        y_val = y_train.iloc[valid_ind]
        
        # Model initialization and training
        clf = model(**params, enable_categorical=True)
        #clf.fit(X_fold_train, y_fold_train)
        clf.fit(X_fold_train, y_fold_train,  eval_set=[(X_val, y_val)],verbose=500, early_stopping_rounds=50)
        # Predict and evaluate
        test_preds[:, fold] = clf.predict(X_test)

        print("-" * 50)
        print(test_preds)

    test_preds= np.mean(test_preds, axis=1)
    return clf,test_preds

In [22]:
xgb_params={
    
'lambda': 0.17694956261235095,
 'alpha': 1.3889763984339085,
 'colsample_bytree': 0.7,
 'subsample': 1.0,
 'learning_rate': 0.008,
 'max_depth': 17,
 'random_state': 2020,
 'min_child_weight': 59,
 'n_estimators': 10000,
 'tree_method': 'gpu_hist'
 
}


print('XGBoost Cross-Validation Results:\n')
xgb_model, test_predsx = cross_validate_model_x(XGBRegressor, X_train, y_train, xgb_params)

XGBoost Cross-Validation Results:

[0]	validation_0-rmse:80865.47046
[377]	validation_0-rmse:75252.45276
--------------------------------------------------
[[19089.879     0.        0.        0.        0.   ]
 [77349.42      0.        0.        0.        0.   ]
 [53359.734     0.        0.        0.        0.   ]
 ...
 [20146.008     0.        0.        0.        0.   ]
 [15939.037     0.        0.        0.        0.   ]
 [42651.773     0.        0.        0.        0.   ]]
[0]	validation_0-rmse:74344.82508
[358]	validation_0-rmse:68001.30118
--------------------------------------------------
[[19089.879 19300.047     0.        0.        0.   ]
 [77349.42  81195.484     0.        0.        0.   ]
 [53359.734 51738.125     0.        0.        0.   ]
 ...
 [20146.008 21380.014     0.        0.        0.   ]
 [15939.037 16676.09      0.        0.        0.   ]
 [42651.773 36773.89      0.        0.        0.   ]]
[0]	validation_0-rmse:75774.08585
[362]	validation_0-rmse:69544.35026
-----

In [23]:
xgb_result =  pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
xgb_result['price'] = test_predsx.astype(np.float32)
xgb_result

Unnamed: 0,id,price
0,188533,19140.791016
1,188534,79952.500000
2,188535,52745.648438
3,188536,32035.630859
4,188537,29509.990234
...,...,...
125685,314218,29406.005859
125686,314219,54205.335938
125687,314220,20702.990234
125688,314221,16224.470703


## LGBM

In [24]:
original_data_path = '/kaggle/input/used-car-price-prediction-dataset/used_cars.csv'
data_path = '/kaggle/input/playground-series-s4e9/'

original = pd.read_csv(original_data_path)
train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')

# extract numeric value from 'milage', 'price' cols
original[['milage', 'price']] = original[['milage', 'price']].map(
    lambda x: int(''.join(re.findall(r'\d+', x))))

# drop id from both train & test set
train.drop(columns=['id'], inplace=True)
test.drop(columns=['id'], inplace=True)

# merge train & original dataset
train = pd.concat([train, original], ignore_index=True)

In [25]:
train = extract_age_features(train)
test = extract_age_features(test)

train = extract_engine_features(train)
test = extract_engine_features(test)

train = extract_other_features(train)
test = extract_other_features(test)

In [26]:
def update(df):
    
    t = 100
    
    cat_c = ['brand','model','fuel_type','engine','transmission','ext_col','int_col','accident','clean_title']
    re_ = ['model','engine','transmission','ext_col','int_col']
    
    for col in re_:
        df.loc[df[col].value_counts(dropna=False)[df[col]].values < t, col] = "noise"
        
    for col in cat_c:
        df[col] = df[col].fillna('missing')
        df[col] = df[col].astype('category')
        
    return df

train  = update(train)
test   = update(test)

In [27]:
X_train = train.drop(columns=["price"])
y_train = train["price"]

X_test = test

In [28]:
def cross_validate_model_l(model, X_train, y_train, params, n_splits=10):

    # Initialize variables
    callbacks = [log_evaluation(period=150), early_stopping(stopping_rounds=200)]
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    val_scores = []
    test_preds = np.zeros((len(X_test), n_splits), dtype=np.float32)
    # Cross-validation loop
    for fold, (train_ind, valid_ind) in enumerate(cv.split(X_train)):
        # Data splitting
        X_fold_train = X_train.iloc[train_ind]
        y_fold_train = y_train.iloc[train_ind]
        X_val = X_train.iloc[valid_ind]
        y_val = y_train.iloc[valid_ind]
        
        # Model initialization and training
        clf = model(**params)
        #clf.fit(X_fold_train, y_fold_train)
        clf.fit(X_fold_train, y_fold_train, eval_set=[(X_val, y_val)], callbacks=callbacks )
        # Predict and evaluate
        test_preds[:, fold] = clf.predict(X_test)

        print("-" * 50)
        print(test_preds)

    test_preds= np.mean(test_preds, axis=1)
    return clf,test_preds

In [29]:
from lightgbm import LGBMRegressor
# params are taken from this notebook: https://www.kaggle.com/code/noodl35/optuna-lgbm-tuning-used-cars
lgb_params = {
                'num_leaves': 426,
                 'max_depth': 20,
                 'learning_rate': 0.011353178352988012,
                 'n_estimators': 10000,
                 'metric': 'rmse',
                 'subsample': 0.5772552201954328,
                 'colsample_bytree': 0.9164865430101521,
                 'reg_alpha': 1.48699088003429e-06,
                 'reg_lambda': 0.41539458543414265,
                 'min_data_in_leaf': 73,
                 'feature_fraction': 0.751673655170548,
                 'bagging_fraction': 0.5120415391590843,
                 'bagging_freq': 2,
                 'random_state': 42,
                 'min_child_weight': 0.017236362383443497,
                 'cat_smooth': 54.81317407769262,
                 'verbose' : -1             # Set to -1 for silent mode, no process information printed
}

print('LightGBM Cross-Validation Results:\n')
lgb_model,test_predsl = cross_validate_model_l(LGBMRegressor, X_train, y_train, lgb_params)

LightGBM Cross-Validation Results:

Training until validation scores don't improve for 200 rounds
[150]	valid_0's rmse: 72073.1
[300]	valid_0's rmse: 71888.4
[450]	valid_0's rmse: 71989.4
Early stopping, best iteration is:
[289]	valid_0's rmse: 71881.4
--------------------------------------------------
[[17752.193     0.        0.    ...     0.        0.        0.   ]
 [73139.2       0.        0.    ...     0.        0.        0.   ]
 [52553.758     0.        0.    ...     0.        0.        0.   ]
 ...
 [20882.643     0.        0.    ...     0.        0.        0.   ]
 [16230.833     0.        0.    ...     0.        0.        0.   ]
 [40682.31      0.        0.    ...     0.        0.        0.   ]]
Training until validation scores don't improve for 200 rounds
[150]	valid_0's rmse: 78339.9
[300]	valid_0's rmse: 78181.1
[450]	valid_0's rmse: 78282.8
Early stopping, best iteration is:
[260]	valid_0's rmse: 78152.3
--------------------------------------------------
[[17752.193 18146.96

In [30]:
lgb_result =  submission
lgb_result['price'] = test_predsl.astype(np.float32)
lgb_result

Unnamed: 0,id,price
0,188533,18186.298828
1,188534,77317.117188
2,188535,52652.062500
3,188536,31312.871094
4,188537,29945.447266
...,...,...
125685,314218,29895.275391
125686,314219,52605.195312
125687,314220,21123.800781
125688,314221,16270.471680


## CatBoost

In [31]:
#I have used 5 splits here.
def cross_validate_model_c(model, X_train, y_train, params, n_splits=10):

    # Initialize variables
    
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    val_scores = []
    test_preds = np.zeros((len(X_test), n_splits), dtype=np.float32)
    # Cross-validation loop
    for fold, (train_ind, valid_ind) in enumerate(cv.split(X_train)):
        # Data splitting
        X_fold_train = X_train.iloc[train_ind]
        y_fold_train = y_train.iloc[train_ind]
        X_val = X_train.iloc[valid_ind]
        y_val = y_train.iloc[valid_ind]
        
        # Model initialization and training
        clf = model(**params)
        #clf.fit(X_fold_train, y_fold_train)
        clf.fit(X_fold_train, y_fold_train, eval_set=[(X_val, y_val)], verbose=0)
        # Predict and evaluate
        test_preds[:, fold] = clf.predict(X_test)

        print("-" * 50)
        print(test_preds)

    test_preds= np.mean(test_preds, axis=1)
    return clf,test_preds

In [32]:
from catboost import CatBoostClassifier

cat_params = {
    'cat_features':cat_cols,  
    'learning_rate': 0.075,
    'iterations': 5000,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 0.5,
    'task_type': 'GPU',
    'max_leaves': 512,
    'fold_permutation_block': 64,
    'random_seed': 42,
    'verbose': False                      
}

print('CatBoost Cross-Validation Results:\n')
cat_model, test_predsc = cross_validate_model_c(CatBoostRegressor, X_train, y_train, cat_params)

CatBoost Cross-Validation Results:

--------------------------------------------------
[[16376.514     0.        0.    ...     0.        0.        0.   ]
 [79526.18      0.        0.    ...     0.        0.        0.   ]
 [55137.812     0.        0.    ...     0.        0.        0.   ]
 ...
 [22112.162     0.        0.    ...     0.        0.        0.   ]
 [18403.898     0.        0.    ...     0.        0.        0.   ]
 [36032.82      0.        0.    ...     0.        0.        0.   ]]
--------------------------------------------------
[[16376.514 17223.428     0.    ...     0.        0.        0.   ]
 [79526.18  87403.98      0.    ...     0.        0.        0.   ]
 [55137.812 56132.926     0.    ...     0.        0.        0.   ]
 ...
 [22112.162 21617.834     0.    ...     0.        0.        0.   ]
 [18403.898 16202.609     0.    ...     0.        0.        0.   ]
 [36032.82  33078.04      0.    ...     0.        0.        0.   ]]
----------------------------------------------

In [33]:
cat_result =  submission
cat_result['price'] = test_predsc.astype(np.float32)
cat_result

Unnamed: 0,id,price
0,188533,16966.285156
1,188534,79927.835938
2,188535,56529.492188
3,188536,28888.074219
4,188537,30327.207031
...,...,...
125685,314218,28342.390625
125686,314219,56997.695312
125687,314220,21697.664062
125688,314221,17215.021484


## Ensemble XGBoost, LGBM, CatBoost

In [34]:
Pred = pd.concat([lgb_result,cat_result,xgb_result], axis=1)['price']
print(Pred)
test_preds = test_predsl * 0.7 + test_predsc * 0.1 + test_predsx*0.2

               price         price         price
0       16966.285156  16966.285156  19140.791016
1       79927.835938  79927.835938  79952.500000
2       56529.492188  56529.492188  52745.648438
3       28888.074219  28888.074219  32035.630859
4       30327.207031  30327.207031  29509.990234
...              ...           ...           ...
125685  28342.390625  28342.390625  29406.005859
125686  56997.695312  56997.695312  54205.335938
125687  21697.664062  21697.664062  20702.990234
125688  17215.021484  17215.021484  16224.470703
125689  34824.242188  34824.242188  40222.632812

[125690 rows x 3 columns]


In [35]:
ensemble_sub =  submission
ensemble_sub['price'] = test_preds

ensemble_sub.to_csv('submission.csv', index=False)