
---

# **Used Car Price Regression Problem 🏦📊**

---


---

# **Imports 📦🔧**

---

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.base import clone
import re

import optuna
from optuna.samplers import TPESampler

from sklearn.model_selection import *
from sklearn.preprocessing import *

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

from sklearn.metrics import *

pd.set_option('display.max_columns', None)
from IPython.display import clear_output
from tqdm import tqdm, trange
from tabulate import tabulate
import random
import time
import logging
from IPython.display import display
from IPython.display import display, HTML
from colorama import Fore
from datetime import datetime
from sklearn.ensemble import *


---

# **Load and Basic Preprocessing Data 📥📊**

---

In [2]:
%%time

sample_sub = pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
train = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')
Original = pd.read_csv('/kaggle/input/used-car-price-prediction-dataset/used_cars.csv')

Original[['milage', 'price']] = Original[['milage', 'price']].map(
    lambda x: int(''.join(re.findall(r'\d+', x))))

train = pd.concat([train, Original], ignore_index=True)


def update(df):
    
    t = 100
    
    df['accident'] = df['accident'].map({
        'None reported': 'not_reported',
        'At least 1 accident or damage reported': 'reported'
    })
    df['transmission'] = df['transmission'].str.replace('/', '').str.replace('-', '')
    df['transmission'] = df['transmission'].str.replace(' ', '_')
    
    cat_c = ['brand','model','fuel_type','engine','transmission','ext_col','int_col','accident','clean_title']
    re_ = ['model','engine','transmission','ext_col','int_col']
    
    for col in re_:
        df.loc[df[col].value_counts(dropna=False)[df[col]].values < t, col] = "noise"
        
    for col in cat_c:
        df[col] = df[col].fillna('missing')
        df[col] = df[col].astype('category')
        
    return df

train  = update(train)
test   = update(test)

CPU times: user 2.89 s, sys: 182 ms, total: 3.08 s
Wall time: 3.56 s


In [3]:
%%time

def feature(df):
    current_year = datetime.now().year

    df['Vehicle_Age'] = current_year - df['model_year']

    df['Mileage_per_Year'] = df['milage'] / df['Vehicle_Age']

    def extract_horsepower(engine):
        try:
            return float(engine.split('HP')[0])
        except:
            return None

    def extract_engine_size(engine):
        try:
            return float(engine.split(' ')[1].replace('L', ''))
        except:
            return None

    df['Horsepower'] = df['engine'].apply(extract_horsepower)
    df['Engine_Size'] = df['engine'].apply(extract_engine_size)
    df['Power_to_Weight_Ratio'] = df['Horsepower'] / df['Engine_Size']

    luxury_brands =  ['Mercedes-Benz', 'BMW', 'Audi', 'Porsche', 'Land', 
                    'Lexus', 'Jaguar', 'Bentley', 'Maserati', 'Lamborghini', 
                    'Rolls-Royce', 'Ferrari', 'McLaren', 'Aston', 'Maybach']
    df['Is_Luxury_Brand'] = df['brand'].apply(lambda x: 1 if x in luxury_brands else 0)

    df['Accident_Impact'] = df.apply(lambda x: 1 if x['accident'] == 1 and x['clean_title'] == 0 else 0, axis=1)
    
    return df

train = feature(train)
test = feature(test)

CPU times: user 4.25 s, sys: 78.7 ms, total: 4.33 s
Wall time: 4.33 s


In [4]:
train.drop_duplicates(inplace = True)

In [5]:
train.shape

(192542, 20)


---

# **Voting Regressor Modeling 🧩📉**

---

In [6]:
%%time

X = train.drop(['price'], axis=1)
y = train['price']
cat_features = X.select_dtypes(include=['category']).columns.tolist()

SEED = 601
n_splits = 5

def Train_ML(X, y, model, test, n_splits=n_splits):
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    rmse_scores = []
    test_preds = np.zeros((test.shape[0], n_splits)) 
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        val_predictions = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
        rmse_scores.append(rmse)
        
        test_preds[:, fold] = model.predict(test)
    
    avg_rmse = np.mean(rmse_scores)
    
    mean_test_preds = np.mean(test_preds, axis=1)
    
    header = f"\n{'Final Validation RMSE:':<25} {avg_rmse:.5f}\n"
    print(header)
    
    return mean_test_preds

lgb_params ={'learning_rate': 0.017521301504983752, 'max_depth': 42, 'reg_alpha': 0.06876635751774487, 
 'reg_lambda': 9.738899198284985, 'num_leaves': 131, 'subsample': 0.2683765421728044, 
 'colsample_bytree': 0.44346036599709887} 

params1 = {'learning_rate': 0.015387355282525047, 'num_leaves': 287, 'max_depth': 10, 'min_child_samples': 32,
 'subsample': 0.5678602068076838, 'colsample_bytree': 0.5254867750210618, 'reg_alpha': 8.515713311140541e-05,
 'reg_lambda': 9.929128235845939, 'scale_pos_weight': 1.031529653438031, 'max_bin': 2894,
 'min_split_gain': 8.135732868325528e-05, 'min_child_weight': 0.9684228603448732,'boosting_type': 'gbdt',
 'objective': 'regression','metric': 'rmse'}

params2 = {'learning_rate': 0.01383980028736371, 'num_leaves': 185, 'max_depth': 11,
'min_child_samples': 47, 'subsample': 0.5179868722209913, 'colsample_bytree': 0.5099485937874763,
'reg_alpha': 2.6213681351209853e-08, 'reg_lambda': 5.24590000227586, 'scale_pos_weight': 3.6179756319298417, 
'max_bin': 2910, 'min_split_gain': 0.007117579745695504, 'min_child_weight': 0.21739378651138622}

lgb1 = LGBMRegressor(**lgb_params, random_state=SEED, verbose=-1, n_estimators=200)
lgb2 = LGBMRegressor(**params1, random_state=SEED, verbose=-1, n_estimators=200)
lgb3 = LGBMRegressor(**params2, random_state=SEED, verbose=-1, n_estimators=200)


estimator1 = VotingRegressor([
    ('LGBM_Tunned', lgb1), 
    ('LGBM_Tunned_1', lgb2), 
    ('LGBM_Tunned_2', lgb3)
], weights=[0.2, 0.7, 0.1])

vmp = Train_ML(X, y, estimator1, test)


Final Validation RMSE:    72448.33999

CPU times: user 3min 51s, sys: 1.69 s, total: 3min 53s
Wall time: 3min 50s



---

# **Submission 📤✅**

---

In [None]:
sample_sub['price'] = vmp
sample_sub.to_csv("Submission_VR.csv", index=False)
sample_sub.head()