In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.compose import ColumnTransformer
import optuna
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
# Show every column/row and never truncate cell text when displaying frames.
# `reset_option` only restores an option's default and does not accept a value,
# so the "no limit" settings for width and column width must go through
# `set_option` as well.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

> Import data

In [2]:
# Read, in order: the competition training set, the separate used-cars CSV,
# and the competition test set.
df, df_used, test = map(
    pd.read_csv,
    (
        '/kaggle/input/playground-series-s4e9/train.csv',
        '/kaggle/input/used-cars/used_cars.csv',
        '/kaggle/input/playground-series-s4e9/test.csv',
    ),
)

> The training and test data for this competition were generated by a deep learning model. Combining them with the original dataset (linked in the competition's "Data Description" section) can improve the model, so the original rows are appended to the training set below.

In [3]:
# Stack the extra rows under the training rows and renumber the index 0..n-1.
# NOTE: this rebinds `df`, so running the cell twice appends `df_used` twice;
# re-run the loading cell first if this cell has to be repeated.
df = pd.concat(objs=[df, df_used], axis=0, ignore_index=True)

In [4]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0.0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1.0,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2.0,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3.0,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4.0,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [5]:
# (number of rows, number of columns) of the training frame.
df.shape

(192542, 13)

In [6]:
# Number of missing values in each column.
df.isna().sum()

id               4009
brand               0
model               0
model_year          0
milage              0
fuel_type        5253
engine              0
transmission        0
ext_col             0
int_col             0
accident         2565
clean_title     22015
price               0
dtype: int64

In [7]:
def fill_missing_values(df):
    """Return a copy of `df` with missing categories filled and text columns categorised.

    Missing entries in 'fuel_type', 'accident' and 'clean_title' are replaced
    by the literal label 'Missing'; afterwards every object-dtype column is
    cast to the pandas 'category' dtype.

    The input frame is left untouched, so the cell that calls this function
    can be re-run safely and earlier displays of the raw frame stay accurate.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the three columns listed above.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and row order as `df`.

    Raises
    ------
    KeyError
        If one of the three required columns is absent.
    """
    filled = df.copy()

    for column in ('fuel_type', 'accident', 'clean_title'):
        filled[column] = filled[column].fillna('Missing')

    # Convert object data types to category
    object_columns = filled.select_dtypes('object').columns
    filled = filled.astype({column: 'category' for column in object_columns})

    return filled

In [8]:
# Parse the numeric columns BEFORE calling fill_missing_values: that function
# casts every object-dtype column to 'category', so any of these columns that
# is still stored as text would otherwise become categorical first.
# Each value is reduced to its digits and parsed as an int; this presumably
# targets formatted text such as thousands separators or units (TODO confirm
# against the raw files).
for numeric_column in ('milage', 'price'):
    df[numeric_column] = df[numeric_column].map(
        lambda x: int(''.join(re.findall(r'\d+', str(x))))
    )

df = fill_missing_values(df)
test = fill_missing_values(test)

# Features and target for training.
X = df.drop(columns=['price'])
Y = df['price']


In [9]:
# Columns that are one-hot encoded.
cat_columns = [
    'brand',
    'model',
    'fuel_type',
    'engine',
    'transmission',
    'ext_col',
    'int_col',
    'accident',
    'clean_title',
]

# Columns that are standardised.
num_columns = [
    'model_year',
    'milage',
]

In [10]:
# One-hot encode the categorical columns and standardise the numeric ones.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not present when the encoder was fitted.
transformer = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns),
    ('num', StandardScaler(), num_columns),
])

# Fit from `df` instead of from `X`: `X` is rebound to the encoded matrix
# here, so fitting from `X` itself would make this cell fail when re-run.
X = transformer.fit_transform(df.drop(columns=['price']))
X_test = transformer.transform(test)

In [11]:
def objective(trial):
    """Optuna objective: cross-validated RMSE of an XGBRegressor.

    Samples one hyperparameter configuration from `trial`, evaluates it with
    5-fold shuffled cross-validation on the module-level `X` (encoded
    features) and `Y` (target), and returns the square root of the mean of
    the per-fold mean squared errors. Lower is better.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameters.

    Returns
    -------
    float
        sqrt(mean(fold MSEs)).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 2, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 50),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 2, log=True),
        'random_state': 42
    }

    xgb_model = XGBRegressor(**params)

    # KFold yields row POSITIONS. Indexing a pandas Series with `Y[positions]`
    # is label-based and is only correct while the index is exactly 0..n-1,
    # so take a plain array and index it positionally instead.
    y_values = np.asarray(Y)

    fold_mse = []

    kfold = KFold(n_splits=5, random_state=42, shuffle=True)
    for train_index, cv_index in kfold.split(X):
        xgb_model.fit(X[train_index], y_values[train_index])

        yhat = xgb_model.predict(X[cv_index])
        fold_mse.append(mean_squared_error(y_values[cv_index], yhat))

    overall_rmse = np.sqrt(np.mean(fold_mse))
    return overall_rmse


In [None]:
# Tune, refit, predict.
# The model built inside objective() is local to that function, so a final
# model has to be trained here before anything can be predicted.
N_TRIALS = 30  # tuning budget; each trial runs a full 5-fold CV, raise with care

study = optuna.create_study(
    direction='minimize',                        # objective() returns an RMSE
    sampler=optuna.samplers.TPESampler(seed=42)  # reproducible search
)
study.optimize(objective, n_trials=N_TRIALS)

# best_params holds only the sampled values; objective() also fixes
# random_state=42, so it is passed again here.
xgb_model = XGBRegressor(**study.best_params, random_state=42)
xgb_model.fit(X, Y)

predictions = xgb_model.predict(X_test)

# Submission file: one predicted price per test id.
result = pd.DataFrame({
    'id': test['id'],
    'price': predictions
})
result.to_csv('predictions.csv', index=False)