In [91]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

sample_csv = pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
test_csv = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')
train_csv = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv')

In [92]:
train_csv.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [93]:
test_csv.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


In [94]:
sample_csv.head()

Unnamed: 0,id,price
0,188533,43878.016
1,188534,43878.016
2,188535,43878.016
3,188536,43878.016
4,188537,43878.016


Let's extract the interesting features in the *engine* column, such as *horsepower* and *engine_size*:

In [95]:
# Extract the columns
train_csv['horsepower'] = train_csv['engine'].str.extract(r'(\d+\.?\d*)HP')
train_csv['engine_size'] = train_csv['engine'].str.extract(r'(\d+\.?\d*)L')
test_csv['horsepower'] = test_csv['engine'].str.extract(r'(\d+\.?\d*)HP')
test_csv['engine_size'] = test_csv['engine'].str.extract(r'(\d+\.?\d*)L')
# Turn them into numerical
train_csv['horsepower'] = pd.to_numeric(train_csv['horsepower'], errors='coerce')
train_csv['engine_size'] = pd.to_numeric(train_csv['engine_size'], errors='coerce')
test_csv['horsepower'] = pd.to_numeric(test_csv['horsepower'], errors='coerce')
test_csv['engine_size'] = pd.to_numeric(test_csv['engine_size'], errors='coerce')
# Show missing
test_csv['engine_size'].isnull().sum(), test_csv['horsepower'].isnull().sum(), train_csv['engine_size'].isnull().sum(), train_csv['horsepower'].isnull().sum(), 

(9407, 22181, 14214, 33259)

Now, we'll drop the *engine* column and start preprocessing the data

In [96]:
train_csv.drop('engine', axis=1, inplace=True)
test_csv.drop('engine', axis=1, inplace=True)

In [97]:
train_csv.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price,horsepower,engine_size
0,0,MINI,Cooper S Base,2007,213000,Gasoline,A/T,Yellow,Gray,None reported,Yes,4200,172.0,1.6
1,1,Lincoln,LS V8,2002,143250,Gasoline,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999,252.0,3.9
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,A/T,Blue,Gray,None reported,Yes,13900,320.0,5.3
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000,420.0,5.0
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,7-Speed A/T,Black,Beige,None reported,Yes,97500,208.0,2.0


# Preprocess the data

In [98]:
train_csv.dtypes

id                int64
brand            object
model            object
model_year        int64
milage            int64
fuel_type        object
transmission     object
ext_col          object
int_col          object
accident         object
clean_title      object
price             int64
horsepower      float64
engine_size     float64
dtype: object

In [99]:
depVar = train_csv['price']
train_csv.drop(columns=['id', 'price'], inplace=True)
cat = []
cont = []
for col in train_csv.columns:
    if train_csv[col].dtype == object:
        cat.append(col)
    else:
        cont.append(col)

In [102]:
cont
cat

['brand',
 'model',
 'fuel_type',
 'transmission',
 'ext_col',
 'int_col',
 'accident',
 'clean_title']

We're going to first take care of the missing values, in the categorical columns they will be filled with the most common value, the mode of the column.

The missing values in the continuous columns they will be filled with the median value of the column.

In [104]:
# Fill missing with median in the numerical columns
for col in cont:
    train_csv[col] = train_csv[col].fillna(train_csv[col].median())
    test_csv[col] = test_csv[col].fillna(test_csv[col].median())
# Fill missing with mode in the numerical columns
for col in cat:
    train_mode = train_csv[col].mode()[0]
    train_csv[col] = train_csv[col].fillna(train_mode)
    test_mode = test_csv[col].mode()[0]
    test_csv[col] = test_csv[col].fillna(test_mode)

In [107]:
train_csv.isnull().sum().sum(), test_csv.isnull().sum().sum()

(0, 0)

Now we have to normalize our continuous columns and encode our categorical columns:

In [109]:
# Normalizing our continuous columns
from sklearn.preprocessing import StandardScaler

SS = StandardScaler()
train_csv[cont] = SS.fit_transform(train_csv[cont])
test_csv[cont] = SS.fit_transform(test_csv[cont])
    
# Encoding our categorical columns
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
for col in cat:
    train_csv[col] = train_csv[col].astype(str)
    train_csv[col] = LE.fit_transform(train_csv[col])
    test_csv[col] = test_csv[col].astype(str)
    test_csv[col] = LE.fit_transform(test_csv[col])
    
train_csv.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,horsepower,engine_size
0,31,495,-1.559808,2.957842,2,38,312,71,1,0,-1.618449,-1.671509
1,28,930,-2.443052,1.557184,2,38,263,10,0,0,-0.851173,0.089138
2,9,1575,-2.443052,1.426276,1,38,38,71,1,0,-0.198989,1.160837
3,16,758,0.206679,-0.927854,2,49,29,14,1,0,0.760106,0.931187
4,36,1077,0.913274,-1.171076,2,23,29,10,1,0,-1.273175,-1.365309


# Splitting the data into training and testing

In [112]:
from sklearn.model_selection import train_test_split

columns = cont + cat
X = train_csv[columns]
y = depVar
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=53)

In [113]:
# Hyper Parameter Optimization
params = {
    "learning_rate": [0.01, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30],  # Step size shrinkage to prevent overfitting
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],  # Maximum depth of a tree
    "min_child_weight": [1, 3, 5, 7],  # Minimum sum of instance weight (hessian) needed in a child
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],  # Minimum loss reduction required to make a further partition
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7, 0.8],  # Subsample ratio of columns when constructing each tree
    "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],  # Subsample ratio of the training instances
    "n_estimators": [100, 200, 300, 400, 500],  # Number of boosting rounds
    "reg_alpha": [0, 0.01, 0.05, 0.1, 1, 10],  # L1 regularization term on weights
    "reg_lambda": [0.01, 0.05, 0.1, 1, 10],  # L2 regularization term on weights
    "scale_pos_weight": [1, 1.5, 2, 2.5, 3],  # Controls the balance of positive and negative weights
    "colsample_bylevel": [0.6, 0.7, 0.8, 0.9, 1.0],  # Subsample ratio of columns for each split
}

In [114]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost

classifier = xgboost.XGBRegressor()
random_search = RandomizedSearchCV(classifier, param_distributions=params, n_iter=5, scoring= 'neg_mean_squared_error', n_jobs=-1, cv=5, verbose=3)

In [116]:
random_search.fit(X,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [117]:
random_search.best_estimator_

In [120]:
from xgboost import XGBRegressor

model = XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=0.8, colsample_bynode=None, colsample_bytree=0.4,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0.4, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.01, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
             max_leaves=None, min_child_weight=7, missing=np.nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=400,
             n_jobs=None, num_parallel_tree=None, random_state=None)
model.fit(X_train, y_train)

[CV 3/5] END colsample_bylevel=0.8, colsample_bytree=0.4, gamma=0.1, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, reg_alpha=0.05, reg_lambda=0.01, scale_pos_weight=2, subsample=1.0;, score=-5674795449.820 total time=   2.6s
[CV 3/5] END colsample_bylevel=1.0, colsample_bytree=0.7, gamma=0.5, learning_rate=0.15, max_depth=15, min_child_weight=7, n_estimators=300, reg_alpha=10, reg_lambda=0.05, scale_pos_weight=3, subsample=0.7;, score=-7286523085.067 total time=  34.1s
[CV 2/5] END colsample_bylevel=0.9, colsample_bytree=0.3, gamma=0.0, learning_rate=0.2, max_depth=10, min_child_weight=7, n_estimators=300, reg_alpha=0.1, reg_lambda=0.05, scale_pos_weight=1, subsample=1.0;, score=-6138091502.679 total time=  13.3s
[CV 4/5] END colsample_bylevel=0.9, colsample_bytree=0.3, gamma=0.0, learning_rate=0.2, max_depth=10, min_child_weight=7, n_estimators=300, reg_alpha=0.1, reg_lambda=0.05, scale_pos_weight=1, subsample=1.0;, score=-4749795895.130 total time=  13.5s
[CV 

In [121]:
from sklearn.metrics import mean_squared_error

valid_predictions = model.predict(X_valid)
# y_valid = y_valid.values
rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
rmse

68507.59133467078

In [125]:
test_cols = model.get_booster().feature_names
test_csv = test_csv[test_cols]

In [129]:
predictions = model.predict(test_csv)
test_csv = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')
ids = test_csv['id']
sub = pd.DataFrame({
    'id': ids,
    'price': predictions
})
sub.head()

Unnamed: 0,id,price
0,188533,16790.140625
1,188534,75520.5
2,188535,54763.359375
3,188536,29774.205078
4,188537,30184.748047


In [130]:
sub.to_csv('submission.csv', index=False)