In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GroupShuffleSplit

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks

In [2]:
data = pd.read_csv('./predict-supercars-prices-2025/supercars_train.csv', index_col='id')
data.head()

Unnamed: 0_level_0,year,brand,color,carbon_fiber_body,engine_config,horsepower,torque,weight_kg,zero_to_60_s,top_speed_mph,...,has_warranty,last_service_date,service_history,non_original_parts,model,warranty_years,damage,damage_cost,damage_type,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
wn8zA4ADUC,2023,McLaren,Silver,1,Hybrid,1045,794,1897,3.63,227,...,0,2024-08-15,authorized,0,600LT,0,1,83632.0,major,329510.72
m5EyycSRrS,2024,Aston Martin,Black,1,W16,879,510,2193,2.72,247,...,0,2025-03-29,none,1,Valhalla,0,0,,,509289.91
IOcII96Ua3,2022,Koenigsegg,White,1,V12,609,489,1277,3.16,236,...,0,2025-06-30,authorized,0,Jesko,0,1,51179.0,major,2164428.25
qlds6yyR3r,2024,Bugatti,Blue,0,V12,1044,1162,1733,2.15,248,...,1,2024-08-29,none,0,Chiron,1,0,,,2793655.4
fRR6ulIWvU,2024,Pagani,White,0,V12,853,679,1307,2.35,247,...,0,2025-02-21,authorized,0,Zonda,0,1,77100.0,major,2927390.0


In [3]:
X = data.copy()
y = X.pop('price')

In [4]:
X.columns

Index(['year', 'brand', 'color', 'carbon_fiber_body', 'engine_config',
       'horsepower', 'torque', 'weight_kg', 'zero_to_60_s', 'top_speed_mph',
       'num_doors', 'transmission', 'drivetrain', 'market_region', 'mileage',
       'num_owners', 'interior_material', 'brake_type', 'tire_brand',
       'aero_package', 'limited_edition', 'has_warranty', 'last_service_date',
       'service_history', 'non_original_parts', 'model', 'warranty_years',
       'damage', 'damage_cost', 'damage_type'],
      dtype='object')

In [5]:
y.head()

id
wn8zA4ADUC     329510.72
m5EyycSRrS     509289.91
IOcII96Ua3    2164428.25
qlds6yyR3r    2793655.40
fRR6ulIWvU    2927390.00
Name: price, dtype: float64

In [6]:
categorical_cols = [cols for cols in X.columns
                    if X[cols].dtype == 'object'
                    and X[cols].nunique() < 10]
numerical = [cols for cols in X.columns
             if X[cols].dtype in ['int64','float64']]

In [7]:
print("categorical columns is ", categorical_cols)
print(len(categorical_cols))
print("numerical cols is a ", numerical)
print(len(numerical))

categorical columns is  ['brand', 'color', 'engine_config', 'transmission', 'drivetrain', 'market_region', 'interior_material', 'brake_type', 'tire_brand', 'service_history', 'damage_type']
11
numerical cols is a  ['year', 'carbon_fiber_body', 'horsepower', 'torque', 'weight_kg', 'zero_to_60_s', 'top_speed_mph', 'num_doors', 'mileage', 'num_owners', 'aero_package', 'limited_edition', 'has_warranty', 'non_original_parts', 'warranty_years', 'damage', 'damage_cost']
17


In [8]:
preprocessor = make_column_transformer(
    (StandardScaler(), numerical),
    (OneHotEncoder(),categorical_cols),
)

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_valid,y_train,y_valid = train_test_split(X,y,random_state=0)

X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)
y_train = y_train/100
y_valid = y_valid/100

In [11]:
input_shape = [X_train.shape[1]]
print("inpue shape: {}".format(input_shape))

inpue shape: [65]


In [12]:
# ===============================
# 1. Import Libraries
# ===============================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from tensorflow import keras
from tensorflow.keras import layers, callbacks

# ===============================
# 2. Load Data
# ===============================
data = pd.read_csv("./predict-supercars-prices-2025/supercars_train.csv", index_col="id")

# Separate features and target
X = data.copy()
y = X.pop("price")

print("Data shape:", X.shape)
print("Target sample:\n", y.head())

# ===============================
# 3. Feature Engineering
# ===============================
# Identify categorical and numerical columns
categorical_cols = [col for col in X.columns if X[col].dtype == "object"]
numerical_cols = [col for col in X.columns if X[col].dtype in ["int64", "float64"]]

print("Categorical features:", categorical_cols)
print("Numerical features:", numerical_cols)

# Column transformer for preprocessing
preprocessor = make_column_transformer(
    (StandardScaler(), numerical_cols),
    (OneHotEncoder(handle_unknown="ignore"), categorical_cols),
)

# ===============================
# 4. Train/Validation Split
# ===============================
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit transformer on training data, transform both train/valid
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)

# Normalize target (helps neural nets train better)
y_train = np.log1p(y_train)
y_valid = np.log1p(y_valid)

print("Train shape:", X_train.shape, "Validation shape:", X_valid.shape)

# ===============================
# 5. Build Deep Learning Model
# ===============================
input_shape = [X_train.shape[1]]

model = keras.Sequential([
    layers.Input(shape=input_shape),
    layers.Dense(256, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(1)
])


model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="mae"
)
# ===============================
# 6. Callbacks
# ===============================
early_stop = callbacks.EarlyStopping(patience=20, restore_best_weights=True)
reduce_lr = callbacks.ReduceLROnPlateau(factor=0.5, patience=10)

# ===============================
# 7. Train Model
# ===============================
history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=100,
    batch_size=32,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# ===============================
# 8. Evaluate Model
# ===============================
preds = np.expm1(model.predict(X_valid).flatten())

mae = mean_absolute_error(y_valid, preds)
rmse = np.sqrt(mean_squared_error(y_valid, preds))
r2 = r2_score(y_valid, preds)

print(f"Deep Learning Results -> MAE: {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")


Data shape: (2000, 30)
Target sample:
 id
wn8zA4ADUC     329510.72
m5EyycSRrS     509289.91
IOcII96Ua3    2164428.25
qlds6yyR3r    2793655.40
fRR6ulIWvU    2927390.00
Name: price, dtype: float64
Categorical features: ['brand', 'color', 'engine_config', 'transmission', 'drivetrain', 'market_region', 'interior_material', 'brake_type', 'tire_brand', 'last_service_date', 'service_history', 'model', 'damage_type']
Numerical features: ['year', 'carbon_fiber_body', 'horsepower', 'torque', 'weight_kg', 'zero_to_60_s', 'top_speed_mph', 'num_doors', 'mileage', 'num_owners', 'aero_package', 'limited_edition', 'has_warranty', 'non_original_parts', 'warranty_years', 'damage', 'damage_cost']
Train shape: (1600, 445) Validation shape: (400, 445)
Epoch 1/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 13.3782 - val_loss: 12.9054 - learning_rate: 0.0010
Epoch 2/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 11.9179 - val_loss: 1

In [13]:
# ===============================
# 9. Prepare Test Data
# ===============================
X_test_full = pd.read_csv("./predict-supercars-prices-2025/supercars_test.csv", index_col="id")

# Apply the same preprocessing
X_test = preprocessor.transform(X_test_full)

# Predict (don’t forget we divided y by 100 → multiply back)
test_preds = model.predict(X_test).flatten() * 100

# ===============================
# 10. Create Submission File
# ===============================

submission = pd.DataFrame({
    "id": X_test_full.index,
    "price": test_preds  # use "price" instead of "target" if Kaggle expects that
})

submission.to_csv("2submission.csv", index=False)
print("Submission file saved as submission.csv")
submission.head()


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Submission file saved as submission.csv


Unnamed: 0,id,price
0,N4C4A2ICG2,1353.254028
1,aIB01tNqkz,1353.254028
2,0AX3BsniiV,1353.254028
3,3J6iupWk0z,1353.254028
4,7atXIEFcBF,1353.254028


In [14]:
from sklearn.dummy import DummyRegressor

dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y_train)
print("Dummy MAE:", mean_absolute_error(y_valid, dummy.predict(X_valid)))


Dummy MAE: 0.8527980284513302


In [17]:
# ===============================================================
# 1. Imports
# ===============================================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from xgboost import XGBRegressor

# ===============================================================
# 2. Load Data
# ===============================================================
train = pd.read_csv("./predict-supercars-prices-2025/supercars_train.csv", index_col="id")
test = pd.read_csv("./predict-supercars-prices-2025/supercars_test.csv", index_col="id")

X = train.drop("price", axis=1)
y = train["price"]

# Log-transform target to stabilize training
y = np.log1p(y)

# ===============================================================
# 3. Preprocessing
# ===============================================================
num_features = X.select_dtypes(include=["int64", "float64"]).columns
cat_features = X.select_dtypes(include=["object"]).columns

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
])

# Split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# ===============================================================
# 4. XGBoost Model
# ===============================================================
xgb_model = Pipeline([
    ("preprocessor", preprocessor),
    ("xgb", XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    ))
])

xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_valid)

print("XGBoost MAE:", mean_absolute_error(np.expm1(y_valid), np.expm1(xgb_preds)))

# ===============================================================
# 5. Deep Learning Model
# ===============================================================
# Preprocess separately for NN
X_train_nn = preprocessor.fit_transform(X_train)
X_valid_nn = preprocessor.transform(X_valid)

input_shape = X_train_nn.shape[1]

nn_model = keras.Sequential([
    layers.Input(shape=(input_shape,)),
    layers.Dense(256, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(1)   # price (log scale)
])

nn_model.compile(optimizer=keras.optimizers.Adam(1e-3), loss="mse")

es = keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)

nn_model.fit(
    X_train_nn, y_train,
    validation_data=(X_valid_nn, y_valid),
    epochs=200,
    batch_size=64,
    callbacks=[es],
    verbose=1
)

nn_preds = nn_model.predict(X_valid_nn).flatten()

print("NN MAE:", mean_absolute_error(np.expm1(y_valid), np.expm1(nn_preds)))

# ===============================================================
# 6. Ensemble (Average of XGB + NN)
# ===============================================================
ensemble_preds = (xgb_preds + nn_preds) / 2
print("Ensemble MAE:", mean_absolute_error(np.expm1(y_valid), np.expm1(ensemble_preds)))

# ===============================================================
# 7. Predict on Test Data
# ===============================================================
X_test_nn = preprocessor.transform(test)
xgb_test_preds = xgb_model.predict(test)
nn_test_preds = nn_model.predict(X_test_nn).flatten()

final_preds = (xgb_test_preds + nn_test_preds) / 2
final_preds = np.expm1(final_preds)  # reverse log

submission = pd.DataFrame({
    "id": test.index,
    "price": final_preds
})

submission.to_csv("2submission_ensemble.csv", index=False)
print("✅ Submission file saved: submission_ensemble.csv")


XGBoost MAE: 284847.9644625
Epoch 1/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 186.5589 - val_loss: 177.9207
Epoch 2/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 173.8198 - val_loss: 162.3060
Epoch 3/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 155.4830 - val_loss: 140.9656
Epoch 4/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 130.7990 - val_loss: 113.7422
Epoch 5/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 102.7961 - val_loss: 84.9452
Epoch 6/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 73.1164 - val_loss: 56.7878
Epoch 7/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 46.9423 - val_loss: 33.5648
Epoch 8/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 26.8922 - val_loss: 17