In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from scipy import stats
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from constants import NOMINAL_COLUMNS,CONTINOUS_COLUMNS,DROP_COLUMNS,DISCRETE_COLUMNS,TARGET

In [22]:

# Load the raw car-price table and discard the columns listed in DROP_COLUMNS.
df = pd.read_csv('data/train_pipeline/car_price_prediction.csv')
df = df.drop(columns=DROP_COLUMNS)

# "Levy": errors="coerce" turns every non-numeric entry -- including the "-"
# placeholder -- into NaN, so no separate replace step is needed.
df["Levy"] = pd.to_numeric(df["Levy"], errors="coerce")

# "Engine volume": remove the literal "Turbo" marker, then strip the
# whitespace it leaves behind so the numeric parse does not depend on
# to_numeric's tolerance for padded strings.
df["Engine volume"] = pd.to_numeric(
    df["Engine volume"].str.replace("Turbo", "", regex=False).str.strip(),
    errors="coerce",
)

# "Mileage": drop the " km" unit and "," separators. regex=False is passed
# explicitly (as for "Engine volume") so the result does not depend on the
# pandas-version-specific default of `regex`.
df["Mileage"] = pd.to_numeric(
    df["Mileage"]
    .astype(str)
    .str.replace(" km", "", regex=False)
    .str.replace(",", "", regex=False),
    errors="coerce",
)

# "Leather interior": Yes/No -> 1/0; any other value becomes NaN.
df['Leather interior'] = df['Leather interior'].map({'Yes': 1, 'No': 0})

# "Doors": keep the first run of digits found in each entry as a float.
# expand=False returns a Series rather than a one-column DataFrame.
df['Doors'] = df['Doors'].str.extract(r'(\d+)', expand=False).astype(float)

# Per-column imputation values, kept in separate dicts by column family:
# mode for nominal, median for discrete, mean for continuous columns.
# NOTE(review): these statistics are computed on the full frame, i.e. before
# the train/test split performed later in the notebook.
fill_values_nominal = {}
for col in NOMINAL_COLUMNS:
    fill_values_nominal[col] = df[col].mode()[0]

fill_values_discrete = {}
for col in DISCRETE_COLUMNS:
    fill_values_discrete[col] = df[col].median()

fill_values_continuous = {}
for col in CONTINOUS_COLUMNS:
    fill_values_continuous[col] = df[col].mean()

# Apply the fills family by family, in the order nominal -> discrete -> continuous.
for fill_values in (fill_values_nominal, fill_values_discrete, fill_values_continuous):
    for col, value in fill_values.items():
        df[col] = df[col].fillna(value)

# Replace extreme values (|z-score| > 3) in each continuous column with the
# column mean.
# NOTE(review): the mean is taken over the whole column, so it still includes
# the outliers that are being replaced.
for col in CONTINOUS_COLUMNS:
    as_float = df[col].astype(float)
    df[col] = as_float
    is_outlier = np.abs(stats.zscore(as_float)) > 3
    column_mean = as_float.mean()
    df.loc[is_outlier, col] = column_mean

# One-hot encode every nominal column with its own encoder. The fitted
# encoders are kept in encoder_dict, keyed by the source column name.
encoder_dict = {}
encoded_frames = []
for col in NOMINAL_COLUMNS:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    transformed = encoder.fit_transform(df[[col]])
    encoded_frames.append(
        pd.DataFrame(
            transformed,
            columns=encoder.get_feature_names_out([col]),
            # concat(axis=1) aligns on index labels; reusing df's index keeps
            # each encoded row attached to its source row even when df does
            # not carry a default 0..n-1 index.
            index=df.index,
        )
    )
    encoder_dict[col] = encoder

# Swap the raw nominal columns for their encoded counterparts in a single
# concat instead of growing the frame once per column.
df = pd.concat([df.drop(columns=NOMINAL_COLUMNS), *encoded_frames], axis=1)

# Min-max scale every column except the target, one scaler per column.
# The fitted scalers are kept in scaler_dict, keyed by column name.
# NOTE(review): the scalers are fitted on the full frame, i.e. before the
# train/test split performed later in the notebook.
feature_columns = [name for name in df.columns if name != TARGET]
scaler_dict = {name: MinMaxScaler() for name in feature_columns}
for name, scaler in scaler_dict.items():
    df[name] = scaler.fit_transform(df[[name]])

In [23]:
# Separate the prediction target from the feature matrix.
y = df[TARGET]
X = df.drop(columns=TARGET)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit an XGBoost regressor with the library's default hyperparameters.
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

print("\nXGB Regression:")

# Score the model on the held-out rows only.
y_test_pred = xgb.predict(X_test)
mae = mean_absolute_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print("MAE:", mae)
print("R²:", r2)


XGB Regression:
MAE: 5589.1318359375
R²: 0.5252405405044556
