<a href="https://colab.research.google.com/github/RabHuss/DSN-2025-AI-Bootcamp-Qualification-Hackathon-Project-Participation/blob/main/RABIATU_Predicting_the_price_of_used_cars_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1) Load data
# Both CSVs are read with the same options; on_bad_lines="skip" tells pandas
# to drop rows it cannot parse instead of raising.
train, test = (
    pd.read_csv(path, on_bad_lines="skip") for path in ("train.csv", "test.csv")
)

# 2) Feature engineering function
# A plain decimal number such as "172", "172.0" or "1.6".
_ENGINE_NUMBER = r"(\d+(?:\.\d+)?)"
# Each attribute has its own pattern so they can be searched independently.
_ENGINE_HP_RE = re.compile(_ENGINE_NUMBER + r"\s*HP")
_ENGINE_LITERS_RE = re.compile(_ENGINE_NUMBER + r"\s*L")
_ENGINE_CYL_RE = re.compile(_ENGINE_NUMBER + r"\s*Cylinder")


def extract_engine_features(text):
    """Parse horsepower, displacement and cylinder count from an engine string.

    The three attributes are looked up independently, so an attribute that is
    missing or malformed never prevents the others from being extracted
    (e.g. "3.5L V6 Cylinder Engine" yields liters=3.5 and cylinders=6).
    Matching is case-sensitive: the number must be directly followed
    (optionally after whitespace) by "HP", "L" or "Cylinder".

    Args:
        text: Raw engine description, e.g.
            "172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel". Non-string
            values (such as NaN for a missing entry) are accepted.

    Returns:
        pd.Series of three floats in the order
        [horsepower, liters, cylinders]; an attribute that cannot be found
        is NaN. Non-string input yields three NaNs.
    """
    if not isinstance(text, str):
        return pd.Series([np.nan, np.nan, np.nan])

    values = []
    for pattern in (_ENGINE_HP_RE, _ENGINE_LITERS_RE, _ENGINE_CYL_RE):
        match = pattern.search(text)
        # The captured group is always a valid decimal literal, so float()
        # cannot raise here; no blanket exception handler is needed.
        values.append(float(match.group(1)) if match else np.nan)
    return pd.Series(values)

# Apply identical cleaning and derived columns to train and test, in place,
# so both frames end up with the same feature schema.
for df in (train, test):
    # Entries that cannot be read as numbers become NaN instead of raising.
    df["milage"] = pd.to_numeric(df["milage"], errors="coerce")

    # Give missing categorical values an explicit "Unknown" level.
    present = [
        name for name in ("fuel_type", "accident", "clean_title") if name in df.columns
    ]
    for name in present:
        df[name] = df[name].fillna("Unknown").astype(str)

    # Age is measured against the fixed year 2025.
    df["car_age"] = 2025 - df["model_year"]

    # Mileage per year of age; rows whose age is not positive keep the raw
    # mileage rather than being divided.
    age = df["car_age"]
    mileage = df["milage"]
    df["milage_per_year"] = np.where(age > 0, mileage / age, mileage)

    # One parsed row per engine string, spread over three new columns.
    engine_parts = df["engine"].apply(extract_engine_features)
    df[["horsepower", "engine_size", "cylinders"]] = engine_parts

# 3) Encode categoricals
# Low-cardinality columns are mapped to integer labels. The encoder is fitted
# on the values of train and test together, so a level that appears in only
# one of the two frames still has a code.
low_cardinality = ["fuel_type", "transmission", "accident", "clean_title"]
for col in low_cardinality:
    if col not in train.columns:
        continue
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]]).astype(str))
    for frame in (train, test):
        frame[col] = le.transform(frame[col].astype(str))

# High-cardinality columns are replaced by the relative frequency of each
# value, counted over train and test combined. Values absent from the counts
# (such as missing entries) map to NaN.
high_cardinality = ["brand", "model", "ext_col", "int_col", "engine"]
for col in high_cardinality:
    if col not in train.columns:
        continue
    freq = pd.concat([train[col], test[col]]).value_counts(normalize=True)
    for frame in (train, test):
        frame[col] = frame[col].map(freq).astype(float)

# 4) Features + Target
target = "price"
# Every training column except the row identifier and the target is a feature.
feature_cols = [c for c in train.columns if c not in ["id", target]]

# Imputation values are learned from the training frame only and then reused
# for the test frame. Filling the test frame with its own means would encode
# the same missing value differently at fit time and at predict time.
fill_values = train[feature_cols].mean(numeric_only=True)

# fillna with a Series fills each column with the value under its own label;
# columns without an entry in fill_values are left untouched.
X = train[feature_cols].fillna(fill_values)
y = train[target].astype(float)

X_test = test[feature_cols].fillna(fill_values)

# 5) Train model
# fit() returns the estimator itself, so construction and training are chained.
model = HistGradientBoostingRegressor(
    learning_rate=0.08,
    max_iter=250,
    random_state=42,
).fit(X, y)

# 6) Predict
# One predicted price per row of the prepared test matrix.
test_preds = model.predict(X_test)

# 7) Create submission file
# Two columns: the test row identifier and the predicted price for that row.
submission = test[["id"]].assign(price=test_preds)

# Report the frame's dimensions before writing it out.
print("submission shape:", submission.shape)

# Write without the index so the file holds only the id and price columns.
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv successfully!")


In [None]:
# Hold-out validation. 20% of the training rows are set aside and a fresh,
# unfitted copy of `model` (same hyperparameters) is trained on the rest, so
# the score is computed on rows that estimator has not seen.
# NOTE(review): X was mean-imputed using all training rows before this split,
# so the validation rows contribute slightly to the fill values.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
hgb = clone(model).fit(X_train, y_train)

val_preds = hgb.predict(X_val)
mse = mean_squared_error(y_val, val_preds)
# RMSE is expressed in the same unit as the target.
rmse = np.sqrt(mse)
print("Validation RMSE:", rmse)