In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import KFold
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_percentage_error
import warnings

warnings.filterwarnings('ignore')

In [2]:
def solve_car_price_prediction_improved():
    # --- 1. Download data ---
    try:
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        print("Success.")
    except FileNotFoundError:
        print("Error: file not found.")
        return

    submission_ids = test_df['id'].copy()
    y = np.log1p(train_df['price'])
    
    data = pd.concat([train_df.drop('price', axis=1), test_df], ignore_index=True)

    # --- 2. Preprocessing  ---
    data['milage'] = data['milage'].astype(str).str.replace(' mi.', '', regex=False).str.replace(',', '').replace('â€“', 'nan').astype(float)
    data['milage'].fillna(data['milage'].median(), inplace=True)
    data['car_age'] = 2025 - data['model_year']

    def extract_engine_features(engine_str):
        engine_str = str(engine_str)
        hp = re.search(r'([0-9,]+\.?[0-9]*)\s*HP', engine_str, re.I)
        liters = re.search(r'([0-9]+\.?[0-9]*)\s*L', engine_str, re.I)
        cylinders = re.search(r'(\d+)\s*Cylinder', engine_str, re.I)
        hp = float(hp.group(1).replace(',', '')) if hp else np.nan
        liters = float(liters.group(1)) if liters else np.nan
        cylinders = int(cylinders.group(1)) if cylinders else np.nan
        return hp, liters, cylinders

    engine_features = data['engine'].apply(lambda x: pd.Series(extract_engine_features(x)))
    engine_features.columns = ['horsepower', 'liters', 'cylinders']
    data = pd.concat([data, engine_features], axis=1)

    for col in ['horsepower', 'liters', 'cylinders', 'car_age']:
        data[col].fillna(data[col].median(), inplace=True)

    data['accident'] = data['accident'].fillna(data['accident'].mode()[0]).apply(lambda x: 0 if 'None reported' in str(x) else 1)
    data['clean_title'] = data['clean_title'].fillna(data['clean_title'].mode()[0]).apply(lambda x: 1 if str(x) == 'Yes' else 0)
    data['fuel_type'] = data['fuel_type'].fillna(data['fuel_type'].mode()[0])

    for col in data.select_dtypes(include=['object']).columns:
        if col != 'engine':
            data[col], _ = pd.factorize(data[col])
            
    data = data.drop(['id', 'model_year', 'engine'], axis=1)

    X = data[:len(train_df)]
    X_test = data[len(train_df):]

    # --- 3. Model training with K-Fold cross-validation  ---
    NFOLDS = 10
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)
    
    oof_preds = np.zeros(X.shape[0])
    sub_preds = np.zeros(X_test.shape[0])

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

        model = HistGradientBoostingRegressor(random_state=42, max_iter=1000, learning_rate=0.05,
                                              validation_fraction=0.1, n_iter_no_change=100)
        model.fit(X_train, y_train)

        oof_preds[valid_idx] = model.predict(X_valid)
        sub_preds += model.predict(X_test) / folds.n_splits
        
        print(f"  Fold {n_fold + 1} has been ended.")

    oof_preds = np.expm1(oof_preds)
    sub_preds = np.expm1(sub_preds)
    
    # MAPE estimation
    mape_score = mean_absolute_percentage_error(np.expm1(y), oof_preds)
    print(f" Final MAPE: {mape_score:.4f}")

    # --- 4. Creating submission file ---
    submission_df = pd.DataFrame({'id': submission_ids, 'price': sub_preds})
    submission_df.to_csv('submission_improved.csv', index=False)

In [3]:
solve_car_price_prediction_improved()

Success.
  Fold 1 has been ended.
  Fold 2 has been ended.
  Fold 3 has been ended.
  Fold 4 has been ended.
  Fold 5 has been ended.
  Fold 6 has been ended.
  Fold 7 has been ended.
  Fold 8 has been ended.
  Fold 9 has been ended.
  Fold 10 has been ended.
\n Final MAPE: 0.2106
