In [1]:
import os
import sys
import time
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor


In [2]:
# Optional gradient-boosting backends. Each import is best-effort: if the
# package cannot be imported for any reason, the name is bound to None so a
# later cell can test `is not None` and simply skip that model.
try:
    from xgboost import XGBRegressor
except Exception:
    XGBRegressor = None

try:
    from lightgbm import LGBMRegressor
except Exception:
    LGBMRegressor = None


In [3]:
# Locations used by this notebook: where artifacts are written and where the
# cleaned input CSV is read from.
models_dir = 'models'
cleaned_path = 'data/cleaned_car_data.csv'

# Make sure the output directory exists before anything tries to write to it.
os.makedirs(models_dir, exist_ok=True)

# Fail early, naming the missing path, when the input file is not there.
if os.path.exists(cleaned_path):
    df = pd.read_csv(cleaned_path)
else:
    raise FileNotFoundError(cleaned_path)


In [4]:
# Name of the column to predict.
target = 'price'

# Candidate predictor columns. Any that are absent from the loaded frame are
# skipped when `features` is built below.
important_features = [
    'make', 'model', 'year', 'engine', 'transmission', 'fuel',
    'mileage', 'city', 'assembly', 'body', 'car_age',
    'brand_category', 'engine_category', 'age_category',
]

# Keep the candidates that actually exist in `df`, preserving their order.
features = [name for name in important_features if name in df.columns]

# Independent copies so later edits to X / y never touch `df`.
X = df.loc[:, features].copy()
y = df.loc[:, target].copy()


In [5]:
# Fill missing values in text (object) columns only.
# Numeric NaNs are deliberately left in place: filling them with the string
# 'Unknown' would turn those columns into object dtype, so they would later be
# selected as categorical and one-hot encoded instead of being scaled. The
# numeric branch of the preprocessing pipeline median-imputes them instead.
text_cols = X.select_dtypes(include=['object']).columns
if len(text_cols) > 0:
    X[text_cols] = X[text_cols].fillna('Unknown')

# Halve the memory footprint of 64-bit numeric columns. The target widths
# (float32 / int32) are the ones the later column-type selection looks for.
for c in X.select_dtypes(include=['float64']).columns:
    X[c] = X[c].astype('float32')
for c in X.select_dtypes(include=['int64']).columns:
    X[c] = X[c].astype('int32')


In [6]:
# Model the target on a log scale: log1p(y) = log(1 + y). Predictions are
# mapped back to the original scale with np.expm1 in the evaluation loop.
y_log = np.log1p(y)


In [7]:
# Integer quantile bucket (up to 10) of the raw target for every row; duplicate
# bin edges are dropped. Used only to stratify the split so train and test
# see a similar spread of target values.
price_bins = pd.qcut(y, q=10, labels=False, duplicates='drop')

# 80/20 split of the features and the log-scale target, reproducible via the seed.
split_options = dict(test_size=0.2, random_state=42, stratify=price_bins)
X_train, X_test, y_train, y_test = train_test_split(X, y_log, **split_options)


In [8]:
# Column names of the training frame, grouped by dtype family.
cat_cols = list(X_train.select_dtypes(include=['object', 'category']).columns)
num_cols = list(
    X_train.select_dtypes(include=['int32', 'int64', 'float32', 'float64']).columns
)


In [9]:
def limit_categories(df_in, categorical_cols, n_categories=15):
    """Collapse rare levels of high-cardinality columns into 'Other'.

    For each name in ``categorical_cols`` that exists in ``df_in`` and has more
    than ``n_categories`` distinct values, the ``n_categories - 1`` most
    frequent values are kept and every other value is replaced by the string
    'Other'. A column changed this way is cast to ``category`` dtype when it
    ends up with at most 50 distinct values.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input frame; it is copied, never modified.
    categorical_cols : iterable of str
        Columns to consider; names missing from ``df_in`` are skipped.
    n_categories : int, default 15
        Maximum number of distinct values to leave in a column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_in`` with the affected columns rewritten.
    """
    result = df_in.copy()
    for name in categorical_cols:
        if name not in result.columns:
            continue
        series = result[name]
        if series.nunique() <= n_categories:
            # Already within the limit: leave the column exactly as it is.
            continue
        # Reserve one slot for the 'Other' bucket.
        keep = series.value_counts().nlargest(n_categories - 1).index
        collapsed = series.where(series.isin(keep), other='Other')
        if collapsed.nunique() <= 50:
            collapsed = collapsed.astype('category')
        result[name] = collapsed
    return result


In [10]:
# Learn which levels to keep from the TRAINING split only, then apply that
# same mapping to the test split. Ranking frequencies separately on the test
# split could keep a different set of levels there, so the same raw value
# might be 'Other' in one split and a named level in the other.
N_CATEGORIES = 15

kept_levels = {
    col: X_train[col].value_counts().nlargest(N_CATEGORIES - 1).index
    for col in cat_cols
    if X_train[col].nunique() > N_CATEGORIES
}

X_train = limit_categories(X_train, cat_cols, n_categories=N_CATEGORIES)

X_test = X_test.copy()
for col, kept in kept_levels.items():
    # Anything the training split did not retain falls into 'Other'; the
    # column then takes the same dtype as its training counterpart.
    X_test[col] = (
        X_test[col]
        .where(X_test[col].isin(kept), other='Other')
        .astype(X_train[col].dtype)
    )

# Recompute the column groups: limited columns may now be `category` dtype.
cat_cols = X_train.select_dtypes(include=['object','category']).columns.tolist()
num_cols = X_train.select_dtypes(include=['int32','int64','float32','float64']).columns.tolist()


In [11]:
# Numeric columns: replace missing values with the column median, then standardise.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with the literal 'Unknown', then
# one-hot encode. Levels not seen during fit are ignored rather than raising.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])


In [12]:
# Baseline model that is always available.
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

# Candidate models keyed by display name; the optional boosters are added only
# when their package imported successfully (the name is None otherwise).
models = {'Random Forest': rf}

if XGBRegressor is not None:
    models['XGBoost'] = XGBRegressor(
        n_estimators=150,
        max_depth=8,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbosity=1,
        n_jobs=-1
    )

if LGBMRegressor is not None:
    models['LightGBM'] = LGBMRegressor(
        n_estimators=150,
        max_depth=8,
        learning_rate=0.1,
        subsample=0.8,
        # LightGBM applies row subsampling only when subsample_freq > 0 (its
        # default is 0), so without this line `subsample=0.8` has no effect.
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=42,
        verbose=-1,
        n_jobs=-1
    )


In [13]:
def calculate_metrics(y_true, y_pred):
    """Compute regression error metrics for one set of predictions.

    Both inputs are converted to NumPy arrays first, so every metric compares
    values strictly by position. Without that, the MAPE term would use pandas
    arithmetic, which aligns two Series by index label and can silently pair
    up the wrong rows when their indexes differ.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    dict
        Keys ``'mse'``, ``'rmse'``, ``'mae'``, ``'mape'`` (in percent) and
        ``'r2'``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)

    with np.errstate(divide='ignore', invalid='ignore'):
        # The denominator is floored at 1 so true values below 1 cannot
        # inflate the percentage error.
        mape = np.mean(np.abs((y_true - y_pred) / np.maximum(y_true, 1))) * 100

    r2 = r2_score(y_true, y_pred)

    return {
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'mape': mape,
        'r2': r2
    }


In [14]:
# Per-model outputs keyed by the model's display name.
results = {}

# Test targets mapped back from log scale; identical for every model, so it
# is computed once instead of on each loop iteration.
y_test_orig = np.expm1(y_test)

for name, model in models.items():
    # NOTE(review): the same `preprocessor` object is handed to every
    # pipeline and appears to be re-fitted in place on each iteration rather
    # than copied - confirm before reusing it with different training data.
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Time the fit only, with a monotonic clock, so `training_time` is not
    # inflated by prediction and metric computation.
    fit_start = time.perf_counter()
    pipe.fit(X_train, y_train)
    training_time = time.perf_counter() - fit_start

    # The pipeline predicts on the log scale; expm1 undoes the log1p
    # transform so metrics are reported on the original target scale.
    y_pred_log = pipe.predict(X_test)
    y_pred = np.expm1(y_pred_log)

    metrics = calculate_metrics(y_test_orig, y_pred)

    results[name] = {
        'pipeline': pipe,
        'metrics': metrics,
        'predictions': y_pred,
        'training_time': training_time
    }




In [15]:
# Name of the model with the highest R^2 on the test split, and its fitted pipeline.
best = max(results, key=lambda model_name: results[model_name]['metrics']['r2'])
best_pipeline = results[best]['pipeline']


In [16]:
# Feature names used for training, overall and split by column group.
features_info = {
    'all_features': features,
    'categorical': cat_cols,
    'numerical': num_cols
}

# File name -> object to persist; every file is written into `models_dir`
# with the same compression level.
artifacts = {
    'best_model.pkl': best_pipeline,
    'preprocessor.pkl': preprocessor,
    'all_results.pkl': results,
    'features.pkl': features_info,
}
for filename, payload in artifacts.items():
    joblib.dump(payload, os.path.join(models_dir, filename), compress=3)

print('saved models and artifacts')


saved models and artifacts
