In [17]:
# Step 1: Load Dataset
import pandas as pd
df = pd.read_csv("used_cars.csv")  # Use your actual filename

# Step 2: Handle Missing Values
df['fuel_type'].fillna(df['fuel_type'].mode()[0], inplace=True)
df['accident'].fillna("None reported", inplace=True)
df['clean_title'].fillna("Yes", inplace=True)

# Step 3: Feature Engineering
import numpy as np
import re

def extract_hp(text):
    match = re.search(r"(\d+\.?\d*)HP", str(text))
    return float(match.group(1)) if match else np.nan

def extract_engine_size(text):
    match = re.search(r"(\d+\.?\d*)L", str(text))
    return float(match.group(1)) if match else np.nan

df['horsepower'] = df['engine'].apply(extract_hp)
df['engine_size_L'] = df['engine'].apply(extract_engine_size)
df['car_age'] = 2025 - df['model_year']
df['accident_reported'] = df['accident'].apply(lambda x: 0 if 'None' in str(x) else 1)
df['clean_title'] = df['clean_title'].apply(lambda x: 1 if x == "Yes" else 0)

# Step 4: One-Hot Encoding
df = pd.get_dummies(df, columns=['brand', 'fuel_type', 'transmission'], drop_first=True)

# Step 5: Drop Unnecessary Columns
df.drop(['model', 'model_year', 'engine', 'accident', 'ext_col', 'int_col'], axis=1, inplace=True)

# Step 6: Clean and convert price column
df['price'] = df['price'].replace('[\$,]', '', regex=True)  # Remove symbols
df['price'] = pd.to_numeric(df['price'], errors='coerce')
print(f"After cleaning: {df['price'].notnull().sum()} valid price entries.")

df = df.dropna(subset=['price'])


# Clean 'milage' column
df['milage'] = df['milage'].str.replace('mi.', '', regex=False)
df['milage'] = df['milage'].str.replace(',', '', regex=False)
df['milage'] = pd.to_numeric(df['milage'], errors='coerce')

# Drop rows where mileage is missing
df = df.dropna(subset=['milage'])
# Step 7: Define X and y
y = np.log1p(df['price'])
X = df.drop(['price'], axis=1)


# Step 8: Train/Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


After cleaning: 4009 valid price entries.


  df['price'] = df['price'].replace('[\$,]', '', regex=True)  # Remove symbols
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['fuel_type'].fillna(df['fuel_type'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['accident'].fillna("None reported", inplace=True)
The behavior will change in pandas 3

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

models = {
    
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}

def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n🔸 {name}")
    print(f"  MAE:  {mean_absolute_error(y_test, y_pred):.2f}")
    print(f"  RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")
    print(f"  R²:   {r2_score(y_test, y_pred):.4f}")
    print("-" * 40)

for name, model in models.items():
    evaluate_model(name, model, X_train, X_test, y_train, y_test)



🔸 Decision Tree
  MAE:  0.32
  RMSE: 0.49
  R²:   0.6901
----------------------------------------

🔸 Random Forest
  MAE:  0.23
  RMSE: 0.37
  R²:   0.8256
----------------------------------------

🔸 XGBoost
  MAE:  0.22
  RMSE: 0.35
  R²:   0.8418
----------------------------------------


In [23]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [0.8, 1.0, 1.2]
}

xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=25,
    cv=3,
    verbose=1,
    scoring='r2',
    n_jobs=-1
)

random_search.fit(X_train, y_train)
best_xgb = random_search.best_estimator_
print("✅ Best Parameters:", random_search.best_params_)


Fitting 3 folds for each of 25 candidates, totalling 75 fits
✅ Best Parameters: {'subsample': 1.0, 'reg_lambda': 1.2, 'reg_alpha': 0, 'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 0.7}


In [25]:
import joblib
joblib.dump(best_xgb, 'xgb_car_price_model2.pkl')
joblib.dump(X.columns.tolist(), 'feature_columns2.pkl')


['feature_columns2.pkl']

In [None]:
!streamlit run app2.py