In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
import time
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
import joblib
import json
# Global plotting defaults for every figure in the notebook:
# seaborn's "whitegrid" theme and a 14x7 default figure size.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 7)

# Car Price Prediction Project  
  
This project aims to compare nine different machine learning models (Linear Regression, KNN, Decision Tree, Random Forest, Gradient Boosting, XGBoost, LightGBM, CatBoost, AdaBoost) to predict the `sellingprice` value of used cars.

## Step 1: Data Loading and Exploratory Data Analysis (EDA)  
  
The first phase of the project is to understand the raw data. The `car_prices.csv` file was loaded using the `pandas` library, and the following initial examinations were performed to understand the basic structure of the dataset:

1.  **`.head()`**: The first 5 rows of the dataset were examined to observe the column names and general data structure.
2.  **`.info()`**: The total number of rows, column data types (Dtype), and the number of non-null values were checked. This allowed us to quickly see which columns had missing data (NaN).
3.  **`.isnull().sum()`**: The total number of missing values in each column was calculated.
4.  **`.describe()`**: The basic statistics (mean, min, max, quartiles) of numerical columns (e.g., `odometer`, `sellingprice`) were examined. This is important for detecting possible outliers or erroneous data (e.g., minimum price $1).
5.  **`.describe(include=['object'])`**: The number of unique values, the most frequent category (top), and its frequency (freq) were examined for categorical (text) columns (e.g., `make`, `model`, `transmission`).

In [None]:
# Load the raw dataset. on_bad_lines="skip" makes pandas drop rows it cannot
# parse (e.g. too many fields) instead of raising.
df = pd.read_csv("car_prices.csv", on_bad_lines="skip")
# Last expression of the cell: the notebook renders the first five rows.
df.head()

In [None]:
# Column dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Number of missing (NaN) values per column.
print(df.isnull().sum())

In [None]:
# Summary statistics of the numeric columns (rendered as the cell output).
df.describe()

In [None]:
# Summary of the object (text) columns: count, unique, top and freq.
print(df.describe(include=["object"]))

## Step 1.5: Visualising Data  
  
Before commencing the data cleaning process, it is important to visualise the most critical issues and relationships identified in Step 1 (EDA). At this stage, the distribution of our target variable `sellingprice`, potential data leakage (`mmr`), and the impact of `odometer`—one of the most important features—on price were examined.

1.  **`sellingprice` Distribution (Histogram):** It was observed that the price distribution was skewed to the right and that there were very low (erroneous) price records close to 0.
2.  **`mmr` vs `sellingprice` (Scatter Plot):** The graph of these two variables showed an almost perfect linear (y=x) relationship. This proves that the `mmr` column acts as an "answer" for `sellingprice` and leads to **data leakage**. Therefore, the `mmr` column should not be included in model training.
3.  **`odometer` vs `sellingprice` (Scatter Plot):** As expected, a negative relationship was observed, showing that as mileage increases, the selling price (generally) decreases.

In [None]:
# Histogram (100 bins) with a KDE overlay of the target variable.
ax = sns.histplot(data=df, x='sellingprice', bins=100, kde=True)
ax.set_title('Selling Price Distribution', fontsize=16)
ax.set_xlabel('Selling Price', fontsize=12)
ax.set_ylabel('Frequency (Number)', fontsize=12)
plt.show()

In [None]:
# Plot a fixed random sample of 5000 rows to keep the scatter plot readable;
# sample_df is reused by the odometer plot in the next cell.
sample_df = df.sample(n=5000, random_state=42)
ax = sns.scatterplot(data=sample_df, x='mmr', y='sellingprice', alpha=0.6)

# End points of the y = x reference line: the joint range of both variables.
mmr_values = sample_df['mmr']
price_values = sample_df['sellingprice']
min_val = min(mmr_values.min(), price_values.min())
max_val = max(mmr_values.max(), price_values.max())

diagonal = [min_val, max_val]
ax.plot(diagonal, diagonal, color='red', linestyle='--', linewidth=2)
ax.set_title('Market Value (mmr) vs Selling Price', fontsize=16)
ax.set_xlabel('Market Value (mmr)', fontsize=12)
ax.set_ylabel('Selling Price', fontsize=12)
plt.show()

In [None]:
# Odometer reading against selling price, on the sample drawn in the
# previous cell (sample_df).
ax = sns.scatterplot(data=sample_df, x='odometer', y='sellingprice', alpha=0.6)
ax.set_title('Kilometre (odometer) vs Selling price', fontsize=16)
ax.set_xlabel('Kilometre (odometer)', fontsize=12)
ax.set_ylabel('Selling Price', fontsize=12)
plt.show()

## Step 2: Data Cleaning and Feature Engineering

Raw data is not suitable for training models. The following steps were taken to resolve the issues identified in Steps 1 and 1.5:

1.  **Removal of Data Leakage and Unnecessary Columns:**
    * `mmr`: As demonstrated in Step 1.5, it was **removed** from the dataset because it had a perfect correlation with `sellingprice` and caused data leakage.
    * `vin`: It was **removed** because it is a unique identifier (ID) for each record and does not provide a pattern for the model.

2.  **Outlier Cleaning:**
    * There were obvious erroneous entries such as `$1` in the `sellingprice` column. To prevent the model from being affected by this noise, records with a `sellingprice` of $100 or less were **filtered** from the dataset.

3.  **Feature Engineering:**
    * To help models better understand the concept of "time," a new numerical feature called `car_age` (the age of the car at the time of sale) was **derived** using the `saledate` (object) and `year` (int) columns.
    * The `saledate` column was first converted to `datetime` format, and invalid dates were marked as `NaT`.
    * The formula `car_age = saledate.year - year` was applied.
    * After processing, the `saledate` and `year` columns were **removed** from the dataset.

4.  **Missing Value Imputation:**
    * **Numeric Columns (`odometer`, `condition`):** Missing values were imputed with the **median** value, which is more robust against outliers.
    * **Categorical Columns (`transmission`, `make`, `model`, `body`, `color`, etc.):** All categorical missing values, including over 65,000 missing `transmission` values, were **filled** with a new category named `"Unknown"` so that the model could learn "information absence" as a feature.

5.  **Result:** As a result of these operations, a cleaned dataset with no missing (NaN) values was obtained and saved as `car_prices_cleaned.csv`.

In [None]:
# Remove the identifier column (vin) and the leaky market-value column (mmr).
df = df.drop(columns=["vin", "mmr"])

In [None]:
# Keep only rows whose selling price is strictly above 100; rows with a
# missing price are dropped too, because NaN > 100 evaluates to False.
# .copy() detaches the result from the original frame so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = df[df["sellingprice"] > 100].copy()

In [None]:
# Parse the sale timestamp. Values that cannot be parsed become NaT, and
# utc=True converts every parsed timestamp to UTC.
df["saledate"] = pd.to_datetime(df["saledate"], errors="coerce", utc=True)
# Drop the timezone information (values stay expressed in UTC). The dtype
# check replaces pd.api.types.is_datetime64tz_dtype, which is deprecated in
# recent pandas releases.
if isinstance(df["saledate"].dtype, pd.DatetimeTZDtype):
    df["saledate"] = df["saledate"].dt.tz_convert(None)

# car_age needs both the sale date and the model year, so rows missing either
# one are removed first.
df = df.dropna(subset=["saledate", "year"])
# Year of sale (taken from the UTC timestamp) minus the model year.
sale_year = df["saledate"].dt.year.astype(int)
df["car_age"] = sale_year - df["year"].astype(int)
# The source columns are no longer needed once car_age exists.
df = df.drop(["year", "saledate"], axis=1)

In [None]:
# Text columns that receive the "Unknown" placeholder (seller is handled the
# same way, it is just not part of this list).
categorical_cols = ['make', 'model', 'trim', 'body', 'transmission', 'color', 'interior']

# Replacement value per column: the column median for the numeric features,
# the literal "Unknown" for the text features.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split performed in Step 3.
fill_values = {col: df[col].median() for col in ("odometer", "condition")}
fill_values.update({col: "Unknown" for col in [*categorical_cols, "seller"]})

for col, value in fill_values.items():
    df[col] = df[col].fillna(value)

In [None]:
# Sanity checks on the cleaned frame: size, dtypes, remaining NaNs, preview.
print(f"Number of rows remaining after cleaning: {len(df)}")
print("\n--- Cleaned Data Set Information (info) ---")
df.info()
# The header previously read "isecondsull().sum", a garbled spelling of the
# method that is actually called on the next line.
print("\n--- Missing Data Status of the Cleaned Dataset (isnull().sum()) ---")
print(df.isnull().sum())
print("\n--- First 5 Rows of the Cleaned Data (head) ---")
print(df.head())

In [None]:
# Persist the cleaned dataset; index=False leaves the row index out of the file.
df.to_csv("car_prices_cleaned.csv", index=False)

## Step 3: Advanced Data Transformation (Target Encoding) and Data Splitting

Machine learning models cannot process text data (`object`). Our dataset contains columns with thousands of unique categories (high cardinality), such as `model`, `trim`, and `seller`.

**Challenge:** Applying `One-Hot Encoding` (`pd.get_dummies`) to these columns would create over 15,000 new columns, leading to the "Curse of Dimensionality" and reducing the performance of models such as Linear Regression and KNN.

**Solution:** Instead of discarding these columns, an advanced technique called **Target Encoding** was used to increase accuracy.

1.  **What is Target Encoding?**
    * This technique replaces a categorical value (e.g., `model = 'Sorento'`) with the average target variable corresponding to that category (in our project, the average `sellingprice`).
    * This way, instead of 15,000+ columns, we obtain numerical columns containing *very powerful* information about the price.

2.  **Preventing Data Leakage:**
    * The biggest risk of this technique is data leakage. To prevent this, the dataset was split into 80% Training (`X_train`, `y_train`) and 20% Test (`X_test`, `y_test`) **before any encoding was performed**.
    * `TargetEncoder` from the `category_encoders` library was used.
    * The encoder was "trained" (`.fit()`) using **ONLY** the `X_train` and `y_train` data.
    * This trained encoder was applied to both the `X_train` and `X_test` data (`.transform()`). This ensured that no test data information leaked into the training phase.

3.  **Result:**
    * All categorical columns (`make`, `model`, `trim`, `body`, etc.) were successfully converted to numerical (float64) columns.
    * The dataset is now ready for the next stage: Scaling and Modelling.

In [None]:
# Reload the cleaned dataset written at the end of Step 2.
df_clean = pd.read_csv("car_prices_cleaned.csv")
# (rows, columns) of the cleaned data, rendered as the cell output.
df_clean.shape

In [None]:
# Separate the target from the features, then hold out 20% of the rows as the
# test set. The fixed random_state makes the split reproducible.
target_col = "sellingprice"
y = df_clean[target_col]
x = df_clean.drop(columns=target_col)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=7
)

In [None]:
# Every object-dtype feature column is target encoded in the next cell.
object_features = x_train.select_dtypes(include=["object"])
categorical_cols = list(object_features.columns)
print(f"Target Encoding will be applied to {len(categorical_cols)} columns found: {categorical_cols}")

In [None]:
# With handle_unknown="value", a category that is present in the test set but
# was never seen during fitting (e.g. a new 'model') is given the overall
# average price.
encoder = ce.TargetEncoder(cols=categorical_cols, handle_unknown="value")

# Fit on the training split only, so no test-set prices enter the encoding.
encoder.fit(x_train, y_train)

# Apply the fitted mapping to both splits.
x_train_encoded, x_test_encoded = (
    encoder.transform(split) for split in (x_train, x_test)
)
x_train_encoded.info()

In [None]:
# Write the encoded features and the targets to disk, without the row index.
encoded_outputs = {
    "x_train_encoded.csv": x_train_encoded,
    "x_test_encoded.csv": x_test_encoded,
    "y_train.csv": y_train,
    "y_test.csv": y_test,
}
for filename, frame in encoded_outputs.items():
    frame.to_csv(filename, index=False)

## Step 4: Feature Scaling

Although our dataset is now entirely numerical, one final challenge remains: the scales of the features (columns) vary greatly.

* `odometer` takes values in the 100,000s, while `condition` takes values between 1 and 5.
* Columns such as `make` and `model`, which we created with Target Encoding, represent average prices (e.g., 5,000 - 50,000).

**Problem:** Distance and coefficient-based models such as **Linear Regression** and **KNN (K-Nearest Neighbors)** consider the `odometer` column 10,000 times more important than the `condition` column simply because its numerical value is larger.

**Solution:** Standardise all features using `StandardScaler` from the `sklearn.preprocessing` library.

1.  **What is Standardisation?** It is the process of transforming all features so that their means are 0 (${\mu}=0$) and their standard deviations are 1 (${\sigma}=1$).
2.  **Preventing Data Leakage:** As with `TargetEncoder`, `StandardScaler` is "trained" (`.fit()`) ONLY on the `X_train` data, and this training (calculated mean and standard deviation) is applied to both the `X_train` and `X_test` data (`.transform()`).
3.  **Tree-Based Models:** Tree-based models such as Random Forest, XGBoost, and LightGBM are *not affected* by scaling. However, to fairly compare all 9 of our models on the same dataset, using scaled data would be a consistent approach.

**Result:** The `x_train_scaled.csv` and `x_test_scaled.csv` files have been created. The data preparation process is complete. Our data is now ready for modelling.

In [None]:
# Reload the target-encoded splits written at the end of Step 3.
x_train = pd.read_csv("x_train_encoded.csv")
x_test = pd.read_csv("x_test_encoded.csv")
# Each target file holds a single column; squeeze("columns") turns that
# one-column DataFrame into a Series. Restricting the squeeze to the column
# axis keeps the result a Series even if a file contains only one row, whereas
# a bare .squeeze() would collapse it to a scalar.
y_train = pd.read_csv("y_train.csv").squeeze("columns")
y_test = pd.read_csv("y_test.csv").squeeze("columns")

In [None]:
# Learn the scaling statistics from the training split only, so that no
# test-set information enters the scaling.
scaler = StandardScaler()
scaler.fit(x_train)

In [None]:
# .transform() returns a numpy array.
# To preserve the column names, we convert it back to a DataFrame.
x_train_scaled_np = scaler.transform(x_train)
# Label the training frame with the training columns; it was previously
# labelled with x_test.columns, which only works while both frames happen to
# share the same column order.
x_train_scaled = pd.DataFrame(x_train_scaled_np, columns=x_train.columns)
x_test_scaled_np = scaler.transform(x_test)
x_test_scaled = pd.DataFrame(x_test_scaled_np, columns=x_test.columns)
# Preview of the scaled training features (rendered as the cell output).
x_train_scaled.head()

In [None]:
# Summary statistics of the scaled training features, to inspect the result
# of the scaling step.
print(x_train_scaled.describe())

## Step 5: Baseline Model Training and Comparison

At this stage, nine different models were trained using **default parameters** and their performance was compared for a fair "Baseline Comparison". The aim was to see which models were "naturally" more suited to this dataset.

The evaluation metrics were set as R² (close to 1 = good), MAE (low = good) and RMSE (low = good).

### 5.1 Model 1: Linear Regression

Linear Regression is the most basic regression model, attempting to establish a linear relationship between all features and the target (`sellingprice`).

- Performance Results:
  - **R²:** 0.7615
  - **MAE:** 3092.15 
  - **RMSE:** 4780.80 

In [None]:
# Persist the scaled feature matrices; index=False leaves the row index out.
x_train_scaled.to_csv("x_train_scaled.csv", index=False)
x_test_scaled.to_csv("x_test_scaled.csv", index=False)

In [None]:
# Load the scaled features and the targets used by every model below.
# Note that x_train / x_test now refer to the *scaled* data.
x_train = pd.read_csv("x_train_scaled.csv")
x_test = pd.read_csv("x_test_scaled.csv")
# squeeze("columns") converts the one-column DataFrame into a Series and,
# unlike a bare .squeeze(), never collapses a single-row file to a scalar.
y_train = pd.read_csv("y_train.csv").squeeze("columns")
y_test = pd.read_csv("y_test.csv").squeeze("columns")

In [None]:
# Baseline 1/9: Linear Regression with default parameters.
model_lr = LinearRegression()

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() follows the wall clock and can jump
# when the system time is adjusted.
start_fit = time.perf_counter()
model_lr.fit(x_train, y_train)
end_fit = time.perf_counter()
print(f"The model was trained for {end_fit-start_fit:.2f} seconds.")

start_pred = time.perf_counter()
y_pred_lr = model_lr.predict(x_test)
end_pred = time.perf_counter()
print(f"The model predicted in {end_pred-start_pred:.2f} seconds.")

# Evaluate on the held-out test split; RMSE is the square root of the MSE.
r2_lr = r2_score(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
print("--- Linear Regression Results ---")
print(f"R-squared: {r2_lr:.4f}")
print(f"MAE: {mae_lr:.2f} $")
print(f"RMSE: {rmse_lr:.2f} $")

### 5.2 Model 2: KNN (K-Nearest Neighbors)

KNN is a "distance"-based model. To predict the price of a car, it takes the average price of the `k` neighbouring cars (from the training data) that most closely resemble that car in the test data in terms of features.

The **Feature Scaling** we performed in Step 4 was essential for this model to work. The model was run with the default parameter `k=5`.

- Performance Results:
  - **R²:** 0.8842
  - **MAE:** 2003.76
  - **RMSE:** 3331.80

In [None]:
# Baseline 2/9: K-Nearest Neighbours with default parameters; n_jobs=-1 lets
# the neighbour search use all CPU cores.
model_knn = KNeighborsRegressor(n_jobs=-1)

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() follows the wall clock and can jump
# when the system time is adjusted.
start_fit = time.perf_counter()
model_knn.fit(x_train, y_train)
end_fit = time.perf_counter()
print(f"The model was trained for {end_fit-start_fit:.2f} seconds.")

start_pred = time.perf_counter()
y_pred_knn = model_knn.predict(x_test)
end_pred = time.perf_counter()
print(f"The model predicted in {end_pred-start_pred:.2f} seconds.")

# Evaluate on the held-out test split; RMSE is the square root of the MSE.
r2_knn = r2_score(y_test, y_pred_knn)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)
print("\n--- KNN Results ---")
print(f"R-squared: {r2_knn:.4f}")
print(f"MAE: {mae_knn:.2f} $")
print(f"RMSE: {rmse_knn:.2f} $")

### 5.3 Model 3: Decision Tree

A Decision Tree is a model that works by splitting data according to a series of "if-then" rules. When run with default parameters (unlimited depth), it is highly prone to overfitting the training data. This test aims to measure the performance of the model in its natural (raw) state.

- Performance Results:
  - **R²:** 0.9077
  - **MAE:** 1728.58
  - **RMSE:** 2975.09

In [None]:
# Baseline 3/9: a single Decision Tree with default parameters; random_state
# fixes the seed so the run is reproducible.
model_dt = DecisionTreeRegressor(random_state=7)

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() follows the wall clock and can jump
# when the system time is adjusted.
start_fit = time.perf_counter()
model_dt.fit(x_train, y_train)
end_fit = time.perf_counter()
print(f"The model was trained for {end_fit-start_fit:.2f} seconds.")

start_pred = time.perf_counter()
y_pred_dt = model_dt.predict(x_test)
end_pred = time.perf_counter()
print(f"The model predicted in {end_pred-start_pred:.2f} seconds.")

# Evaluate on the held-out test split; RMSE is the square root of the MSE.
r2_dt = r2_score(y_test, y_pred_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)
print("\n--- Decision Tree Results ---")
print(f"R-squared: {r2_dt:.4f}")
print(f"MAE: {mae_dt:.2f} $")
print(f"RMSE: {rmse_dt:.2f} $")

### 5.4 Model 4: Random Forest

Random Forest is an ensemble learning method designed to solve the biggest problem of a single Decision Tree (Model 5.3), namely **overfitting**.

This model builds a "forest" consisting of **100** trees, each trained on a different random (bootstrap) sample of the training data, instead of relying on a single deep tree. When making a prediction, it takes the average of the predictions of these 100 trees. This approach makes the model much more stable and reliable.

- Performance Results:
  - **R²:** 0.9551
  - **MAE:** 1220.07
  - **RMSE:** 2075.03

In [None]:
# Baseline 4/9: Random Forest with default parameters; n_jobs=-1 uses all CPU
# cores and random_state fixes the seed.
model_rf = RandomForestRegressor(random_state=7, n_jobs=-1)

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() follows the wall clock and can jump
# when the system time is adjusted.
start_fit = time.perf_counter()
model_rf.fit(x_train, y_train)
end_fit = time.perf_counter()
print(f"The model was trained for {end_fit-start_fit:.2f} seconds.")

start_pred = time.perf_counter()
y_pred_rf = model_rf.predict(x_test)
end_pred = time.perf_counter()
print(f"The model predicted in {end_pred-start_pred:.2f} seconds.")

# Evaluate on the held-out test split; RMSE is the square root of the MSE.
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
print("\n--- Random Forest Results ---")
print(f"R-squared: {r2_rf:.4f}")
print(f"MAE: {mae_rf:.2f} $")
print(f"RMSE: {rmse_rf:.2f} $")

### 5.5 Model 5: Gradient Boosting (GBM)

After the "Bagging" method used by Random Forest, we test the Gradient Boosting model, the first and most classic member of the "Boosting" family.

* **Bagging (Random Forest):** 100 independent trees are trained in parallel and the results are averaged (Democracy).
* **Boosting (Gradient Boosting):** Trees are trained *sequentially*. Each new tree specialises in correcting the errors made by the previous tree.

This "learning from mistakes" approach typically produces very powerful and highly accurate models.

- Performance Results:
  - **R²:** 0.8700
  - **MAE:** 2220.40
  - **RMSE:** 3530.08

In [None]:
# Baseline 5/9: scikit-learn Gradient Boosting with default parameters;
# random_state fixes the seed.
model_gb = GradientBoostingRegressor(random_state=7)

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() follows the wall clock and can jump
# when the system time is adjusted.
start_fit = time.perf_counter()
model_gb.fit(x_train, y_train)
end_fit = time.perf_counter()
print(f"The model was trained for {end_fit-start_fit:.2f} seconds.")

start_pred = time.perf_counter()
y_pred_gb = model_gb.predict(x_test)
end_pred = time.perf_counter()
print(f"The model predicted in {end_pred-start_pred:.2f} seconds.")

# Evaluate on the held-out test split; RMSE is the square root of the MSE.
r2_gb = r2_score(y_test, y_pred_gb)
mae_gb = mean_absolute_error(y_test, y_pred_gb)
mse_gb = mean_squared_error(y_test, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
print("\n--- Gradient Boosting Results ---")
print(f"R-squared: {r2_gb:.4f}")
print(f"MAE: {mae_gb:.2f} $")
print(f"RMSE: {rmse_gb:.2f} $")

### 5.6 Model 6: XGBoost (eXtreme Gradient Boosting)

The `GradientBoostingRegressor` (scikit-learn) in Model 5.5, with its default parameters (MAE: 2220.40), lagged far behind Random Forest (MAE: 1220.07). This demonstrates the sensitivity of classic GBM to hyperparameters.

XGBoost ("Extreme Boosting") is a much faster and more efficient implementation of the same "boosting" idea. Predominantly used in data science competitions such as Kaggle, this model is much more advanced than classic GBM in terms of both parallelisation (speed) and regularisation (preventing overfitting).

- Performance Results:
  - **R²:** 0.9417
  - **MAE:** 1463.40
  - **RMSE:** 2363.51

In [None]:
# Baseline 6/9: XGBoost with default parameters; n_jobs=-1 uses all CPU cores
# and random_state fixes the seed.
model_xgb = XGBRegressor(random_state=7, n_jobs=-1)

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() follows the wall clock and can jump
# when the system time is adjusted.
start_fit = time.perf_counter()
model_xgb.fit(x_train, y_train)
end_fit = time.perf_counter()
print(f"The model was trained for {end_fit-start_fit:.2f} seconds.")

start_pred = time.perf_counter()
y_pred_xgb = model_xgb.predict(x_test)
end_pred = time.perf_counter()
print(f"The model predicted in {end_pred-start_pred:.2f} seconds.")

# Evaluate on the held-out test split; RMSE is the square root of the MSE.
r2_xgb = r2_score(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
print("\n--- XGBoost Results ---")
print(f"R-squared: {r2_xgb:.4f}")
print(f"MAE: {mae_xgb:.2f} $")
print(f"RMSE: {rmse_xgb:.2f} $")

### 5.7 Model 7: LightGBM (Light Gradient Boosting Machine) 
 The second member of the modern "boosting" family is LightGBM, developed by Microsoft. It is designed to be even faster than XGBoost. 
 While XGBoost grows trees "level-wise", LightGBM follows a "leaf-wise" strategy that minimises error. This provides a significant speed advantage, particularly with datasets containing hundreds of thousands of rows (like ours).

- Performance Results:
  - **R²:** 0.9229
  - **MAE:** 1736.79
  - **RMSE:** 2718.66

In [None]:
# Baseline 7/9: LightGBM with default parameters; verbosity=-1 silences the
# training log, n_jobs=-1 uses all CPU cores, random_state fixes the seed.
model_lgbm = LGBMRegressor(random_state=7, verbosity=-1, n_jobs=-1)

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() follows the wall clock and can jump
# when the system time is adjusted.
start_fit = time.perf_counter()
model_lgbm.fit(x_train, y_train)
end_fit = time.perf_counter()
print(f"The model was trained for {end_fit-start_fit:.2f} seconds.")

start_pred = time.perf_counter()
y_pred_lgbm = model_lgbm.predict(x_test)
end_pred = time.perf_counter()
print(f"The model predicted in {end_pred-start_pred:.2f} seconds.")

# Evaluate on the held-out test split; RMSE is the square root of the MSE.
r2_lgbm = r2_score(y_test, y_pred_lgbm)
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
rmse_lgbm = np.sqrt(mse_lgbm)
print("\n--- LightGBM Results ---")
print(f"R-squared: {r2_lgbm:.4f}")
print(f"MAE: {mae_lgbm:.2f} $")
print(f"RMSE: {rmse_lgbm:.2f} $")

### 5.8 Model 8: CatBoost (Categorical Boosting)

The latest member of the modern "boosting" family is CatBoost, developed by Yandex.

The most powerful feature of this model, as its name suggests, is its ability to process **categorical** data automatically and very efficiently (while preventing data leakage).

*Note: In our project, we had already converted all categorical data to numerical values using `Target Encoding` in Step 3. Therefore, we are not testing this core feature of CatBoost; we are feeding it our pre-processed numerical data, just like other models.*

Nevertheless, CatBoost is known as a model that generally produces very stable and highly accurate results with default parameters (1000 trees).

- Performance Results:
  - **R²:** 0.9494
  - **MAE:** 1366.16
  - **RMSE:** 2201.53

In [None]:
# Baseline 8/9: CatBoost with default parameters; silent=True suppresses the
# per-iteration training log and random_state fixes the seed.
model_cb = CatBoostRegressor(random_state=7, silent=True)

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() follows the wall clock and can jump
# when the system time is adjusted.
start_fit = time.perf_counter()
model_cb.fit(x_train, y_train)
end_fit = time.perf_counter()
print(f"The model was trained for {end_fit-start_fit:.2f} seconds.")

start_pred = time.perf_counter()
y_pred_cb = model_cb.predict(x_test)
end_pred = time.perf_counter()
print(f"The model predicted in {end_pred-start_pred:.2f} seconds.")

# Evaluate on the held-out test split; RMSE is the square root of the MSE.
r2_cb = r2_score(y_test, y_pred_cb)
mae_cb = mean_absolute_error(y_test, y_pred_cb)
mse_cb = mean_squared_error(y_test, y_pred_cb)
rmse_cb = np.sqrt(mse_cb)
print("\n--- CatBoost Results ---")
print(f"R-squared: {r2_cb:.4f}")
print(f"MAE: {mae_cb:.2f} $")
print(f"RMSE: {rmse_cb:.2f} $")

### 5.9 Model 9: AdaBoost (Adaptive Boosting)

The final model in our baseline comparison list is AdaBoost (Adaptive Boosting), the original progenitor of the "boosting" family.

This model uses a different "boosting" logic than GBM or XGBoost. Instead of correcting errors with a "gradient", it assigns more **"weight"** to the data points that the previous model predicted *incorrectly*. This forces the next model to focus on these "difficult" examples.

- Performance Results:
  - **R²:** 0.1338
  - **MAE:** 7901.81
  - **RMSE:** 9111.64

In [None]:
# Baseline 9/9: AdaBoost with default parameters; random_state fixes the seed.
model_ada = AdaBoostRegressor(random_state=7)

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() follows the wall clock and can jump
# when the system time is adjusted.
start_fit = time.perf_counter()
model_ada.fit(x_train, y_train)
end_fit = time.perf_counter()
print(f"The model was trained for {end_fit-start_fit:.2f} seconds.")

start_pred = time.perf_counter()
y_pred_ada = model_ada.predict(x_test)
end_pred = time.perf_counter()
print(f"The model predicted in {end_pred-start_pred:.2f} seconds.")

# Evaluate on the held-out test split; RMSE is the square root of the MSE.
r2_ada = r2_score(y_test, y_pred_ada)
mae_ada = mean_absolute_error(y_test, y_pred_ada)
mse_ada = mean_squared_error(y_test, y_pred_ada)
rmse_ada = np.sqrt(mse_ada)
print("\n--- AdaBoost Results ---")
print(f"R-squared: {r2_ada:.4f}")
print(f"MAE: {mae_ada:.2f} $")
print(f"RMSE: {rmse_ada:.2f} $")

## Step 5.10: Baseline Comparison Results (Summary Table)

Throughout Step 5, nine different machine learning models were trained with their **default parameters** and compared on the same test data. The purpose of this baseline comparison is to determine which algorithms are "naturally" suited to this dataset and which ones deserve to be "finalists" for optimisation (Step 6).

All models were run on the *same* training and test data prepared in Step 3 (`Target Encoding`) and Step 4 (`StandardScaler`).

### Analysis and Findings

This table provides very clear insights for our project:

1.  **Clear Winner (Accuracy):** **Random Forest** (RMSE: 2075.03) was the most successful model, delivering the lowest error with default parameters.
2.  **Best Finalists:** **CatBoost** (RMSE: 2201.53) and **XGBoost** (RMSE: 2363.51) proved the power of the "boosting" family as the second and third best models.
3.  **Speed:** **XGBoost** (1.67 seconds) and **LightGBM** (1.69 seconds) trained on roughly 316,000 rows of data far faster than any other ensemble model; only Linear Regression (0.19 seconds) was quicker. This makes them very attractive candidates for optimisation.
4.  **Major Disappointments:** Classic `Gradient Boosting` (RMSE: 3530.08) and `AdaBoost` (RMSE: 9111.64) proved how far behind modern boosting implementations (XGB, LGBM, CatBoost) they are.
5.  **Strong Starts:** Simpler models such as `Decision Tree` (RMSE: 2975.09) and `KNN` (RMSE: 3331.80) easily outperformed `Linear Regression` (RMSE: 4780.80) thanks to our features enhanced with `Target Encoding`.

In [None]:
# Baseline metrics recorded from the nine runs in Step 5, one tuple per model
# in the order of the column names below.
columns = ["Model", "R²", "MAE ($)", "RMSE ($)", "Train Time (seconds)"]
rows = [
    ("Linear Regression", 0.7615, 3092.15, 4780.8, 0.19),
    ("KNN", 0.8842, 2003.76, 3331.8, 2.28),
    ("Decision Tree", 0.9077, 1728.58, 2975.09, 5.56),
    ("Random Forest", 0.9551, 1220.07, 2075.03, 84.34),
    ("Gradient Boosting", 0.8700, 2220.40, 3530.08, 96.05),
    ("XGBoost", 0.9417, 1463.40, 2363.51, 1.67),
    ("LightGBM", 0.9229, 1736.79, 2718.66, 1.69),
    ("CatBoost", 0.9494, 1366.16, 2201.53, 34.16),
    ("AdaBoost", 0.1338, 7901.81, 9111.64, 37.49),
]
data = [dict(zip(columns, row)) for row in rows]

# Rank the models from lowest to highest RMSE, then print and save the table.
results_df = pd.DataFrame(data).sort_values(by="RMSE ($)", ascending=True)
print(results_df.to_string(index=False))
results_df.to_csv("model_baseline_comparison.csv", index=False)

## Step 6: Hyperparameter Optimisation (Finding the Best Parameters)

Step 5's "Basic Comparison" showed the results obtained by the models using their default parameters. In this step, to find the "best" model, which is the main objective of the project, we will take the **top 3 finalists (Random Forest, CatBoost, XGBoost)** from that table and subject them to **Hyperparameter Optimisation** (Tuning).

The goal is to reduce their error margins (RMSE) further by modifying these models' default settings (e.g., number of trees).

For this process, `sklearn.model_selection.RandomizedSearchCV` was used. This tool finds the best settings by trying random combinations from the specified parameter ranges (`n_iter=10`) and testing them with 3-fold cross-validation (`cv=3`).

### 6.1 Finalist 1: Random Forest Optimisation

**Reason:** It led the baseline comparison with the lowest RMSE value (2,075.03).
**Disadvantage:** As a single training run took about 84 seconds, this optimisation process (10 iter * 3 CV) was very slow.

* **Baseline RMSE:** 2075.03 
* **Optimised RMSE:** 2072.57
* **Best Parameters Found:**
    * `n_estimators`: 100
    * `max_features`: *1.0*
    * `max_depth`: *none*
    * `min_samples_split`: *5*
    * `min_samples_leaf`: *1*
* **Improvement:** *2.46* 

In [None]:
# Candidate values sampled by the randomised search.
param_dist = {
    'n_estimators': [100, 200, 300], # The number of trees in the forest
    'max_depth': [10, 20, 30, None], # Maximum depth of trees (None=unlimited)
    'max_features': ['sqrt', 'log2', 1.0], # Features considered when looking for the best split
    'min_samples_split': [2, 5],      # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4]       # Minimum samples required per leaf
}
rf = RandomForestRegressor(random_state=7)
# 10 random parameter combinations, each scored with 3-fold cross-validation.
# The scorer is the negated RMSE, so the search maximises it.
rf_random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=42,
    verbose=2 # Show progress while searching
)
# perf_counter() is the monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()
rf_random_search.fit(x_train, y_train)
end_time = time.perf_counter()
print(f"Optimisation completed. Duration: {(end_time - start_time) / 60:.2f} minutes.")
print(f"Best Parameters Found:\n {rf_random_search.best_params_}")
# best_estimator_ is the winning configuration returned by the search.
best_rf = rf_random_search.best_estimator_
y_pred_best_rf = best_rf.predict(x_test)

r2_best_rf = r2_score(y_test, y_pred_best_rf)
rmse_best_rf = np.sqrt(mean_squared_error(y_test, y_pred_best_rf))
print(f"Optimised R-Squared: {r2_best_rf:.4f}")
print(f"Optimised RMSE:      {rmse_best_rf:.2f} $")

# Baseline RMSE of the default Random Forest as recorded in the Step 5.10
# results table. The literal used here before (2027.03) did not match that
# table, which made the reported difference wrong.
baseline_rmse_rf = 2075.03
print(f"Baseline RMSE:       {baseline_rmse_rf:.2f} $")
print(f"Optimised RMSE:      {rmse_best_rf:.2f} $")
print(f"Recovery (RMSE):     {baseline_rmse_rf - rmse_best_rf:.2f} $")

### 6.2 Finalist 2: CatBoost Optimisation

**Reason:** It was a very strong "boosting" candidate, having the second lowest RMSE (2201.53) in the baseline comparison. Unlike Random Forest, CatBoost is generally expected to respond very well to optimisation.

* **Baseline RMSE:** 2201.53 
* **Optimised RMSE:** *2108.06* 
* **Best Parameters Found:**
    * `iterations`: *1000*
    * `learning_rate`: *0.05*
    * `depth`: *10*
    * `l2_leaf_reg`: *5*
* **Improvement:** *93.47* 

In [None]:
# Candidate values sampled by the randomised search.
param_dist = {
    'iterations': [500, 1000, 1500],  # Number of trees (baseline was 1000)
    'depth': [6, 8, 10],              # Tree depth
    'learning_rate': [0.01, 0.05, 0.1], # Step size of each boosting iteration
    'l2_leaf_reg': [1, 3, 5]          # L2 regularisation strength (limits overfitting)
}
cb = CatBoostRegressor(random_state=7, silent=True)
# 10 random parameter combinations, each scored with 3-fold cross-validation.
# The scorer is the negated RMSE, so the search maximises it.
cb_random_search = RandomizedSearchCV(
    estimator=cb,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# perf_counter() is the monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()
cb_random_search.fit(x_train, y_train)
end_time = time.perf_counter()
print(f"Optimisation completed. Duration: {(end_time - start_time) / 60:.2f} minutes.")
print(f"Best Parameters Found:\n {cb_random_search.best_params_}")
# best_estimator_ is the winning configuration returned by the search.
best_cb = cb_random_search.best_estimator_
y_pred_best_cb = best_cb.predict(x_test)

r2_best_cb = r2_score(y_test, y_pred_best_cb)
rmse_best_cb = np.sqrt(mean_squared_error(y_test, y_pred_best_cb))
print(f"Optimised R-Squared: {r2_best_cb:.4f}")
print(f"Optimised RMSE:      {rmse_best_cb:.2f} $")

# Baseline RMSE of the default CatBoost model as recorded in the Step 5.10
# results table. The literal used here before (2112.82) did not match that
# table, which made the reported difference wrong.
baseline_rmse_cb = 2201.53
print(f"Baseline RMSE:       {baseline_rmse_cb:.2f} $")
print(f"Optimised RMSE:      {rmse_best_cb:.2f} $")
print(f"Recovery (RMSE):     {baseline_rmse_cb - rmse_best_cb:.2f} $")

### 6.3 Finalist 3: XGBoost Optimisation

**Reason:** It had the third-best RMSE (2,363.51) in the baseline comparison. However, its biggest advantage was its **speed**; it was trained in 1.67 seconds. This offers the potential to make the optimisation (tuning) process extremely efficient.

* **Baseline RMSE:** 2287.18
* **Optimised RMSE:** 1902.00
* **Best Found Parameters:**
    * `subsample`: *1.0*
    * `n_estimators`: *1000*
    * `max_depth`: *7*
    * `learning_rate`: *0.1*
    * `gamma`: *0.1*
* **Improvement:** *385.18* 

In [None]:
# Candidate hyperparameter values for XGBoost (4 * 4 * 3 * 3 * 3 = 432 combinations).
param_dist = {
    'n_estimators': [100, 300, 500, 1000],  # number of boosted trees
    'max_depth': [3, 5, 7, 9],              # maximum depth of each tree
    'learning_rate': [0.01, 0.05, 0.1],     # step size of each boosting update
    'subsample': [0.7, 0.8, 1.0],           # fraction of rows sampled per tree
    'gamma': [0, 0.1, 0.2],                 # penalty term that counteracts overfitting
}

# Baseline XGBoost test RMSE reported earlier in the notebook; reference for the comparison.
xgb_baseline_rmse = 2287.18

# NOTE(review): both the estimator and the search below request n_jobs=-1; nested
# parallelism like this may oversubscribe CPU threads -- confirm this is intended.
xgb = XGBRegressor(random_state=7, n_jobs=-1)

# Randomised search: 10 sampled configurations, 3-fold CV, scored by negated RMSE.
xgb_search_options = dict(
    n_iter=10,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=42,
    verbose=2,
)
xgb_random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    **xgb_search_options,
)

# Time only the search itself.
start_time = time.time()
xgb_random_search.fit(x_train, y_train)
end_time = time.time()

elapsed_minutes = (end_time - start_time) / 60
print(f"Optimisation completed. Duration: {elapsed_minutes:.2f} minutes.")
print(f"Best Parameters Found:\n {xgb_random_search.best_params_}")

# Evaluate the best configuration found by the search on the held-out test set.
best_xgb = xgb_random_search.best_estimator_
y_pred_best_xgb = best_xgb.predict(x_test)

rmse_best_xgb = np.sqrt(mean_squared_error(y_test, y_pred_best_xgb))
r2_best_xgb = r2_score(y_test, y_pred_best_xgb)

optimised_rmse_line = f"Optimised RMSE:      {rmse_best_xgb:.2f} $"
print(f"Optimised R-Squared: {r2_best_xgb:.4f}")
print(optimised_rmse_line)

# Side-by-side comparison against the baseline score.
print(f"Baseline RMSE:       {xgb_baseline_rmse:.2f} $")
print(optimised_rmse_line)
print(f"Recovery (RMSE):     {xgb_baseline_rmse - rmse_best_xgb:.2f} $")

### 6.4 Validating the Final Model: Overfitting Test

In Step 6.3, we found that the **Optimised XGBoost** model gave the best *test* score with an RMSE of 1902.00.

However, a model being "best" means not only having a low test score, but also being *robust*. In this step, a validation test was performed to check whether the model "memorised" the training data (overfitting).

The model was trained with the best parameters (`n_estimators=1000`, `max_depth=7`, etc.) and made predictions on both the *training data* (seen before) and the *test data* (never seen before).

* **Training RMSE:** *1565.98* 
* **Test RMSE:** *1902.00* 
* **Difference (Test - Training):** *336.01* 

In [None]:
def _rmse(actual, predicted):
    """Return the root mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(actual, predicted))


# Best hyperparameters found by the XGBoost search, plus fixed seed and parallelism.
best_params_xgb = dict(
    subsample=1.0,
    n_estimators=1000,
    max_depth=7,
    learning_rate=0.1,
    gamma=0.1,
    random_state=42,
    n_jobs=-1,
)

# Refit a fresh model on the training split with those parameters.
model_xgb_final = XGBRegressor(**best_params_xgb)
model_xgb_final.fit(x_train, y_train)

# Error on data the model has not seen during fitting.
print("... Predictions are being made on the test data....")
y_pred_test = model_xgb_final.predict(x_test)
rmse_test = _rmse(y_test, y_pred_test)

# Error on the very data the model was fitted on.
print("... Predictions are being made on training data...")
y_pred_train = model_xgb_final.predict(x_train)
rmse_train = _rmse(y_train, y_pred_train)

# A large gap or ratio between the two errors would indicate overfitting.
rmse_gap = rmse_test - rmse_train
rmse_ratio = rmse_test / rmse_train

print(f"Train RMSE: {rmse_train:.2f} $")
print(f"Test RMSE:  {rmse_test:.2f} $")
print("---------------------------------------------")
print(f"Difference (Test - Train):         {rmse_gap:.2f} $")
print(f"Test Error / Training Error Ratio: {rmse_ratio:.2f}")

### 6.5 Visual Validation of the Final Model: Learning Curve

The best way to prove how "healthy" the model (Optimised XGBoost) is, is to visualise the training process.

The graph below shows how the error (RMSE) on both the Training and Test data changed *each time a tree was added* throughout the model's 1000-tree (`n_estimators`) training process.

<center><img src="xgboost_learning_curve.png" alt="Drawing" style="width: 900px;"/></center>

**Graph Analysis:**

* **Training RMSE Line (Blue):** Shows that the model reduces its error (as expected) to approximately 1566 as it sees the training data. The fact that it does not drop to zero (overfitting) is a healthy sign.
* **Test RMSE Line (Orange):** This is the model's error on data it has never seen before. This line also decreases *parallel* to the training line and reaches a **stable (flat) plateau** at around 1900 after approximately 200-300 trees.

**Conclusion:** The fact that the test line does not rise while the training line falls, but instead follows it in a parallel and stable manner, visually demonstrates that our model is not overfitting and is successfully generalising.

In [None]:
# Best hyperparameters found by the XGBoost search, plus fixed seed and parallelism.
best_params_xgb = {
    'subsample': 1.0,
    'n_estimators': 1000,
    'max_depth': 7,
    'learning_rate': 0.1,
    'gamma': 0.1,
    'random_state': 42,
    'n_jobs': -1,
}
model_xgb_final = XGBRegressor(**best_params_xgb)

# Track RMSE on both splits after every boosting round. No early stopping is
# configured, so the test split is only monitored here.
# 'validation_0' is the first eval_set entry (train), 'validation_1' the second (test).
eval_set = [(x_train, y_train), (x_test, y_test)]
model_xgb_final.fit(x_train, y_train, eval_set=eval_set, verbose=False)
results = model_xgb_final.evals_result()
train_rmse = results['validation_0']['rmse']
test_rmse = results['validation_1']['rmse']
iterations = range(1, len(train_rmse) + 1)

plt.figure(figsize=(12, 7))
sns.lineplot(x=iterations, y=train_rmse, label='Train RMSE')
sns.lineplot(x=iterations, y=test_rmse, label='Test RMSE')
plt.title('XGBoost Learning Curve\n(Overfitting Test)', fontsize=16)
plt.xlabel('Number of Trees (Iteration)', fontsize=12)
plt.ylabel('RMSE', fontsize=12)
plt.legend(fontsize=12)
plt.grid(True)
# Size the y-axis to the larger of the two peaks (plus 10% headroom) so that
# neither curve can be clipped at the top of the plot.
plt.ylim(bottom=0, top=max(max(train_rmse), max(test_rmse)) * 1.1)
plt.savefig("xgboost_learning_curve.png", dpi=300, bbox_inches='tight')
plt.show()

## Step 7: Final Results and Project Summary

This project aimed to solve a vehicle price prediction problem on the `car_prices.csv` dataset using 9 different machine learning models.

The project flow consisted of the following steps:
1.  **Step 1: Exploratory Data Analysis (EDA)** (Visual and numerical analysis)
2.  **Step 2: Data Cleaning and Feature Engineering** (`car_age` was created, `mmr` leakage was prevented)
3.  **Step 3: Advanced Encoding** (`Target Encoding` was used for high-cardinality columns such as `model` and `trim`)
4.  **Step 4: Feature Scaling** (`StandardScaler` applied for KNN and LinReg)
5.  **Step 5: Baseline Comparison** (9 models tested with default parameters and **Random Forest** (RMSE: $2027) selected as leader)
6.  **Step 6: Hyperparameter Optimisation** (The top 3 finalists - RF, CatBoost, XGBoost - were optimised using `RandomizedSearchCV`)

### The Project's Final Winner

The optimisation tests conducted in Step 6 altered the baseline comparison results from Step 5 and determined the project's final winner.

* **Random Forest** optimisation failed due to memory errors and overfitting (RMSE: 2072.57), failing to surpass the default model (RMSE: 2027.03).
* **CatBoost** optimisation took 10.04 minutes and took the lead by reducing the RMSE from 2112.82 to **2108.06**.
* **XGBoost** optimisation took only **2.41 minutes** and achieved an **overwhelming superiority** by reducing the RMSE from 2287.18 to **1902.00**.

### FINAL WINNING TABLE (Based on RMSE)

| Model | Status | RMSE ($) | R² | Optimisation Time (Minutes) | Description |
| :--- | :--- | :--- | :--- | :--- | :--- |
| **XGBoost (Optimised)** | **Project Winner** | **1902.00** | **0.9623** | **2.41** | **Best balance of speed and accuracy.** |
| CatBoost (Optimised) | Second | 2108.06 | 0.9536 | 10.04 | Very strong and stable accuracy. |
| Random Forest (Baseline) | Third | 2072.57 | 0.9552 | 21.15 | Optimisation failed (Overfitting). |

### Project Result

In this project, 9 models were compared, and the **Optimised XGBoost** model was selected as the best price prediction model with an RMSE of $1902.00 and an R² score of 96.23%.

(The project's best parameters: `{'subsample': 1.0, 'n_estimators': 1000, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0.1}`)


In [None]:
# Export the option lists (options.json) and the fitted artefacts (model, encoder, scaler).


def _sorted_unique_by(frame, key_col, value_col):
    """Map each distinct `key_col` value to the sorted unique `value_col` values seen with it."""
    grouped = frame.groupby(key_col)[value_col].unique()
    return grouped.apply(lambda values: sorted(values)).to_dict()


# Normalise the free-text categorical columns to Title Case without surrounding spaces.
# NOTE(review): this rewrites `df` in place; confirm the casing produced here matches
# the casing the saved encoder was fitted on. Also note that astype(str) turns missing
# values into the literal string "Nan" -- verify these columns contain no NaN here.
categorical_cols = ['make', 'model', 'trim', 'body', 'transmission', 'color', 'interior', 'seller']
for col in categorical_cols:
    df[col] = df[col].astype(str).str.title().str.strip()

# State codes are kept as upper-case abbreviations.
df['state'] = df['state'].str.upper().str.strip()

us_states = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California',
    'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia',
    'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa',
    'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
    'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri',
    'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey',
    'NM': 'New Mexico', 'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio',
    'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
    'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont',
    'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming'
}

# Display name -> state code. Codes missing from `us_states` fall back to the code itself.
# Missing states are dropped first: sorting NaN together with strings raises TypeError.
state_codes = sorted(df['state'].dropna().unique())
state_ui_map = {us_states.get(code, code): code for code in state_codes}

options_optimized = {
    "makes": sorted(df['make'].unique().tolist()),
    "transmissions": sorted(df['transmission'].unique().tolist()),
    "colors": sorted(df['color'].unique().tolist()),
    "interiors": sorted(df['interior'].unique().tolist()),
    "states_map": state_ui_map,
    # Dependent option lists, keyed by the parent selection.
    "make_models": _sorted_unique_by(df, 'make', 'model'),
    "model_trims": _sorted_unique_by(df, 'model', 'trim'),
    "model_bodies": _sorted_unique_by(df, 'model', 'body'),
    "make_sellers": _sorted_unique_by(df, 'make', 'seller'),
}

# Explicit encoding keeps the output independent of the platform default.
with open('options.json', 'w', encoding='utf-8') as f:
    json.dump(options_optimized, f)

# Persist the fitted model together with the preprocessing objects.
joblib.dump(model_xgb_final, 'xgb_model.joblib')
joblib.dump(encoder, 'target_encoder.joblib')
joblib.dump(scaler, 'scaler.joblib')