In [1]:
# Libraries

import joblib
import json
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

# Pre-process Car Data

In [2]:
# Load dataset and remove duplicates
df = pd.read_csv('cars.csv')
df = df.drop_duplicates()

# List of common colours
car_colours = [
    'Black', 'White', 'Silver', 'Grey', 'Gray', 'Red', 'Blue', 'Green', 'Yellow',
    'Orange', 'Brown', 'Beige', 'Gold', 'Maroon', 'Purple', 'Pink'
]

# List of acceptable body types
body_types = [
    'SUV', 'Minivan', 'Coupe', 'Convertible', 'Sedan', 'Truck', 'Hatchback',
    'Wagon', 'Cabriolet', 'Compact', 'Van', 'Cab'
]

# Clean each row individually
for index, row in df.iterrows():
    # Fix Kilometres: set to '0 km' if missing and car is New
    if pd.isna(row['Kilometres']) and row.get('Status') == 'New':
        df.at[index, 'Kilometres'] = '0 km'

    # Normalize transmission values
    transmission = row.get('Transmission')
    if isinstance(transmission, str):
        transmission_lower = transmission.lower()
        if 'automatic' in transmission_lower:
            df.at[index, 'Transmission'] = 'automatic'
        elif 'manual' in transmission_lower:
            df.at[index, 'Transmission'] = 'manual'
        else:
            df.at[index, 'Transmission'] = None
    else:
        df.at[index, 'Transmission'] = None

    # Normalize exterior colour
    colour = row.get('Exterior Colour')
    if isinstance(colour, str):
        matched_colour = None
        for car_colour in car_colours:
            if car_colour.lower() in colour.lower():
                matched_colour = 'Grey' if car_colour == 'Gray' else car_colour
                break
        df.at[index, 'Exterior Colour'] = matched_colour
    else:
        df.at[index, 'Exterior Colour'] = None

    # Normalize body type
    body = row.get('Body Type')
    if isinstance(body, str):
        matched_type = None
        for bt in body_types:
            if bt.lower() in body.lower():
                matched_type = bt
                break
        df.at[index, 'Body Type'] = matched_type
    else:
        df.at[index, 'Body Type'] = None

# Define final columns to keep
output_column = ['Price']
input_columns = ['Year', 'Make', 'Model', 'Kilometres']
columns_to_keep = output_column + input_columns

# Retain only necessary columns and drop rows with missing data
df = df[columns_to_keep]
df = df.dropna()

# Convert 'Price' and 'Kilometres' to integers
def parse_numeric(value):
    """Remove commas and extract numeric portion from strings like '34,000 km'."""
    if isinstance(value, str):
        value = value.split()[0].replace(',', '')
    return int(value)

df['Price'] = df['Price'].apply(parse_numeric)
df['Kilometres'] = df['Kilometres'].apply(parse_numeric)

# Save the cleaned dataset
df.to_csv("cars_preprocessed.csv", index=False)

# One Hot Encode Make & Model Cols

In [3]:
# Initialize OneHotEncoder with dense output
encoder = OneHotEncoder(sparse_output=False)

# Categorical columns to encode
categorical_cols = ['Make', 'Model']

def one_hot_encode_categorical_cols(
    df: pd.DataFrame,
    categorical_cols: list[str],
    encoder: OneHotEncoder,
    fit: bool = False
) -> pd.DataFrame:
    """
    One-hot encodes specified categorical columns in the DataFrame.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - categorical_cols (list[str]): Column names to be one-hot encoded.
    - encoder (OneHotEncoder): A scikit-learn OneHotEncoder instance.
    - fit (bool): If True, fits the encoder before transforming; otherwise, assumes encoder is already fitted.

    Returns:
    - pd.DataFrame: DataFrame with categorical columns replaced by one-hot encoded features.
    """
    if fit:
        encoded_data = encoder.fit_transform(df[categorical_cols])
        joblib.dump(encoder, 'encoder.pkl') # Save the fitted scaler for reuse
    else:
        encoded_data = encoder.transform(df[categorical_cols])

    # Create DataFrame from encoded array
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=df.index
    )

    # Drop original categorical columns and concatenate encoded columns
    df = df.drop(columns=categorical_cols)
    return pd.concat([df, encoded_df], axis=1)

# Apply one-hot encoding to the DataFrame
df_encoded = one_hot_encode_categorical_cols(df, categorical_cols, encoder, fit=True)

# Scale Km & Year Cols

In [4]:
# Define numerical columns to scale
numerical_cols = ['Kilometres', 'Year']
scaler = StandardScaler()

def scale_numerical_cols(
    df: pd.DataFrame,
    numerical_cols: list[str],
    scaler: StandardScaler,
    fit: bool = False
) -> pd.DataFrame:
    """
    Scales numerical columns in the DataFrame using the provided scaler.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - numerical_cols (list[str]): List of column names to be scaled.
    - scaler (StandardScaler): A scikit-learn scaler (e.g., StandardScaler, MinMaxScaler).
    - fit (bool): If True, fits the scaler to the data before transforming. If False, uses a pre-fitted scaler.

    Returns:
    - pd.DataFrame: A DataFrame with scaled numerical columns and original non-numerical columns.
    """
    # Separate numerical and non-numerical columns
    numerical_data = df[numerical_cols]
    other_data = df.drop(columns=numerical_cols)

    # Fit and/or transform the numerical columns
    if fit:
        scaled_values = scaler.fit_transform(numerical_data)
        joblib.dump(scaler, 'scaler_x.pkl')  # Save the fitted scaler for reuse
    else:
        scaled_values = scaler.transform(numerical_data)

    # Create DataFrame for scaled values with original indices
    scaled_df = pd.DataFrame(scaled_values, columns=numerical_cols, index=df.index)

    # Combine scaled numerical data with unscaled data
    return pd.concat([scaled_df, other_data], axis=1)

# Apply scaling to the encoded DataFrame
df_scaled = scale_numerical_cols(df_encoded, numerical_cols, scaler, fit=True)

# Split Data into Train / Test

In [5]:
# Separate features and target
X = df_scaled.drop('Price', axis=1)
Y = df_scaled['Price']  # Preserves the original (unscaled) price

# Split into train and test sets (still unscaled)
X_train, X_test, y_train, y_test = train_test_split(
    X.values, Y.values, test_size=0.2, shuffle=True, random_state=42
)

# Scale the training target only (to avoid data leakage)
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1))

# Save the fitted scaler after training
joblib.dump(scaler_y, 'scaler_y.pkl')

['scaler_y.pkl']

# Create Model

In [6]:
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=1, activation='linear'))  # Good: linear output for regression

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [7]:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

In [8]:
# Convert input features to float32
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

# Convert scaled targets to float32 (used for training)
y_train_scaled = y_train_scaled.astype(np.float32)
y_test_scaled = y_test_scaled.astype(np.float32)

# Train Model

In [10]:
# Train the model
model.fit(X_train, y_train_scaled, epochs=100, batch_size=32, validation_split=0.2)

Epoch 1/100
[1m1179/1179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - loss: 182402528.0000 - mean_absolute_error: 11613.7852 - val_loss: 42617084.0000 - val_mean_absolute_error: 5706.3721
Epoch 2/100
[1m1179/1179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 31861438.0000 - mean_absolute_error: 4900.9326 - val_loss: 12119816.0000 - val_mean_absolute_error: 3035.7346
Epoch 3/100
[1m1179/1179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 9494092.0000 - mean_absolute_error: 2671.8386 - val_loss: 4192601.7500 - val_mean_absolute_error: 1781.6334
Epoch 4/100
[1m1179/1179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 3361795.0000 - mean_absolute_error: 1590.7496 - val_loss: 1571033.1250 - val_mean_absolute_error: 1088.0798
Epoch 5/100
[1m1179/1179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 1260238.1250 - mean_absolute_error: 972.1578 - val_loss: 607524.8125 - val_m

<keras.src.callbacks.history.History at 0x7f238ab17890>

# Compare Price Diff

In [11]:
# Save the trained model
model.save('car_price_prediction_model.keras')

# Predict on test set (scaled input → scaled output)
y_pred_scaled = model.predict(X_test)

# Inverse transform predictions and true labels to original price scale
y_pred_original = scaler_y.inverse_transform(y_pred_scaled)
y_test_original = scaler_y.inverse_transform(y_test_scaled)

# Compute average absolute error
absolute_errors = np.abs(y_test_original - y_pred_original)
average_error = np.mean(absolute_errors)

print(f"Average error is ${average_error:.2f}")

[1m369/369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Average error is $6169.04
