In [1]:
import pandas as pd

In [2]:
# Load the raw car-price dataset from the project's data directory.
data = pd.read_csv(filepath_or_buffer="./data/car_price_prediction.csv")

In [3]:
# Strip the " km" unit suffix and the "," separators, then convert Mileage to float.
# The dtype guard keeps this cell idempotent: once the column is numeric the
# `.str` accessor would raise, so a re-run simply leaves the column untouched.
# regex=False makes both replacements literal regardless of the installed
# pandas version's default for `regex`.
if not pd.api.types.is_numeric_dtype(data['Mileage']):
    data['Mileage'] = (
        data['Mileage']
        .str.replace(' km', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Column to predict.
target = "Price"

# Input columns fed to the models (two numeric, four categorical).
features = [
    "Manufacturer",
    "Model",
    "Prod. year",
    "Category",
    "Fuel type",
    "Mileage",
]

# Feature matrix and target vector taken from the loaded dataset.
X = data.loc[:, features]
y = data.loc[:, target]

In [5]:
# Imported in this cell so the cell runs on its own; the target ("Price") is
# scored with regression metrics elsewhere in this notebook, so the forest
# model must be a regressor rather than a classifier.
from sklearn.ensemble import RandomForestRegressor

# Numeric columns are standardized.
numeric_features = ["Prod. year", "Mileage"]
numeric_transformer = StandardScaler()

# Categorical columns are one-hot encoded; handle_unknown='ignore' means a
# category not seen during fit is ignored at transform time instead of raising.
categorical_features = ["Manufacturer", "Model", "Category", "Fuel type"]
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine preprocessing steps: each transformer is applied to its own column list.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Preprocessing followed by ordinary least-squares regression.
# Note: both pipelines below reference the same `preprocessor` object.
linear_model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                        ('model', LinearRegression())])

# Preprocessing followed by a random forest regressor (seeded for reproducibility).
rf_model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('model', RandomForestRegressor(random_state=42))])

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Fit the preprocessing + linear regression pipeline on the training split.
linear_model_pipeline.fit(X_train, y_train)

# Predict prices for the held-out split.
y_pred = linear_model_pipeline.predict(X_test)

# Score the predictions against the true test prices.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# One metric per output line.
print(
    f"Linear Regression Model - Mean Absolute Error: {mae}",
    f"Linear Regression Model - Mean Squared Error: {mse}",
    f"Linear Regression Model - R² Score: {r2}",
    sep="\n",
)


Linear Regression Model - Mean Absolute Error: 11304.207092737768
Linear Regression Model - Mean Squared Error: 1682485453.671205
Linear Regression Model - R² Score: -4.399562685837752


In [19]:
import os

import joblib

# Persist the fitted linear pipeline under ./models/, which is the location
# the loading cell of this notebook reads "Linear Regressor" back from.
filename = "./models/Linear Regressor"

# Create the target directory on a fresh checkout so open() cannot fail on it.
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, "wb") as file:
    joblib.dump(linear_model_pipeline, file)

# Report only after the file handle has been closed.
print(f"the linear regressor has be saved as {filename}")

the linear regressor has be saved as Linear Regressor


In [9]:
# Example: Using Random Forest Regressor for comparison

from sklearn.ensemble import RandomForestRegressor

# Same preprocessing as the linear model, followed by a seeded random forest regressor.
rf_model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(random_state=42)),
    ]
)

# Fit on the training split.
rf_model_pipeline.fit(X_train, y_train)

# Predict prices for the held-out split.
y_pred_rf = rf_model_pipeline.predict(X_test)

# Score the predictions against the true test prices.
mae_rf = mean_absolute_error(y_true=y_test, y_pred=y_pred_rf)
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)

# One metric per output line.
print(
    f"Random Forest Regressor - Mean Absolute Error: {mae_rf}",
    f"Random Forest Regressor - Mean Squared Error: {mse_rf}",
    f"Random Forest Regressor - R² Score: {r2_rf}",
    sep="\n",
)


Random Forest Regressor - Mean Absolute Error: 5806.274518642822
Random Forest Regressor - Mean Squared Error: 149938850.64224634
Random Forest Regressor - R² Score: 0.5188046224597626


In [10]:
import os

import joblib

# Persist the fitted random forest pipeline under ./models/, which is the
# location the loading cell of this notebook reads "Random Forest Regressor"
# back from.
model_filename = "./models/Random Forest Regressor"

# Create the target directory on a fresh checkout so open() cannot fail on it.
os.makedirs(os.path.dirname(model_filename), exist_ok=True)

with open(model_filename, "wb") as file:
    joblib.dump(rf_model_pipeline, file)

# Report only after the file handle has been closed.
print(f"the model has be saved to {model_filename}")


the model has be saved to Random Forest Regressor


In [23]:
# Restore both persisted pipelines from the ./models directory.
model = joblib.load("./models/Random Forest Regressor")
model2 = joblib.load("./models/Linear Regressor")

# Two example cars, one record per row, using the same six columns as `features`.
# NOTE(review): 'Honda' is mixed-case while 'MERCEDES-BENZ' is upper-case; with
# handle_unknown='ignore' a manufacturer value not seen during training would be
# silently ignored rather than rejected — confirm the casing used in the dataset.
new_input_data = pd.DataFrame([
    {
        'Manufacturer': 'MERCEDES-BENZ',
        'Model': 'E 350',
        'Prod. year': 2014,
        'Category': 'Sedan',
        'Fuel type': 'Diesel',
        'Mileage': 184467,
    },
    {
        'Manufacturer': 'Honda',
        'Model': 'Civic',
        'Prod. year': 2015,
        'Category': 'Sedan',
        'Fuel type': 'Petrol',
        'Mileage': 80000,
    },
])

# Random forest predictions first, linear regression predictions second.
predcitons = model.predict(new_input_data)
predcitons2 = model2.predict(new_input_data)

# Show both prediction arrays, one per line.
print(predcitons, predcitons2, sep="\n")

# random forest regressor is best

[  933.47 17945.61]
[12603.41633847 46627.05744075]
