In [3]:
import pandas as pd 
import streamlit as st
from sklearn.model_selection import cross_val_score # store the scores 
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, Lasso
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import plotly.express as px
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [4]:
df = pd.read_csv("clean_data.csv")
# feature transofrmation
df["age"] = 2024-df["year"]
df["mile_per_year"] = df["mileage_in_km"]/df["age"]
df["per_liter_km"] = 100/df["fuel_consumption_l_100km"]


In [20]:
df.mile_per_year.max()

225000.0

In [93]:
# Define features and Target (Regressand and Regressor)
y = df["price_in_euro"]
X = df[['brand', 'model', 'power_ps', 'transmission_type', 'fuel_type', 'age', 'mile_per_year', 'per_liter_km']]

In [94]:
# Preprocessing for categorical and numerical data using sklearn ColumnTransformer model

preprocessor = ColumnTransformer(transformers= [
    ("num", StandardScaler(), ["power_ps", "age", "mile_per_year", "per_liter_km"]),
    ("cat", OneHotEncoder(handle_unknown="ignore",drop='first'),["brand", "model", "power_ps","transmission_type", "fuel_type"])
]
)

In [95]:
# create a pipeline that combines preprocessing and model training, 
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor), 
    ('regressor', XGBRegressor(
        
    ))
])

In [96]:
# Train Test Split on the training data 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)

In [97]:
# fit the model on the training data
pipeline.fit(X_train, y_train)

In [98]:
# predict the model on the test set 
y_predict = pipeline.predict(X_test)




In [99]:
# create a new dataframe with actual and predicted value, append fetures from actual data for visualizing on toggle. 
actual_predicted_error = pd.DataFrame({
    "actual": y_test,
    "predicted": y_predict.round(),
    "error": y_test -y_predict,
    "brand": X_test["brand"],
    "model": X_test["model"],
    "age": X_test["age"]
})


In [None]:
# saving the results into csv file. this is used for visualization on machine lerning radio
actual_predicted_error.to_csv(f"{str(pipeline[1])[:-2]}_results.csv", index=False)

In [105]:
#best_alpha = pipeline.named_steps['regressor'].alpha_ # Show best alpha which minimizes the loss function. 
#best_l1_ratio = pipeline.named_steps['regressor'].l1_ratio_ # show best regularization ratio of l1 and l2, used in elasticlasso model
#print(best_alpha)
#print(best_l1_ratio)
# Calculate MSE on the test set
MAE = mean_absolute_error(y_test, y_predict)
print(f"Mean Absolute Error: {MAE:.2f}")
r2_score(y_test, y_predict)


Mean Absolute Error: 3871.58


0.7204550460272603