In [1]:
# import basic modules
import pickle

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="audi.csv")

In [3]:
# Preview the first five rows to check the columns and their values.
df.head(n=5)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0


In [4]:
# Separate the regression target (`price`) from the remaining columns.
# X keeps every other column; which of them are actually used as features is
# decided later by the ColumnTransformer.
y = df["price"]
X = df.drop(columns="price")

In [5]:
# Numeric features: fill missing values with the column median, then
# standardise with StandardScaler.
numeric_features = ["year", "mileage", "tax", "mpg", "engineSize"]
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [6]:
# Categorical features: one-hot encode; handle_unknown="ignore" makes categories
# unseen during fit encode as all zeros instead of raising at predict time.
# NOTE(review): the `model` column shown in df.head() is not listed here, so it
# is not used as a feature -- confirm that omission is intentional.
categorical_features = ["transmission", "fuelType"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [7]:
# Build the preprocessor with ColumnTransformer: each column group is routed
# through its own transformer and the results are concatenated.
# remainder="drop" (the default, spelled out here) discards every column that
# is not listed in numeric_features or categorical_features.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)

In [8]:
# Append the regressor to the preprocessing pipeline.
# Now we have a full prediction pipeline.
# random_state is fixed so the forest (and therefore the MAE printed after
# evaluation) is reproducible on Restart & Run All.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(random_state=0)),
    ]
)

In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Fit the preprocessing steps and the forest on the training split only.
pipe.fit(X=X_train, y=y_train)

In [11]:
# Predict prices for the held-out test rows.
y_pred = pipe.predict(X=X_test)

In [13]:
# Mean absolute error on the test split, in the same units as `price`.
print('MAE :', mean_absolute_error(y_true=y_test, y_pred=y_pred))


MAE : 1782.3328231543878


## Tuning

In [14]:
# Grid search over the forest's leaf/split size limits to find the best
# parameters and curb overfitting.
# Keys use the "<step>__<param>" form so they reach the "model" step of `pipe`.
# min_samples_split must be an integer >= 2 (or a float fraction): a value of 1
# is rejected by scikit-learn, so the grid starts at 2.
param_grid = {
    'model__min_samples_leaf': [1, 2],
    'model__min_samples_split': [2, 5],
    'model__n_estimators': [150],
}

# 5-fold cross-validation; the scorer is the negated MAE because GridSearchCV
# always maximises its score.
grid_search = GridSearchCV(
    pipe,
    param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
)
grid_search

In [None]:
# Run the cross-validated search on the full dataset.
grid_search.fit(X=X, y=y)

In [None]:
# Show the parameter combination with the best mean cross-validation score.
print("Best params:", grid_search.best_params_, sep="\n")

In [None]:
# best_score_ is a negated MAE (see the scoring argument), so abs() turns it
# back into a plain MAE.
print("Best score:", abs(grid_search.best_score_), sep="\n")

## Final Pipe

In [None]:
# Final pipeline to deploy.
# clone() gives this pipeline its own unfitted copy of the preprocessor, so
# fitting it does not refit the transformer instance that `pipe` still holds.
# random_state is fixed so the saved model can be rebuilt identically.
# NOTE(review): the hyperparameters below are presumably the winners of the
# grid search -- confirm against grid_search.best_params_ before deploying.
final_pipe = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        (
            "model",
            RandomForestRegressor(
                n_estimators=150,
                min_samples_split=2,
                min_samples_leaf=2,
                random_state=0,
            ),
        ),
    ]
)

In [None]:
# Train the deployment pipeline on every available row.
final_pipe.fit(X=X, y=y)

In [None]:
# Persist the fitted pipeline to disk. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises part-way through.
with open('final_pipe.sav', 'wb') as model_file:
    pickle.dump(final_pipe, model_file)