In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

1. import car sale csv
2. manage the missing data and make all data in numerical form
3. 


In [2]:
data = pd.read_csv("data/car-sales-extended-missing-data.csv")
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [3]:
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [7]:
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [11]:
#Getting data  ready 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Modelling 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

#setup random seed
np.random.seed(42)

#imort data and drom the missing data rows
data = pd.read_csv("data/car-sales-extended-missing-data.csv")
data.dropna(subset=["Price"], inplace=True)

# define different features and transformer pipeline
categorical_feature = ["Make", "Colour"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

door_feature = ["Doors"]
door_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=4))])

numeric_feature = ["Odometer (KM)"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))])

#setup preprocessing steps 

preprocessor = ColumnTransformer(
                    transformers=[
                        ("categorical", categorical_transformer, categorical_feature),
                        ("door", door_transformer, door_feature),
                        ("numeric", numeric_transformer, numeric_feature)])

## preprocessing and modelling pipeline 
model = Pipeline( steps=[("preprocessor", preprocessor),
                         ("model", RandomForestRegressor())])

#Split data
x = data.drop("Price", axis=1)
y = data["Price"]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

# Fit and score the model
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.22027638379177727

### Hyper parameter changing to improve the model

##### It is also possible to use GridsearchCV or RandomIzedSearchCV with ou Pipeline

In [18]:
#use GridSearchCV with our Regression Pipline

from sklearn.model_selection import GridSearchCV

pipe_grid = {
    "preprocessor__numeric__imputer__strategy": ["mean", "median"],
    "model__n_estimators": [100, 1000],
    "model__max_depth": [None, 5],
    "model__max_features": [1, 100],
    "model__min_samples_split" : [2,4]
}

gs_model = GridSearchCV(model, pipe_grid, cv =5, verbose=2)
gs_model.fit(x_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END model__max_depth=None, model__max_features=1, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=1, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=1, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=1, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=1, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=1, model__min_sa

In [19]:
gs_model.score(x_test, y_test)

0.33673622259138436

In [20]:
from joblib import dump, load

dump(gs_model, filename="models/Grid_Search_model_alltogether1.joblib")

['models/Grid_Search_model_alltogether1.joblib']