In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# Load the raw car-sales table from its relative location on disk.
csv_path = '../../Datas/car-sales-extended-missing-data.csv'
data = pd.read_csv(csv_path)

In [25]:
# Show the dtype pandas inferred for each column.
data.dtypes


Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [26]:
# Count the missing values in each column.
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [27]:
# Drop every row whose 'Price' (the prediction target) is missing.
# The result is rebound to `data` rather than mutated with inplace=True, so the
# statement is an ordinary expression that can be chained and reasoned about.
data = data.dropna(subset=['Price'])

In [28]:
# Re-count missing values per column now that rows lacking 'Price' are gone.
data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [30]:
# Seed NumPy's global RNG before anything random happens in this cell.
np.random.seed(42)

# --- Per-column preprocessing ------------------------------------------------
# Text columns: fill gaps with the literal 'missing', then one-hot encode.
categorical_features = ['Make', 'Colour']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Doors: fill gaps with the constant 4.
Door_feature = ['Doors']
Door_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=4)),
])

# Odometer: fill gaps with the column mean.
numeric_feature = ['Odometer (KM)']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# Route each group of columns to its own transformer.
column_routes = [
    ('cat', categorical_transformer, categorical_features),
    ('door', Door_transformer, Door_feature),
    ('numeric', numeric_transformer, numeric_feature),
]
preporcessor = ColumnTransformer(transformers=column_routes)

# --- Full pipeline -----------------------------------------------------------
# The step names below form the prefixes of nested parameter paths
# (e.g. 'preproccessor__numeric__imputer__strategy'), so their spelling is kept.
model = Pipeline(steps=[
    ('preproccessor', preporcessor),
    ('model', RandomForestRegressor()),
])

# --- Split, fit, evaluate ----------------------------------------------------
X = data.drop(columns='Price')
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model.fit(X_train, y_train)
model.score(X_test, y_test)

0.22188417408787875

In [38]:
# Hyper-parameter grid for the pipeline; keys follow the
# '<step>__<sub-step>__<param>' convention for nested estimators.
#
# max_features: the string "auto" was deprecated in scikit-learn 1.1 and removed
# in 1.3. For a regression forest it meant "consider all features", which the
# float 1.0 expresses and which older releases accept as well.
grids = {
    'preproccessor__numeric__imputer__strategy': ['mean', 'median'],
    'model__n_estimators': [100, 800],
    'model__max_depth': [None, 5],
    'model__max_features': [1.0],
    'model__min_samples_split': [2, 4],
}

In [39]:
# Exhaustive search over `grids` with 5-fold cross-validation on the training
# split only. GridSearchCV is already imported alongside train_test_split near
# the top of the notebook, so it is not re-imported here.
# verbose=2 prints one line per individual fit.
gs_model = GridSearchCV(estimator=model, param_grid=grids, cv=5, verbose=2)
gs_model.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preproccessor__numeric__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preproccessor__numeric__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preproccessor__numeric__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preproccessor__numeric__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preproccessor__numeric__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_featur

[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preproccessor__numeric__imputer__strategy=median; total time=   0.2s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preproccessor__numeric__imputer__strategy=median; total time=   0.2s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preproccessor__numeric__imputer__strategy=median; total time=   0.2s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=800, preproccessor__numeric__imputer__strategy=mean; total time=   1.3s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=800, preproccessor__numeric__imputer__strategy=mean; total time=   1.3s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=800, preproccess

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preproccessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value='missing',
                                                                                                        strategy='constant')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['Make',
                                                                          'Colour']),
                                                                        ('door',
        

In [40]:
# Score the fitted grid-search object on the held-out test split.
gs_model.score(X_test, y_test)

0.3309627999327971