In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [2]:
data = pd.read_csv("scikit-learn-data/car-sales-extended-missing-data.csv")
data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [3]:
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [4]:
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

<div align="center" style="color: orange; font-size: 24px;">

<h1>Steps to Do:</h1>

1. Fill missing data <br>  
2. Convert data to numbers <br>  
3. Build a model on data  

</div>


In [26]:
# Getting data ready
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modelling 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split , GridSearchCV

np.random.seed(42)

#drop data with missing labels(Price)
data.dropna(subset=["Price"] , inplace=True)

#define different features and transformer pipleine
categorical_features = ["Make" , "Colour"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

door_features = ["Doors"]
door_transformers = Pipeline(steps=[
    ("imputer" , SimpleImputer(strategy="constant" , fill_value=4))
])

numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline(steps=[
    ("imputer" , SimpleImputer(strategy="mean"))
])

# setting up preprocessing steps(fiiling ans converting to numbers)
preprocessor = ColumnTransformer(
                        transformers=[
                            ("cat" , categorical_transformer , categorical_features) , 
                            ("door" , door_transformers , door_features) ,
                            ("num" , numeric_transformer , numeric_features)
                            
                        ])





# create a preprocessing and modeling pipleine
model= Pipeline(steps=[
    ("preprocessor" , preprocessor) , 
    ("model" , RandomForestRegressor())
])

#split data
x = data.drop("Price" , axis=1)
y = data["Price"]
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size=0.2)

model.fit(x_train , y_train)

In [27]:
model.score(x_test , y_test)

0.22188417408787875

Using gridSearchvc and randomizedSearchcv with our pipelines

In [30]:
# use gridSeachCbV with  regression pipeline
pipeline_grid = {
    "preprocessor__num__imputer__strategy":["mean" , "median"],
    "model__n_estimators":[100,1000] , 
    "model__max_depth":[None , 5] , 
    "model__min_samples_split":[2 , 4]
}

gs_model = GridSearchCV(model ,pipeline_grid , cv=5 , verbose=2)
gs_model.fit(x_train , y_train)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time=   0.1s
[CV] END model__max_depth=None, model__min_samples_spli

In [31]:
gs_model.score(x_test , y_test)

0.3339554263158365