In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

In [12]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, classification_report
from sklearn.datasets import fetch_california_housing, load_breast_cancer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# New: Pipeline (chains the preprocessing steps and the model into one estimator)
from sklearn.pipeline import Pipeline

In [3]:
from joblib import dump, load

In [15]:
# Preview the first five rows of the car-sales frame.
# NOTE(review): `data` is only assigned by the pd.read_csv cell further down,
# so on a fresh kernel this cell must be run after that one.
data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [31]:
# Seed NumPy's global RNG. train_test_split and RandomForestRegressor below are
# left at random_state=None, so they draw from this global state; the result is
# therefore reproducible only when this cell is run from top to bottom.
np.random.seed(42)

# Load the raw car-sales data.
data = pd.read_csv("data/car-sales-extended-missing-data.csv")

# Drop rows with a missing target: they cannot be used for supervised training.
# Reassign instead of using inplace=True, and pass `subset` as a list, which is
# accepted by every pandas release (a bare string is only accepted by newer ones).
data = data.dropna(subset=["Price"])

# Categorical columns: fill gaps with the literal "missing", then one-hot encode.
cat_features = ["Make", "Colour"]
cat_trasformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Door count: fill gaps with 4, then one-hot encode it like a category.
door_features = ["Doors"]
door_transorm = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=4)),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric column: fill gaps with the column mean.
num_features = ["Odometer (KM)"]
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
])

# Route each group of columns through its own imputation/encoding pipeline.
# The step names ("cat", "door", "num") are part of the parameter paths used
# when tuning, e.g. "preprocessor__num__imputer__strategy".
preprocessor = ColumnTransformer([
    ("cat", cat_trasformer, cat_features),
    ("door", door_transorm, door_features),
    ("num", num_transformer, num_features),
])

# Full pipeline: preprocessing followed by the regressor.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor()),
])

# Separate the features (x) from the target (y).
x = data.drop("Price", axis=1)
y = data["Price"]

# Hold out 20% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fit on the training split; imputers and encoders are learned from it alone.
model.fit(x_train, y_train)

# Last expression of the cell: the pipeline's score on the held-out split
# (for a regressor this is the R^2 coefficient of determination).
model.score(x_test, y_test)

0.21735623151692096

In [36]:
# Hyperparameter grid for the whole pipeline. Keys follow scikit-learn's
# "<step>__<sub-step>__<param>" convention, so both the numeric imputer inside
# the preprocessor and the random forest itself are tuned together.
pipe_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__n_estimators": [100, 1000],
    "model__max_depth": [None, 5],
    # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3. For a
    # regressor it meant "use all features", which is exactly what 1.0 selects,
    # so the search space is unchanged while staying valid on current releases.
    "model__max_features": [1.0],
    "model__min_samples_leaf": [1, 2],
}

# Exhaustive search over the 2 * 2 * 2 * 1 * 2 = 16 combinations with 5-fold
# cross-validation (80 fits). verbose=2 prints one line per fit.
gs_model = GridSearchCV(
    model,
    pipe_grid,
    cv=5,
    verbose=2,
)
gs_model.fit(x_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_le

[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_leaf=1, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time=   0.0s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_leaf=1, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time=   0.9s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_leaf=1, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time=   0.9s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_leaf=1, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time=   0.9s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_leaf=1, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time=   1.2s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_leaf=1, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value='missing',
                                                                                                        strategy='constant')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['Make',
                                                                          'Colour']),
                                                                        ('door',
         

In [37]:
# Evaluate the grid-search result on the held-out test split. gs_model was
# built without a `scoring` argument, so this falls back to the pipeline's own
# score method (R^2 for a regressor).
gs_model.score(x_test, y_test)

0.32972147176560707