In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("04-sklearn & ML Models\Data\car-sales-extended-missing-data.csv")
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [4]:
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [5]:
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [6]:
# Getting data ready
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [7]:
# Modelling 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [8]:
np.random.seed(42)

In [9]:
pip install --upgrade scikit-learn




In [10]:
data = pd.read_csv("04-sklearn & ML Models\Data\car-sales-extended-missing-data.csv")
data.dropna(subset=['Price'], inplace=True)

# Define different features and transformer pipeline
cata_feature = ['Make', 'Colour']
cata_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('oneHot', OneHotEncoder(handle_unknown='ignore'))
])

door_feature = ['Doors']
door_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=4))
])

odometer_feature = ['Odometer (KM)']
odometer_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])







In [11]:
# Setup preprocessing steps (fill missing value then convert to numbers)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cata_transformer, cata_feature),
        ('door', door_transformer, door_feature),
        ('num', odometer_transformer, odometer_feature)
    ]
)


In [12]:
# creating pipelien model
model = Pipeline(steps=[("preprocessor", preprocessor),
                        ('model', RandomForestRegressor)
                        ])


x = data.drop('Price', axis=1)
y = data.Price
x_tran, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2)


In [13]:
model.named_steps

{'preprocessor': ColumnTransformer(transformers=[('cat',
                                  Pipeline(steps=[('imputer',
                                                   SimpleImputer(fill_value='missing',
                                                                 strategy='constant')),
                                                  ('oneHot',
                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                  ['Make', 'Colour']),
                                 ('door',
                                  Pipeline(steps=[('imputer',
                                                   SimpleImputer(fill_value=4,
                                                                 strategy='constant'))]),
                                  ['Doors']),
                                 ('num',
                                  Pipeline(steps=[('imputer', SimpleImputer())]),
                                  ['Odometer (KM)'])])

In [14]:
model.fit(x_tran, y_train)

AttributeError: 'numpy.ndarray' object has no attribute '_validate_params'

In [None]:
x_tran

Unnamed: 0,Make,Colour,Odometer (KM),Doors
416,Honda,White,18933.0,4.0
100,Honda,Blue,146233.0,4.0
371,,Blue,23545.0,4.0
20,Toyota,,124844.0,4.0
995,Toyota,Black,35820.0,4.0
...,...,...,...,...
306,Honda,Red,108681.0,4.0
175,Toyota,Blue,51155.0,4.0
449,Honda,White,146703.0,4.0
25,Honda,Blue,125819.0,4.0


In [15]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_csv("04-sklearn & ML Models\Data\car-sales-extended-missing-data.csv")
data.dropna(subset=['Price'], inplace=True)

# Define different features and transformer pipeline
cata_features = ['Make', 'Colour']
cata_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('oneHot', OneHotEncoder(handle_unknown='ignore'))
])

door_feature = ['Doors']
door_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=4))
])

odometer_feature = ['Odometer (KM)']
odometer_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Setup preprocessing steps (fill missing value, then convert to numbers)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cata_transformer, cata_features),
        ('door', door_transformer, door_feature),
        ('num', odometer_transformer, odometer_feature)
    ]
)

# Create the pipeline model
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ('model', RandomForestRegressor())
])

x = data.drop('Price', axis=1)
y = data.Price

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fit the model
model.fit(x_train, y_train)


In [16]:
model.score(x_test, y_test)

0.18266646105837814

In [17]:
from sklearn.model_selection import GridSearchCV

pipe_gride = {
    'preprocessor__num__imputer__strategy': ["mean", "median"],
    'model__n_estimators': [100, 1000],
    'model__max_depth': [None, 5],
    'model__max_features': ['sqrt'],  # Change 'auto' to 'sqrt'
    'model__min_samples_split': [2, 4]
}

gs_model = GridSearchCV(model, pipe_gride, cv=5, verbose=2)
gs_model.fit(x_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_sampl

In [18]:
gs_model.score(x_test, y_test)

0.26231122258434136

In [19]:
x_test

Unnamed: 0,Make,Colour,Odometer (KM),Doors
42,Honda,Black,200490.0,4.0
595,Honda,Black,61120.0,4.0
465,Toyota,White,117637.0,4.0
394,BMW,Blue,85320.0,3.0
895,Nissan,White,48277.0,4.0
...,...,...,...,...
743,Honda,Red,47404.0,4.0
441,Nissan,Green,90446.0,4.0
999,Toyota,Blue,248360.0,4.0
5,Honda,Red,42652.0,4.0
