In [1]:
# import necessary liberties

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



In [2]:
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [3]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [4]:
# Droping Prices with null

car_sales_missing.dropna(subset=["Price"],inplace=True)

car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [5]:
# Split X and Y

x = car_sales_missing.drop("Price",axis=1)
y = car_sales_missing["Price"]

x.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0


In [6]:
y.head()

0    15323.0
1    19943.0
2    28343.0
3    13434.0
4    14043.0
Name: Price, dtype: float64

In [7]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.2)

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((760, 4), (190, 4), (760,), (190,))

In [8]:
# import liberties
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# using importer
car_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# define columns
car_column = ["Make","Colour"]
door_column = ["Doors"]
octometer_column = ["Odometer (KM)"]

# imputer
imputer = ColumnTransformer([("car_imputer",car_imputer,car_column),
                             ("door_imputer",door_imputer,door_column),
                             ("num_imputer",num_imputer,octometer_column)])

# Filling values seperately
filled_x_train = imputer.fit_transform(x_train)
filled_x_test = imputer.transform(x_test)

# Checking values
filled_x_train

array([['Honda', 'Red', 4.0, 108794.0],
       ['BMW', 'Red', 5.0, 149413.0],
       ['BMW', 'missing', 5.0, 206446.0],
       ...,
       ['Nissan', 'White', 4.0, 193389.0],
       ['Honda', 'Red', 4.0, 47933.0],
       ['Toyota', 'White', 4.0, 32748.0]], dtype=object)

In [9]:
filled_data_frame = pd.DataFrame(filled_x_train, columns =["Make","Colour","Doors","Odometer (KM)"])
filled_data_frame.head()

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,Red,4.0,108794.0
1,BMW,Red,5.0,149413.0
2,BMW,missing,5.0,206446.0
3,Honda,Black,4.0,158337.0
4,BMW,Red,5.0,100060.0


In [14]:
filled_x_test[:10]

array([['Nissan', 'Green', 4.0, 90446.0],
       ['Honda', 'White', 4.0, 130783.0],
       ['Toyota', 'White', 4.0, 132022.37603305784],
       ['Toyota', 'Blue', 4.0, 71306.0],
       ['Toyota', 'White', 4.0, 122453.0],
       ['Nissan', 'Blue', 4.0, 132022.37603305784],
       ['Toyota', 'White', 4.0, 69222.0],
       ['Honda', 'Green', 4.0, 237627.0],
       ['Honda', 'White', 4.0, 30080.0],
       ['Honda', 'Red', 4.0, 16079.0]], dtype=object)

In [15]:
test_data_frame = pd.DataFrame(filled_x_test, columns = ["Make","Colour","Doors","Odometer (KM)"])
test_data_frame.head()

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Nissan,Green,4.0,90446.0
1,Honda,White,4.0,130783.0
2,Toyota,White,4.0,132022.376033
3,Toyota,Blue,4.0,71306.0
4,Toyota,White,4.0,122453.0


In [16]:
filled_data_frame.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [17]:
test_data_frame.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [18]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

features = ["Make","Colour","Doors"]
encoder1 = OneHotEncoder()
experiment = ColumnTransformer([("one hot",
                                  encoder1,
                                  features)],
                                 remainder = "passthrough")

transformed_x_train = experiment.fit_transform(filled_data_frame)
transformed_x_train

<760x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3040 stored elements in Compressed Sparse Row format>

In [21]:
transformed_x_train.toarray()

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.08794e+05],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.49413e+05],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 2.06446e+05],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.93389e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 4.79330e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.27480e+04]])

In [22]:
transformed_x_test = experiment.fit_transform(test_data_frame)
transformed_x_test

<190x15 sparse matrix of type '<class 'numpy.float64'>'
	with 760 stored elements in Compressed Sparse Row format>

In [23]:
# Training the dataset

from sklearn.ensemble import RandomForestRegressor

model1 = RandomForestRegressor()

In [24]:
# Filling the model1

model1.fit(transformed_x_train,y_train)


In [26]:
model1.score(transformed_x_test,y_test)

0.33775757232969694