In [1]:
import pandas as pd
import numpy as np


In [2]:
# import missing data
car_sales_missing = pd.read_csv("./052 car-sales-missing-data.csv")

In [5]:
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]


# option #1 fill missing data with Pandas

In [6]:
car_sales_missing["Make"].fillna("missing", inplace=True)

car_sales_missing["Colour"].fillna("missing", inplace=True)

car_sales_missing["Odometer"].fillna(car_sales_missing["Odometer"].mean(), inplace=True)

car_sales_missing["Doors"].fillna(4, inplace=True)


In [7]:
# remove rows with missing Price value
car_sales_missing.dropna(inplace=True)

In [8]:
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

# covert data

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder='passthrough')

transformed_X = transformer.fit_transform(car_sales_missing)

# option 2: fill missings with scikitlearn 

In [10]:
car_sales_missing = pd.read_csv("./052 car-sales-missing-data.csv")
car_sales_missing.dropna(subset=["Price"], inplace=True)

In [11]:
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]
y = y.str.replace("[\$\,\.]", "").astype(int)

In [12]:
# fill missing values with scikit-learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# fill categorical values with 'missing' & numerical values with mean values
cat_inputer = SimpleImputer(strategy="constant", )
door_inputer = SimpleImputer(strategy='constant', fill_value=4)
num_inputer = SimpleImputer(strategy='mean')

#define columns
cat_feature = ['Make', "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer"]

# create an imputter (something that fills missing data)
imputer = ColumnTransformer([('cat_imputer', cat_inputer, cat_feature),
                            ("door_imputer", door_inputer, door_feature),
                            ('num_imputer', num_inputer, num_features)])

# transform the data
filled_X = imputer.fit_transform(X)
filled_X

array([['Toyota', 'White', 4.0, 150043.0],
       ['Honda', 'Red', 4.0, 87899.0],
       ['Toyota', 'Blue', 3.0, 98763.2],
       ['BMW', 'Black', 5.0, 11179.0],
       ['Nissan', 'White', 4.0, 213095.0],
       ['Toyota', 'Green', 4.0, 98763.2],
       ['Honda', 'missing_value', 4.0, 98763.2],
       ['missing_value', 'White', 4.0, 31600.0]], dtype=object)

In [13]:
car_sales_filled = pd.DataFrame(filled_X,
                                columns=["Make", "Colour", "Doors", "Odometer (KM)"])

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder='passthrough')

transformed_X = transformer.fit_transform(car_sales_filled)

In [15]:
# let's fir a model
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.3)

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

-65.97626