In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
car_sales = pd.read_csv('data/car_sales.csv')

In [3]:
type(car_sales)

pandas.core.frame.DataFrame

In [4]:
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [5]:
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [6]:
car_sales.describe()

Unnamed: 0,Odometer (KM),Doors,Price
count,950.0,950.0,950.0
mean,131253.237895,4.011579,16042.814737
std,69094.857187,0.382539,8581.695036
min,10148.0,3.0,2796.0
25%,70391.25,4.0,9529.25
50%,131821.0,4.0,14297.0
75%,192668.5,4.0,20806.25
max,249860.0,5.0,52458.0


In [7]:
car_sales.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [8]:
car_sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [9]:
car_sales.dropna(subset=['Price'], inplace=True)

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [11]:
categorical_features = ['Make', 'Colour']

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [12]:
numerical_features = ['Odometer (KM)']

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

In [13]:
# Define Doors features
door_feature = ["Doors"]

# Create Doors transformer Pipeline
door_transformer = Pipeline(steps=[
    # Set SimpleImputer strategy to "constant" and fill value to 4
    ("imputer", SimpleImputer(strategy="constant", fill_value=4))])

In [14]:
# Data preprocessing

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features),
        ('door', door_transformer, door_feature)
])

In [15]:
X = car_sales.drop('Price', axis=1)
y = car_sales['Price']

In [16]:
from sklearn.model_selection import train_test_split

X_train, y_train, X_test, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [17]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((760, 4), (190, 4), (760,), (190,))

In [18]:
# Import Ridge from sklearn's linear_model module
from sklearn.linear_model import Ridge

# Import SVR from sklearn's svm module
from sklearn.svm import SVR

# Import RandomForestRegressor from sklearn's ensemble module
from sklearn.ensemble import RandomForestRegressor

In [19]:
# Create dictionary of model instances, there should be 4 total key, value pairs
# in the form {"model_name": model_instance}.
# Don't forget there's two versions of SVR, one with a "linear" kernel and the
# other with kernel set to "rbf".

regression_models = {
    'Ridge' : Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', Ridge()),
    ]),
    'SVR_linear' : Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', SVR(kernel='linear'))
    ]),
    'SVR_rbf' : Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', SVR(kernel='rbf'))
    ]),
    'RandomForestRegressor' : Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor())
    ])
}

results = {}

In [20]:
np.random.seed(42)

for model_name, model in regression_models.items():
    model.fit(X_train, y_train)
    results[model_name] = model.score(X_test, y_test)

results

TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['int', 'str']