<a href="https://colab.research.google.com/github/VishalMoradia/ML-Practice-P1/blob/main/CarPricePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Importing data

In [None]:
# Read the raw car-sales table (this file deliberately contains missing
# values) and preview its first rows.
df = pd.read_csv(filepath_or_buffer='/car-sales-extended-missing-data.csv')
df.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


#### Looking at missing data

In [None]:
# Count the missing entries in every column of the raw table.
df.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

#### Dropping the rows with no label (rows where `Price` is missing, since `Price` is our target variable)

In [None]:
# A row without a Price has no label to learn from, so remove those rows
# from `df` in place, then re-count what is still missing per column.
df.dropna(axis=0, subset=['Price'], inplace=True)
df.isnull().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [None]:
from sklearn.model_selection import train_test_split


# Target is the Price column; the features are every other column.
y = df['Price']
X = df.drop(columns=['Price'])

# Seed NumPy's global RNG right before splitting (no random_state is passed
# to train_test_split), then hold out 20% of the rows as the test split.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Missing entries per feature column (the target has already been removed).
X.isnull().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
dtype: int64

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


# Fill categorical values with 'missing', door counts with the constant 4,
# and the remaining numerical values with the column mean

cat_imputer = SimpleImputer(strategy='constant', fill_value = 'missing')
door_imputer = SimpleImputer(strategy='constant', fill_value = 4)
num_imputer = SimpleImputer(strategy='mean')

# Define columns

cat_features = ['Make', 'Colour']
door_feature = ['Doors']
num_feature = ['Odometer (KM)']


# Create an imputer (something that fills missing data).
# The output columns follow the order the transformers are listed in:
# Make, Colour, Doors, Odometer (KM).

imputer = ColumnTransformer([
                             ('cat_imputer', cat_imputer, cat_features),
                             ('door_imputer', door_imputer, door_feature),
                             ('num_imputer', num_imputer, num_feature)
])

# Transform the data.
# Fit on the training split ONLY, then reuse those fitted statistics on the
# test split. Calling fit_transform on the test split would re-learn the
# odometer mean from test data, so train and test would be filled with
# different values and test information would leak into preprocessing.

filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.transform(X_test)

filled_X_train

array([['Honda', 'White', 4.0, 71934.0],
       ['Toyota', 'Red', 4.0, 162665.0],
       ['Honda', 'White', 4.0, 42844.0],
       ...,
       ['Toyota', 'White', 4.0, 196225.0],
       ['Honda', 'Blue', 4.0, 133117.0],
       ['Honda', 'missing', 4.0, 150582.0]], dtype=object)

In [None]:
# The imputer returns plain arrays, so wrap both splits back into labelled
# DataFrames. The labels follow the imputer's output column order.
imputed_columns = ['Make', 'Color', 'Doors', 'Odometer']

cars_df_train = pd.DataFrame(data=filled_X_train, columns=imputed_columns)
cars_df_test = pd.DataFrame(data=filled_X_test, columns=imputed_columns)

cars_df_train.head(n=5)

Unnamed: 0,Make,Color,Doors,Odometer
0,Honda,White,4.0,71934.0
1,Toyota,Red,4.0,162665.0
2,Honda,White,4.0,42844.0
3,Honda,White,4.0,195829.0
4,Honda,Blue,4.0,219217.0


In [None]:
# Sanity check: no column of the imputed training frame should have gaps.
cars_df_train.isnull().sum()

Make        0
Color       0
Doors       0
Odometer    0
dtype: int64

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# Columns to one-hot encode. Doors is treated as a category here, not as a
# numeric quantity.
category_feature = ['Make', 'Color', 'Doors']

# handle_unknown='ignore': a category that shows up in the test split but
# was never seen while fitting on the training split is encoded as all
# zeros instead of raising an error at transform time.
one_hot = OneHotEncoder(handle_unknown='ignore')

# remainder='passthrough' keeps the non-encoded column (Odometer) and
# appends it after the one-hot columns.
transformer = ColumnTransformer([
                                 ('one_hot', one_hot, category_feature)], remainder='passthrough')


# Learn the category vocabulary from the training split only, then apply
# the same fitted encoding to the test split.
transformed_X_train = transformer.fit_transform(cars_df_train)
transformed_X_test = transformer.transform(cars_df_test)


transformed_X_train.toarray()

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 7.19340e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.62665e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 4.28440e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.96225e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.33117e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.50582e+05]])

### Fitting a model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Re-seed NumPy's global RNG immediately before building the forest; the
# estimator itself is created without a random_state.
np.random.seed(42)

model = RandomForestRegressor()

# Train on the encoded training features, then score the fitted model on
# the held-out encoded test features.
model.fit(X=transformed_X_train, y=y_train)
model.score(X=transformed_X_test, y=y_test)

0.25366332156443805