In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("cars.csv")

In [3]:
# create a temporary dataframe to work with
df_tmp = df.copy()
df_tmp.head()

Unnamed: 0,ID,Name,Price,Brand,Model,Variant,Series,Year,Kilometers,Type,Gearbox,Fuel,Status,CC,Color,Seating Capacity
0,11530130,2010 Toyota Rukus Build 2 AZE151R,9999,Toyota,Rukus,Build 2,AZE151R,2010,263000,Wagon,Automatic,Unleaded Petrol,Used,2362,Grey,5
1,12190570,2021 Mercedes-Benz V 250 D Avantgarde MWB 447 ...,117990,Mercedes-Benz,V,250 D Avantgarde MWB,447 MY21,2021,19,Wagon,Automatic,Diesel,New In Stock,2143,Black,7
2,12321855,2021 Mercedes-Benz Valente 116 CDI MWB RWD 447...,79990,Mercedes-Benz,Valente,116 CDI MWB RWD,447 MY21,2021,24,Wagon,Automatic,Diesel,New In Stock,2143,Black,8
3,12346971,2010 Mercedes-Benz E250 CDI Avantgarde 207,34990,Mercedes-Benz,E250,CDI Avantgarde,207,2010,120579,Cabriolet,Automatic,Diesel,Used,2143,Black,4
4,12363884,2016 Holden Cruze CD JH MY16,15990,Holden,Cruze,CD,JH MY16,2016,72506,Sportswagon,Automatic,Unleaded Petrol,Used,1796,White,5


In [4]:
len(df_tmp)

17048

# Based off correlation, we are going to select the features who have correlation from -0.5 to +1 with price. Thus, selected features are: Year, Kilometers, Gearbox, Fuel, Status, CC.

In [5]:
df_tmp.drop(columns=['ID', 'Name', 'Variant', 'Series', 'Type', 'Color', 'Seating Capacity'], inplace=True)

In [6]:
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17048 entries, 0 to 17047
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Price       17048 non-null  int64 
 1   Brand       17048 non-null  object
 2   Model       17048 non-null  object
 3   Year        17048 non-null  int64 
 4   Kilometers  17048 non-null  int64 
 5   Gearbox     17048 non-null  object
 6   Fuel        17048 non-null  object
 7   Status      17048 non-null  object
 8   CC          17048 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 1.2+ MB


# Convert all of the string values into category values

In [7]:
for label, content in df_tmp.items():
    if pd.api.types.is_string_dtype(content):
        df_tmp[label] = content.astype("category").cat.codes

In [8]:
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17048 entries, 0 to 17047
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Price       17048 non-null  int64
 1   Brand       17048 non-null  int8 
 2   Model       17048 non-null  int16
 3   Year        17048 non-null  int64
 4   Kilometers  17048 non-null  int64
 5   Gearbox     17048 non-null  int8 
 6   Fuel        17048 non-null  int8 
 7   Status      17048 non-null  int8 
 8   CC          17048 non-null  int64
dtypes: int16(1), int64(4), int8(4)
memory usage: 632.8 KB


In [9]:
df_tmp.isna().sum()

Price         0
Brand         0
Model         0
Year          0
Kilometers    0
Gearbox       0
Fuel          0
Status        0
CC            0
dtype: int64

In [10]:
from sklearn.ensemble import RandomForestRegressor

X = df_tmp.drop("Price", axis=1)
y = df_tmp["Price"]

# Instantiate model
model = RandomForestRegressor(n_jobs=-1, random_state=42)

# Fit the model
model.fit(X, y)

RandomForestRegressor(n_jobs=-1, random_state=42)

In [11]:
model.score(X, y)

0.9801778259180698

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=42)

In [12]:
model.score(X_test, y_test)

0.9743079619191913

In [13]:
model.score(X_train, y_train)

0.9826393306102253

# Building an evaluation function

In [38]:
# Create evaluation function
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score

def rmsle(y_test, y_preds):
    """
    Calculate root mean squared log error between predictions and
    true labels.
    """
    return np.sqrt(mean_squared_log_error(y_test, y_preds))

# Create function to evaluate model on a few different levels
def show_scores(model):
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    scores = {"Training MAE": mean_absolute_error(y_train, train_preds),
              "Test MAE": mean_absolute_error(y_test, test_preds),
              "Training R^2": r2_score(y_train, train_preds),
              "Test R^2": r2_score(y_test, test_preds)}
    return scores

## Testing our model on a subset (to tune the hyperparameters)

In [15]:
# change max_samples value
model = RandomForestRegressor(n_jobs=-1,
                              random_state=42,
                              max_samples=1000)

In [16]:
model.fit(X_train, y_train)

RandomForestRegressor(max_samples=1000, n_jobs=-1, random_state=42)

In [17]:
show_scores(model)

{'Training MAE': 6081.259096918228,
 'Test MAE': 6522.201395603744,
 'Training RMSLE': 0.2433642346864556,
 'Test RMSLE': 0.26626018497768755,
 'Training R^2': 0.7744171730671561,
 'Test R^2': 0.7588500249687221}

# Hyperparameter tuning with RandomizedSearchCV

In [40]:
from sklearn.model_selection import RandomizedSearchCV

# Different RandomForestRegressor hyperparameters
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           "max_features": [0.5, 1, "sqrt", "auto"],
           "max_samples": [1000]}

# Instantiate RandomizedSearchCV model
rs_model = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1, 
                                                    random_state=42),
                              param_distributions=rf_grid,
                              n_iter=100,
                              cv=5,
                              verbose=True)

# Fit the RandomizedSearchCV model
rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5,
                   estimator=RandomForestRegressor(n_jobs=-1, random_state=42),
                   n_iter=100,
                   param_distributions={'max_depth': [None, 3, 5, 10],
                                        'max_features': [0.5, 1, 'sqrt',
                                                         'auto'],
                                        'max_samples': [1000],
                                        'min_samples_leaf': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
                                        'min_samples_split': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18]),
                                        'n_estimators': array([10, 20, 30, 40, 50, 60, 70, 80, 90])},
                   verbose=True)

In [41]:
# Find the best model hyperparameters
rs_model.best_params_

{'n_estimators': 40,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_samples': 1000,
 'max_features': 'auto',
 'max_depth': None}

In [29]:
# Evaluate the RandomizedSearch model
show_scores(rs_model), show_scores(model)

({'Training MAE': 6090.414189174401,
  'Test MAE': 6534.694707237231,
  'Training RMSLE': 0.24364362275868742,
  'Test RMSLE': 0.2667160165742893,
  'Training R^2': 0.7695091876125204,
  'Test R^2': 0.7575255790732172},
 {'Training MAE': 6081.259096918228,
  'Test MAE': 6522.201395603744,
  'Training RMSLE': 0.2433642346864556,
  'Test RMSLE': 0.26626018497768755,
  'Training R^2': 0.7744171730671561,
  'Test R^2': 0.7588500249687221})

## Train a model with the best hyperparameters.
**Note**: These were found after 100 iterations of `RandomizedSearchCV`

In [42]:
# Most ideal hyperparameters
ideal_model = RandomForestRegressor(n_estimators=40,
                                    min_samples_leaf=1,
                                    min_samples_split=2,
                                    max_features='auto',
                                    n_jobs=-1,
                                    max_samples=1000,
                                    max_depth=None,
                                    random_state=42)

# Fit the ideal model
ideal_model.fit(X_train, y_train)

RandomForestRegressor(max_samples=1000, n_estimators=40, n_jobs=-1,
                      random_state=42)

In [31]:
show_scores(ideal_model)

{'Training MAE': 6090.414189174401,
 'Test MAE': 6534.694707237231,
 'Training RMSLE': 0.24364362275868742,
 'Test RMSLE': 0.2667160165742893,
 'Training R^2': 0.7695091876125204,
 'Test R^2': 0.7575255790732172}

## Make predictions on test data

In [32]:
# Predict on unseen data
# Brand         0
# Model         0
# Year          0
# Kilometers    0
# Gearbox       0
# Fuel          0
# Status        0
# CC
unseen_data = {
                "Brand": "Holden",
                "Model": "Cruze",
                "Year": 2015,
                "Kilometers": 200000,
                "Gearbox": "Manual",
                "Fuel": "Unleaded Petrol",
                "Status": "Used",
                "CC": 2050
              }

data = pd.DataFrame(data=unseen_data, index=[0])
data


Unnamed: 0,Brand,Model,Year,Kilometers,Gearbox,Fuel,Status,CC
0,Holden,Cruze,2015,200000,Manual,Unleaded Petrol,Used,2050


In [33]:
# Convert string datatypes into category
for label, content in data.items():
    if pd.api.types.is_string_dtype(content):
        data[label] = content.astype("category").cat.codes

In [34]:
ideal_model.predict(data)

array([23867.05])

In [45]:
np.random.seed(42)

from sklearn import linear_model

lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)

lasso_model = linear_model.Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)

ridge_model = linear_model.Ridge(alpha=0.1)
ridge_model.fit(X_train, y_train)

elastic_model = linear_model.ElasticNet(alpha=0.1)
elastic_model.fit(X_train, y_train)

model = RandomForestRegressor(n_jobs=-1)
model.fit(X_train, y_train)

RandomForestRegressor(n_jobs=-1)

In [46]:
show_scores(lm), show_scores(lasso_model), show_scores(ridge_model)

({'Training MAE': 11661.632512156451,
  'Test MAE': 11568.434763138175,
  'Training R^2': 0.4347992190070715,
  'Test R^2': 0.49751639003023307},
 {'Training MAE': 11661.634474901271,
  'Test MAE': 11568.43006600556,
  'Training R^2': 0.4347992188760381,
  'Test R^2': 0.4975166666531362},
 {'Training MAE': 11661.634391096815,
  'Test MAE': 11568.435484673522,
  'Training R^2': 0.4347992189872134,
  'Test R^2': 0.4975163682697049})

In [47]:
show_scores(elastic_model), show_scores(ideal_model), show_scores(model)

({'Training MAE': 11675.077777569775,
  'Test MAE': 11572.921882981249,
  'Training R^2': 0.4345192893311782,
  'Test R^2': 0.49710718830161316},
 {'Training MAE': 6118.789195923073,
  'Test MAE': 6526.8773188470195,
  'Training R^2': 0.7704900165086559,
  'Test R^2': 0.7566857152326609},
 {'Training MAE': 1917.0133589925654,
  'Test MAE': 5166.670574593664,
  'Training R^2': 0.9792619035408948,
  'Test R^2': 0.7915160803259842})

In [48]:
import pickle

pickle.dump(ideal_model, open("ideal_model.pkl", "wb"))