In [17]:
import pandas as pd
import numpy as np

In [18]:
df = pd.read_csv("cars.csv")

In [19]:
# create a temporary dataframe to work with
df_tmp = df.copy()
df_tmp.head()

Unnamed: 0,ID,Name,Price,Brand,Model,Variant,Series,Year,Kilometers,Type,Gearbox,Fuel,Status,CC,Color,Seating Capacity
0,11530130,2010 Toyota Rukus Build 2 AZE151R,9999,Toyota,Rukus,Build 2,AZE151R,2010,263000,Wagon,Automatic,Unleaded Petrol,Used,2362,Grey,5
1,12190570,2021 Mercedes-Benz V 250 D Avantgarde MWB 447 ...,117990,Mercedes-Benz,V,250 D Avantgarde MWB,447 MY21,2021,19,Wagon,Automatic,Diesel,New In Stock,2143,Black,7
2,12321855,2021 Mercedes-Benz Valente 116 CDI MWB RWD 447...,79990,Mercedes-Benz,Valente,116 CDI MWB RWD,447 MY21,2021,24,Wagon,Automatic,Diesel,New In Stock,2143,Black,8
3,12346971,2010 Mercedes-Benz E250 CDI Avantgarde 207,34990,Mercedes-Benz,E250,CDI Avantgarde,207,2010,120579,Cabriolet,Automatic,Diesel,Used,2143,Black,4
4,12363884,2016 Holden Cruze CD JH MY16,15990,Holden,Cruze,CD,JH MY16,2016,72506,Sportswagon,Automatic,Unleaded Petrol,Used,1796,White,5


In [31]:
len(df_tmp)

17048

# Based off correlation, we are going to select the features who have correlation from -0.5 to +1 with price. Thus, selected features are: Year, Kilometers, Gearbox, Fuel, Status, CC.

In [20]:
df_tmp.drop(columns=['ID', 'Name', 'Variant', 'Series', 'Type', 'Color', 'Seating Capacity'], inplace=True)

In [21]:
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17048 entries, 0 to 17047
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Price       17048 non-null  int64 
 1   Brand       17048 non-null  object
 2   Model       17048 non-null  object
 3   Year        17048 non-null  int64 
 4   Kilometers  17048 non-null  int64 
 5   Gearbox     17048 non-null  object
 6   Fuel        17048 non-null  object
 7   Status      17048 non-null  object
 8   CC          17048 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 1.2+ MB


# Convert all of the string values into category values

In [22]:
for label, content in df_tmp.items():
    if pd.api.types.is_string_dtype(content):
        df_tmp[label] = content.astype("category").cat.codes

In [23]:
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17048 entries, 0 to 17047
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Price       17048 non-null  int64
 1   Brand       17048 non-null  int8 
 2   Model       17048 non-null  int16
 3   Year        17048 non-null  int64
 4   Kilometers  17048 non-null  int64
 5   Gearbox     17048 non-null  int8 
 6   Fuel        17048 non-null  int8 
 7   Status      17048 non-null  int8 
 8   CC          17048 non-null  int64
dtypes: int16(1), int64(4), int8(4)
memory usage: 632.8 KB


In [24]:
df_tmp.isna().sum()

Price         0
Brand         0
Model         0
Year          0
Kilometers    0
Gearbox       0
Fuel          0
Status        0
CC            0
dtype: int64

In [25]:
from sklearn.ensemble import RandomForestRegressor

X = df_tmp.drop("Price", axis=1)
y = df_tmp["Price"]

# Instantiate model
model = RandomForestRegressor(n_jobs=-1, random_state=42)

# Fit the model
model.fit(X, y)

RandomForestRegressor(n_jobs=-1, random_state=42)

In [26]:
model.score(X, y)

0.9801778259180698

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=42)

In [28]:
model.score(X_test, y_test)

0.9743079619191913

In [29]:
model.score(X_train, y_train)

0.9826393306102253

# Building an evaluation function

In [35]:
# Create evaluation function
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score

def rmsle(y_test, y_preds):
    """
    Calculate root mean squared log error between predictions and
    true labels.
    """
    return np.sqrt(mean_squared_log_error(y_test, y_preds))

# Create function to evaluate model on a few different levels
def show_scores(model):
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    scores = {"Training MAE": mean_absolute_error(y_train, train_preds),
              "Test MAE": mean_absolute_error(y_test, test_preds),
              "Training RMSLE": rmsle(y_train, train_preds),
              "Test RMSLE": rmsle(y_test, test_preds),
              "Training R^2": r2_score(y_train, train_preds),
              "Test R^2": r2_score(y_test, test_preds)}
    return scores

## Testing our model on a subset (to tune the hyperparameters)

In [36]:
# change max_samples value
model = RandomForestRegressor(n_jobs=-1,
                              random_state=42,
                              max_samples=10000)

In [37]:
model.fit(X_train, y_train)

RandomForestRegressor(max_samples=10000, n_jobs=-1, random_state=42)

In [38]:
show_scores(model)

AttributeError: 'RandomForestRegressor' object has no attribute 'r2_score'