In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from scipy.stats import randint
import time

In [3]:
# Load the pipeline-prepared used-car dataset from the working directory
file = "02_Used_Cars_Pipeline.csv"
df = pd.read_csv(filepath_or_buffer=file)

# Preview the first five rows as the cell output
df.head(5)

Unnamed: 0,manufacturer,fuel,title_status,transmission,drive,grouped_color,type_grouped,condition_grouped,cylinders_cat,price_segment,manufacturer_freq,log_model_freq,log_state_freq,log_odometer,log_car_age,log_miles_age,manufacturer_marketcap,log_price
0,gmc,gas,clean,other,4wd,neutral,work,3,3,3,10598,8.2938,8.12829,10.966887,2.197225,8.887567,22,10.422013
1,chevrolet,gas,clean,other,4wd,colorful,work,3,3,2,35027,9.091332,8.12829,11.173669,2.564949,8.688917,12,10.025307
2,chevrolet,gas,clean,other,4wd,colorful,work,3,3,3,35027,9.091332,8.12829,9.860632,1.098612,9.167537,12,10.586357
3,toyota,gas,clean,other,4wd,colorful,work,3,3,3,23302,7.85205,8.12829,10.624371,1.791759,9.015031,1,10.341452
4,ford,gas,clean,automatic,rwd,neutral,work,4,2,1,44881,9.289706,8.12829,11.759793,2.302585,9.562631,8,9.615872


In [4]:
# Feature selection, target construction and train/test split

# Columns passed to the model as numeric inputs
numeric_features = [
    'log_odometer',
    'log_car_age',
    'log_miles_age',
    'log_state_freq',
    'log_model_freq',
    'manufacturer_marketcap',
    'condition_grouped',
    'cylinders_cat',
]

# Columns passed to the model as categorical inputs
categorical_features = [
    'fuel',
    'title_status',
    'transmission',
    'drive',
    'type_grouped',
]

# Target: price on its original scale, obtained by inverting log_price with expm1
# (presumably log_price was built with log1p upstream -- confirm against the pipeline)
y = np.expm1(df['log_price'])

# Design matrix: numeric columns first, categorical columns after
X = df[[*numeric_features, *categorical_features]]

# 80/20 hold-out split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2212,
)

In [17]:
# Baseline model: a DummyRegressor that always predicts the median of y_train.
# The timer uses time.perf_counter(), a monotonic high-resolution clock meant for
# measuring short intervals; time.time() is wall-clock time and can jump if the
# system clock is adjusted.
start_time = time.perf_counter()
dummy = DummyRegressor(strategy="median")
dummy.fit(X_train, y_train)

# Predict the constant median for every test row, then stop the timer.
# NOTE: the measured span covers fitting AND prediction, although it is printed
# below under the label "Training time".
y_pred = dummy.predict(X_test)
end_time = time.perf_counter()
elapsed_time = end_time - start_time

# Performance metrics, computed on the price scale of y_test
mae = mean_absolute_error(y_test, y_pred)
# MAPE is returned as a fraction, not a percentage (e.g. 1.02 corresponds to 102 %)
mape = mean_absolute_percentage_error(y_test, y_pred)

# Print outputs
print(f"Training time = {elapsed_time:.2f} seconds")
# Every prediction equals the training median, so the first element is reported;
# index 0 is valid for any non-empty test set (index 1 fails on a one-row test set).
print(f"Median        = {y_pred[0]:.2f}")
print(f"MAE           = {mae:.2f}")
print(f"MAPE          = {mape:.2f}")
# Reference output recorded from an earlier run:
#Training time = 0.01 seconds
#Median        = 15999.00
#MAE           = 10325.47
#MAPE          = 1.02

Training time = 0.01 seconds
Median        = 15999.00
MAE           = 10325.47
MAPE          = 1.02


In [13]:
# Persist the baseline predictions alongside the test features.
# The index is reset first, so row_id is the 0-based position of each row within
# the test set; it is written out as a key for combining result files into one df.
test_results_dummy = (
    X_test
    .reset_index(drop=True)
    .assign(
        dummy_pred_log=np.log1p(y_pred),      # prediction on the log1p scale
        dummy_pred=y_pred,                    # prediction on the price scale
        row_id=lambda frame: frame.index,     # positional key after the reset
    )
)

# Write the frame without the pandas index; row_id already carries the row key
test_results_dummy.to_csv('08_results_models/Prediction_Dummy.csv', index=False)