In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from scipy.stats import randint
import time

In [3]:
#Load Documents
# The first CSV column is the row index written out by an earlier to_csv call (it is
# displayed as "Unnamed: 0" when read as data). Read it as the index so it is not kept
# as a stray data column.
file = "02_Used_Cars_Pipeline.csv"
df = pd.read_csv(file, index_col=0)
df.head()

Unnamed: 0,manufacturer,fuel,title_status,transmission,drive,grouped_color,type_grouped,condition_grouped,cylinders_cat,price_segment,manufacturer_freq,log_model_freq,log_state_freq,log_odometer,log_car_age,log_miles_age,manufacturer_marketcap,log_price
0,gmc,gas,clean,other,4wd,neutral,work,3,3,3,10598,8.2938,8.12829,10.966887,2.197225,8.887567,22,10.422013
1,chevrolet,gas,clean,other,4wd,colorful,work,3,3,2,35027,9.091332,8.12829,11.173669,2.564949,8.688917,12,10.025307
2,chevrolet,gas,clean,other,4wd,colorful,work,3,3,3,35027,9.091332,8.12829,9.860632,1.098612,9.167537,12,10.586357
3,toyota,gas,clean,other,4wd,colorful,work,3,3,3,23302,7.85205,8.12829,10.624371,1.791759,9.015031,1,10.341452
4,ford,gas,clean,automatic,rwd,neutral,work,4,2,1,44881,9.289706,8.12829,11.759793,2.302585,9.562631,8,9.615872


In [4]:
#Train Test Split
# Columns fed to the model: the numeric block is scaled, the categorical block is one-hot encoded.
numeric_features = [
    'log_odometer',
    'log_car_age',
    'log_miles_age',
    'log_state_freq',
    'log_model_freq',
    'manufacturer_marketcap',
    'condition_grouped',
    'cylinders_cat',
]
categorical_features = [
    'fuel',
    'title_status',
    'transmission',
    'drive',
    'type_grouped',
]

# Feature matrix (numeric columns first, then categorical) and the log_price target.
X = df[numeric_features + categorical_features]
y = df['log_price']

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2212,
)

In [6]:
#Choose Preprocessors
# One-hot encode the categorical columns and standardize the numeric ones.
# handle_unknown='ignore' makes a category that was absent from the fitting data encode
# as all zeros instead of raising at transform time (this can happen for rare levels
# inside a CV fold or in the test split).
# NOTE: the transformed columns come out in the order the transformers are listed here,
# i.e. all 'cat' (one-hot) columns first, then the 'num' columns.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features)
])
#Create Pipeline
# Preprocessing and the regressor are chained so that cross-validation refits the
# preprocessing on each training fold.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', RandomForestRegressor(random_state=2212))
])

In [7]:
#Pipeline
# Show the pipeline object as the cell output so its steps (preprocessing -> model) can be inspected.
pipeline

In [8]:
#Start Timer + Hyperparameters RandomSearch 
start_time = time.time()
print(start_time)
# Search space. scipy's randint(low, high) excludes `high`, so the sampled ranges are
# n_estimators in 100..250, min_samples_split in {2, 3} and min_samples_leaf in {1, 2}.
param_dist = {
    'model__n_estimators': randint(100, 251),       
    'model__max_depth': [20, 30, 40, 50],          
    'model__min_samples_split': randint(2, 4),      
    'model__min_samples_leaf': randint(1, 3),      
    'model__max_features': ['sqrt', 'log2']          
}
#Set up Random Search
# 200 sampled candidates x 3 folds = 600 fits, scored by (negated) MAE on the log_price target.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=200,
    cv=3,
    scoring='neg_mean_absolute_error',
    verbose=2,
    n_jobs=-1,
    random_state=2212
)
#Search best HPs RS + End Searching Timer
random_search.fit(X_train, y_train)

end_time = time.time()
print(end_time)
elapsed_time = end_time - start_time

#Print Outputs
# best_score_ is the negated MAE measured on the log-transformed target, so it is NOT an
# MAE in price units. Report it on the log scale, and report expm1 of it separately as an
# approximate relative (multiplicative) error rather than labelling that value "MAE".
cv_mae_log = -random_search.best_score_
print(f"Total training time: {elapsed_time/60:.2f} minutes")
print(f"Best HP:{random_search.best_params_}")
print(f"MAE CV (log scale): {cv_mae_log:.4f}")
print(f"Approx. relative error CV (expm1 of log-scale MAE): {np.expm1(cv_mae_log):.2%}")

#Train Best Model + Predict Test set
# best_estimator_ is the pipeline refit on the whole training set with the best parameters.
best_model = random_search.best_estimator_
y_pred_log = best_model.predict(X_test)
# Back-transform predictions and targets from the log scale.
# NOTE(review): expm1 assumes log_price was built with log1p - confirm against the data prep.
y_pred = np.expm1(y_pred_log)       
y_true = np.expm1(y_test)
#Results recorded from an earlier run (the 0.16 below was expm1 of the log-scale CV MAE, not an MAE in price units):
#Total training time: 56.22 minutes
#Best HP:{'model__max_depth': 50, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 226}
#MAE  CV: 0.16


1746651654.667083
Fitting 3 folds for each of 200 candidates, totalling 600 fits
[CV] END model__max_depth=30, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=3, model__n_estimators=110; total time=  38.8s
[CV] END model__max_depth=30, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=3, model__n_estimators=110; total time=  39.2s
[CV] END model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=3, model__n_estimators=182; total time= 1.3min
[CV] END model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=3, model__n_estimators=182; total time= 1.3min
[CV] END model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=3, model__n_estimators=182; total time= 1.3min
[CV] END model__max_depth=50, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=189; total ti



[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=3, model__n_estimators=238; total time=  50.7s
[CV] END model__max_depth=20, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=3, model__n_estimators=238; total time=  51.0s
[CV] END model__max_depth=50, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=247; total time=  51.0s
[CV] END model__max_depth=50, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=247; total time=  50.3s
[CV] END model__max_depth=50, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=247; total time=  50.5s
[CV] END model__max_depth=20, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=3, model__n_estimators=148; total time=  26.7s
[CV] END model__max_depth=20, model__max_features=log2, model__min_sam

In [39]:
#Print Performance Metrics
# Both metrics compare the back-transformed test targets (y_true) with the
# back-transformed predictions (y_pred).
mae, mape = (
    metric(y_true, y_pred)
    for metric in (mean_absolute_error, mean_absolute_percentage_error)
)

print(f"MAE: {mae:.2f}")
print(f"MAPE: {mape:.4f}")
#Results recorded from an earlier run (MSE is not computed by this cell):
#MAE: 1627.73
#MSE: 9388934.52
#MAPE: 0.1638

MAE: 1627.73
MAPE: 0.1638


In [33]:
#Save Results with Row_index to combine later in 1 DF
# Test features re-indexed 0..n-1, followed by the log-scale prediction, the
# back-transformed prediction, and row_id (a copy of that positional index).
test_results_1stage = (
    X_test
    .reset_index(drop=True)
    .assign(**{'1stage_pred_log': y_pred_log, '1stage_pred': y_pred})
    .assign(row_id=lambda frame: frame.index)
)
test_results_1stage.to_csv('08_results_models/Prediction_1stage.csv', index=False)


In [37]:
#Check Model
# Pull the fitted preprocessing step and the fitted forest out of the best pipeline.
fitted_preprocessor = best_model.named_steps['preprocessing']
rf_trained = best_model.named_steps['model']

#Feature Importances
importances = rf_trained.feature_importances_

#Feature Names
# The ColumnTransformer lists 'cat' before 'num', so the matrix seen by the forest has the
# one-hot columns FIRST and the numeric columns after them. The names must be concatenated
# in that same order, otherwise every importance is attached to the wrong feature.
onehot_features = fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
numerical_features = numeric_features
all_feature_names = np.concatenate([onehot_features, numerical_features])

# Guard against the name list and the model's input width drifting apart.
if len(all_feature_names) != len(importances):
    raise ValueError(
        f"Feature name count ({len(all_feature_names)}) does not match "
        f"importance count ({len(importances)})"
    )

#Create DataFrame
feature_importance_df = pd.DataFrame({
    'feature': all_feature_names,
    'importance_1stage': importances
})
#Save Results
# First file keeps the model's column order; second file is sorted by descending importance.
feature_importance_df.to_csv("08_feature_importance/Feature_Importance_Regression.csv",index=False)
feature_importance_df = feature_importance_df.sort_values(by='importance_1stage', ascending=False)
feature_importance_df.to_csv("08_feature_importance/Sorted_Importance_Regression.csv",index=False)