In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.metrics import confusion_matrix, classification_report
from scipy.stats import randint
import os
import time

In [84]:
#Load Documents
# Read the pre-split High-segment train and test CSVs into DataFrames.
# `train` / `test` hold the file paths, `traindf` / `testdf` the loaded frames.
train, test = (
    "05_segment_train_test_sets/Train_High_Segment.csv",
    "05_segment_train_test_sets/Test_High_Segment.csv",
)
traindf, testdf = (pd.read_csv(csv_path) for csv_path in (train, test))

In [52]:
#Select model inputs and target (the train/test split itself is already on disk)
# Columns that go through StandardScaler.
# NOTE(review): 'condition_grouped' and 'cylinders_cat' are treated as numeric
# here — presumably ordinally encoded upstream; confirm.
numeric_features = [
    'log_odometer', 'log_car_age', 'log_miles_age', 'log_state_freq',
    'log_model_freq', 'manufacturer_marketcap', 'condition_grouped',
    'cylinders_cat',
]
# Columns that go through OneHotEncoder.
categorical_features = ['fuel', 'title_status', 'transmission', 'drive', 'type_grouped']

# Same column order for both frames: numeric first, then categorical.
feature_columns = numeric_features + categorical_features
target_column = 'log_price'

X_train, y_train = traindf[feature_columns], traindf[target_column]
X_test, y_test = testdf[feature_columns], testdf[target_column]

In [53]:
#Choose Preprocessors
# One-hot encode the categorical columns and standardize the numeric ones.
# handle_unknown='ignore': a category that is absent from the data the encoder
# was fitted on (a CV training fold, or the training set when scoring the test
# set) is encoded as all zeros instead of raising during transform.
# The transformed matrix follows the transformer order below:
# one-hot ('cat') columns first, then the scaled numeric ('num') columns.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features)
])

#Create Pipeline
# Preprocessing is fitted inside the pipeline, so during cross-validation it
# only ever sees the training part of each fold.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', RandomForestRegressor(random_state=2212))
])

In [54]:
#Start Timer + Hyperparameters RandomSearch
start_time = time.time()
print(start_time)
# Search space for the random forest step ('model__' prefix targets the
# pipeline's 'model' step). scipy's randint(low, high) excludes `high`.
param_dist = {
    'model__n_estimators': randint(100, 251),       # 100..250 trees
    'model__max_depth': [20, 30, 40, 50],
    'model__min_samples_split': randint(2, 4),      # 2 or 3
    'model__min_samples_leaf': randint(1, 3),       # 1 or 2
    'model__max_features': ['sqrt', 'log2']
}
#Set up Random Search
# 200 sampled candidates x 3 folds = 600 fits, scored by (negated) MAE.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=200,
    cv=3,
    scoring='neg_mean_absolute_error',
    verbose=2,
    n_jobs=-1,
    random_state=2212
)
#Search best HPs RS + End Searching Timer
random_search.fit(X_train, y_train)

end_time = time.time()
print(end_time)
elapsed_time = end_time - start_time

#Print Outputs
print(f"Total training time: {elapsed_time/60:.2f} minutes")
print(f"Best HP:{random_search.best_params_}")
# y_train is log_price, so best_score_ is the negated MAE in log units.
# expm1 of a log-scale MAE is NOT a MAE in price units; it is roughly the
# typical relative error, so it is reported under that label and must not be
# compared with the price-scale test MAE.
cv_mae_log = -random_search.best_score_
print(f"MAE  CV (log scale): {cv_mae_log:.4f}")
print(f"Approx. relative error CV: {np.expm1(cv_mae_log):.4f}")

#Train Best Model + Predict Test set
# best_estimator_ is the pipeline with the best sampled hyperparameters; with
# RandomizedSearchCV's default refit=True it has already been refit on all of
# X_train, so no extra fit call is needed here.
best_model = random_search.best_estimator_
y_pred_log = best_model.predict(X_test)
# Predictions and targets are on the log_price scale; expm1 maps both back
# before price-scale metrics are computed.
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)
#Output recorded from a previous run of this cell:
#Total training time: 9.83 minutes
#Best HP:{'model__max_depth': 50, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 226}
#MAE  CV: 0.04



1746690323.076849
Fitting 3 folds for each of 200 candidates, totalling 600 fits
[CV] END model__max_depth=30, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=3, model__n_estimators=110; total time=   5.2s
[CV] END model__max_depth=30, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=3, model__n_estimators=110; total time=   5.2s
[CV] END model__max_depth=50, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=189; total time=   9.1s
[CV] END model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=3, model__n_estimators=182; total time=   9.3s
[CV] END model__max_depth=50, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=189; total time=   9.2s
[CV] END model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=3, model__n_estimators=182; total ti

In [55]:
#Print Performance Metrics
# Both metrics are computed on the back-transformed (expm1) values y_true / y_pred.
# MAPE is printed as a fraction, not a percentage.
mae, mape = (
    metric(y_true, y_pred)
    for metric in (mean_absolute_error, mean_absolute_percentage_error)
)

for metric_label, metric_value, number_format in (
    ("MAE", mae, ".2f"),
    ("MAPE", mape, ".4f"),
):
    print(f"{metric_label} High Segment: {metric_value:{number_format}}")

#Recorded results:
#MAE High Segment: 1639.21
#MAPE High Segment: 0.0805

MAE High Segment: 1639.21
MSE High Segment: 14741290.48
MAPE High Segment: 0.0805


In [91]:
#Save Documents
# Attach the log-scale and back-transformed predictions to the test frame
# (row order matches because X_test was taken directly from testdf).
testdf['high_pred_log'] = y_pred_log
testdf['high_pred'] = y_pred
# to_csv does not create missing directories; make sure the target exists.
os.makedirs("08_results_models", exist_ok=True)
testdf.to_csv("08_results_models/Prediction_High.csv",index=False)

In [93]:
#Check Model
# Fitted random forest from the best pipeline.
rf_trained = best_model.named_steps['model']

#Feature Importances
# One value per column of the preprocessed matrix, in that matrix's column order.
importances = rf_trained.feature_importances_

#Feature Names
onehot_features = best_model.named_steps['preprocessing'].named_transformers_['cat'].get_feature_names_out(categorical_features)
numerical_features = numeric_features
# The ColumnTransformer lists 'cat' before 'num', so the forest saw the one-hot
# columns first and the numeric columns last. The names must be concatenated in
# that same order, otherwise each importance is attached to the wrong feature.
all_feature_names = np.concatenate([onehot_features, numerical_features])
assert len(all_feature_names) == len(importances), (
    "feature names and importances differ in length"
)

#Create DataFrame
feature_importance_df = pd.DataFrame({
    'feature': all_feature_names,
    'importance_high': importances
})

# to_csv does not create missing directories; make sure the target exists.
os.makedirs("08_feature_importance", exist_ok=True)
# Unsorted file keeps the model's column order; the second file is ranked.
feature_importance_df.to_csv("08_feature_importance/Feature_Importance_High.csv",index=False)
feature_importance_df = feature_importance_df.sort_values(by='importance_high', ascending=False)
feature_importance_df.to_csv("08_feature_importance/Sorted_Importance_High.csv",index=False)