In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, mean_absolute_error, mean_absolute_percentage_error, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from scipy.stats import randint
import os
import time

In [19]:
# Read the prepared used-cars table from disk and preview its first rows
file = "02_Used_Cars_Pipeline.csv"
df = pd.read_csv(filepath_or_buffer=file)
df.head(n=5)

Unnamed: 0,manufacturer,fuel,title_status,transmission,drive,grouped_color,type_grouped,condition_grouped,cylinders_cat,price_segment,manufacturer_freq,log_model_freq,log_state_freq,log_odometer,log_car_age,log_miles_age,manufacturer_marketcap,log_price
0,gmc,gas,clean,other,4wd,neutral,work,3,3,3,10598,8.2938,8.12829,10.966887,2.197225,8.887567,22,10.422013
1,chevrolet,gas,clean,other,4wd,colorful,work,3,3,2,35027,9.091332,8.12829,11.173669,2.564949,8.688917,12,10.025307
2,chevrolet,gas,clean,other,4wd,colorful,work,3,3,3,35027,9.091332,8.12829,9.860632,1.098612,9.167537,12,10.586357
3,toyota,gas,clean,other,4wd,colorful,work,3,3,3,23302,7.85205,8.12829,10.624371,1.791759,9.015031,1,10.341452
4,ford,gas,clean,automatic,rwd,neutral,work,4,2,1,44881,9.289706,8.12829,11.759793,2.302585,9.562631,8,9.615872


In [20]:
# Select model inputs and target, then hold out 20% of the rows for testing
numeric_features = [
    'log_odometer',
    'log_car_age',
    'log_miles_age',
    'log_state_freq',
    'log_model_freq',
    'manufacturer_marketcap',
    'condition_grouped',
    'cylinders_cat',
]
categorical_features = [
    'fuel',
    'title_status',
    'transmission',
    'drive',
    'type_grouped',
]

# Target: the price segment label; inputs: numeric columns followed by categorical ones
X = df[numeric_features + categorical_features]
y = df['price_segment']

# Fixed seed so the same rows land in train/test on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2212,
)

In [21]:
#Choose Preprocessors
# One-hot encode the categorical columns and standardise the numeric ones.
# handle_unknown='ignore' makes the encoder emit an all-zero block for a
# category it did not see during fit instead of raising, so a rare level that
# is absent from a CV training fold (or from the training split) cannot abort
# scoring or prediction. Encoding of categories seen during fit is unchanged.
# NOTE: the transformer order ('cat' first, then 'num') determines the column
# order of the matrix handed to the model.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features)
])
#Create Pipeline
# Preprocessing and classifier are chained so that the transformers are
# fitted as part of every pipeline fit.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', RandomForestClassifier(random_state=2212))
])

In [23]:
# Hyperparameter tuning: randomized search over the random-forest settings,
# bracketed by wall-clock timestamps so the total tuning time can be reported.
start_time = time.time()
print(start_time)

# Search space. scipy's randint(low, high) excludes `high`, so the integer
# ranges below are 100..250, 2..3 and 1..2 respectively.
param_dist = dict(
    model__n_estimators=randint(100, 251),
    model__max_depth=[20, 30, 40, 50],
    model__min_samples_split=randint(2, 4),
    model__min_samples_leaf=randint(1, 3),
    model__max_features=['sqrt', 'log2'],
)

# 200 sampled candidates, each scored by macro-F1 on 3 shuffled stratified folds
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=200,
    scoring='f1_macro',
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=2212),
    n_jobs=-1,
    verbose=2,
    random_state=2212,
)

# Run the search on the training split, then stop the clock
random_search.fit(X_train, y_train)
end_time = time.time()
print(end_time)

# Report duration, winning configuration and its cross-validated score
elapsed_time = end_time - start_time
print(f"Total tuning time: {elapsed_time/60:.2f} minutes")
print(f"Best HP:{random_search.best_params_}")
print(f"Best CV F1_macro: {random_search.best_score_:.2f}")

# Take the best pipeline found by the search and predict the hold-out segments
best_model = random_search.best_estimator_
y_segment_pred = best_model.predict(X_test)

# Results recorded from an earlier run, kept for reference:
#Total tuning time: 44.74 minutes
#Best HP:{'model__max_depth': 50, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 3, 'model__n_estimators': 233}
#Best CV F1_macro: 0.92


1746646792.323491
Fitting 3 folds for each of 200 candidates, totalling 600 fits
[CV] END model__max_depth=30, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=3, model__n_estimators=110; total time=  27.8s
[CV] END model__max_depth=30, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=3, model__n_estimators=110; total time=  27.9s
[CV] END model__max_depth=50, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=189; total time=  45.4s
[CV] END model__max_depth=50, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=189; total time=  45.4s
[CV] END model__max_depth=50, model__max_features=log2, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=189; total time=  45.7s
[CV] END model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=3, model__n_estimators=182; total ti



[CV] END model__max_depth=50, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=114; total time=  21.7s
[CV] END model__max_depth=50, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=114; total time=  21.8s
[CV] END model__max_depth=40, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=113; total time=  24.3s
[CV] END model__max_depth=40, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=181; total time=  33.6s
[CV] END model__max_depth=40, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=113; total time=  24.3s
[CV] END model__max_depth=40, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=181; total time=  33.2s
[CV] END model__max_depth=40, model__max_features=sqrt, model__min_sam

In [94]:
# Evaluate the hold-out predictions: confusion matrix plus per-class report
segment_confusion = confusion_matrix(y_test, y_segment_pred)
segment_report = classification_report(y_test, y_segment_pred, digits=3)

print("Confusion Matrix")
print(segment_confusion)
print('Classification report')
print(segment_report)

# Output recorded from an earlier run, kept for reference:
#Confusion Matrix
#[[25506   952    36]
# [ 1173 13966   468]
# [   32   559  9676]]
#Classification report
#              precision    recall  f1-score   support
#
#           1      0.955     0.963     0.959     26494
#           2      0.902     0.895     0.899     15607
#           3      0.950     0.942     0.946     10267
#
#    accuracy                          0.939     52368
#   macro avg      0.936     0.933     0.935     52368
#weighted avg      0.938     0.939     0.938     52368

Confusion Matrix
[[25506   952    36]
 [ 1173 13966   468]
 [   32   559  9676]]
Classification report
              precision    recall  f1-score   support

           1      0.955     0.963     0.959     26494
           2      0.902     0.895     0.899     15607
           3      0.950     0.942     0.946     10267

    accuracy                          0.939     52368
   macro avg      0.936     0.933     0.935     52368
weighted avg      0.938     0.939     0.938     52368



Creating the Test and Training Segment Sets

In [77]:
# Assemble the hold-out frame: the test features re-indexed 0..n-1, a row id
# equal to that new position, predicted and true segment, and the price on the
# log scale and back-transformed with expm1.
# (assumes log_price was produced with log1p upstream — TODO confirm)
test_data = (
    X_test
    .reset_index(drop=True)
    .assign(
        row_id=lambda frame: frame.index,
        segment_pred=y_segment_pred,
        # y_test / df values are looked up by the original index and then
        # re-indexed positionally so they line up with the reset features
        segment_true=y_test.reset_index(drop=True),
        log_price=df.loc[y_test.index, 'log_price'].reset_index(drop=True),
        price_true=lambda frame: np.expm1(frame['log_price']),
    )
)



In [79]:
# Split the hold-out frame by the *predicted* segment label (1, 2, 3)
predicted_segment = test_data['segment_pred']
test_dataset_low = test_data.loc[predicted_segment.eq(1)].copy()
test_dataset_mid = test_data.loc[predicted_segment.eq(2)].copy()
test_dataset_high = test_data.loc[predicted_segment.eq(3)].copy()

In [81]:
#Save Segment Testsets to CSV
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (no-op when it is already there).
os.makedirs("05_segment_train_test_sets", exist_ok=True)
# index=False: the positional row id is already stored in the 'row_id' column
test_dataset_low.to_csv("05_segment_train_test_sets/Test_Low_Segment.csv",index=False)
test_dataset_mid.to_csv("05_segment_train_test_sets/Test_Mid_Segment.csv",index=False)
test_dataset_high.to_csv("05_segment_train_test_sets/Test_High_Segment.csv",index=False)

Training set

In [41]:
# Build the training frame: training features plus, looked up from df by the
# original row index, the log price, its expm1 back-transform and the segment label.
# (assumes log_price was produced with log1p upstream — TODO confirm)
segment_trainingset = X_train.assign(
    log_price=df.loc[X_train.index, 'log_price'],
    price=lambda frame: np.expm1(frame['log_price']),
    price_segment=df.loc[X_train.index, 'price_segment'],
)

In [42]:
# Overlapping price bands for the per-segment training sets.
# Per the original note the bands are widened by 3000 on each side
# (presumably around cut points of 16000 and 30000 — confirm), so rows with a
# price in (13000, 19000] or (27000, 33000] appear in two training sets.
train_price = segment_trainingset['price']
train_dataset_low = segment_trainingset.loc[train_price.le(19000)].copy()
train_dataset_mid = segment_trainingset.loc[train_price.gt(13000) & train_price.le(33000)].copy()
train_dataset_high = segment_trainingset.loc[train_price.gt(27000)].copy()

In [43]:
#Save Segment Training Sets to CSV
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (no-op when it is already there).
os.makedirs("05_segment_train_test_sets", exist_ok=True)
# The index is written on purpose here (default index=True): it carries the
# original df row label of each training row as the first, unnamed column.
# NOTE(review): the test-set files are written with index=False — confirm
# that downstream readers expect this difference.
train_dataset_low.to_csv("05_segment_train_test_sets/Train_Low_Segment.csv")
train_dataset_mid.to_csv("05_segment_train_test_sets/Train_Mid_Segment.csv")
train_dataset_high.to_csv("05_segment_train_test_sets/Train_High_Segment.csv")

Feature Importance

In [91]:
#Check Model
# Pull the fitted random forest and the fitted preprocessor out of the tuned pipeline
rf_trained = best_model.named_steps['model']
fitted_preprocessor = best_model.named_steps['preprocessing']

#Feature importances
# One value per column of the matrix the forest was trained on
importances = rf_trained.feature_importances_

#Feature Names
onehot_features = fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
numerical_features = numeric_features
# The names must follow the ColumnTransformer's output order: it was declared
# with the 'cat' (one-hot) transformer first and the 'num' transformer second,
# so the one-hot columns come BEFORE the numeric ones. Concatenating them the
# other way round attaches every importance to the wrong feature name.
all_feature_names = np.concatenate([onehot_features, numerical_features])

# Guard against a silent mismatch between names and importances
if len(all_feature_names) != len(importances):
    raise ValueError(
        f"Feature name count ({len(all_feature_names)}) does not match "
        f"importance count ({len(importances)})"
    )

#Create DataFrame
feature_importance_df = pd.DataFrame({
    'feature': all_feature_names,
    'importance_Classification': importances
})

# to_csv does not create missing parent directories
os.makedirs("08_feature_importance", exist_ok=True)

# Save once in model column order, then once sorted by descending importance
feature_importance_df.to_csv("08_feature_importance/Feature_Importance_Classification.csv",index=False)
feature_importance_df = feature_importance_df.sort_values(by='importance_Classification', ascending=False)
feature_importance_df.to_csv("08_feature_importance/Sorted_Importance_Classification.csv",index=False)