In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import xgboost as xgb
import seaborn as sns
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)

## Read CSV
data_source = "../data/vehicle_data/final_bat_auction_data.csv"
df = pd.read_csv(data_source)

print(f'Data ingested -- Found {len(df)} Rows')

  from pandas import MultiIndex, Int64Index


Data ingested -- Found 53343 Rows


In [2]:
df.drop(["Mileage Notes", "Details"], inplace=True, axis=1)

# rename columns to lowercase for easability
df.columns = ['make', 'model', 'year', 'miles', 'final bid price', 'color',
              'auction_year', 'engine_size', 'cylinders']

# Drop any leftover nans
df = df.dropna()
print("Number of unique Makes:", df.make.nunique())

# Check nan values again
print('\nAfter:\n', df.isna().sum())

# name value to predict
target_feature = "final bid price"

y = df[target_feature]
X = df.drop(target_feature, axis=1)
# X.head()

Number of unique Makes: 74

After:
 make               0
model              0
year               0
miles              0
final bid price    0
color              0
auction_year       0
engine_size        0
cylinders          0
dtype: int64


In [3]:
## Find Numerical & Categorical Columns
# Get numerical and categorical feature columns
print('\nCalculating Numerical and Categorical Features...')
print(f'There are {len(X.columns)} total columns.')

numerical_features = X.select_dtypes(include='number').columns.tolist()
print(f'There are {len(numerical_features)} numerical features.')

categorical_features = X.select_dtypes(exclude='number').columns.tolist()
print(f'There are {len(categorical_features)} categorical features.', '\n')

print(X.columns)

## Pre-Process Data

print('Fetching Preprocessing Pipeline...\n')

numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ('scale', MinMaxScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features)
])


# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

model = XGBRegressor(n_estimators=100, random_state=42)

model_pipeline = Pipeline(steps=[
        ('preprocessor', full_processor),
        ('model', model)
    ])



Calculating Numerical and Categorical Features...
There are 8 total columns.
There are 5 numerical features.
There are 3 categorical features. 

Index(['make', 'model', 'year', 'miles', 'color', 'auction_year',
       'engine_size', 'cylinders'],
      dtype='object')
Fetching Preprocessing Pipeline...



In [4]:
model = XGBRegressor(random_state=42, learning_rate=0.5, n_estimators=400, max_depth=5)


model_pipeline = Pipeline(steps=[
        ('preprocessor', full_processor),
        ('model', model)
    ])

hist = model_pipeline.fit(X, y)

In [9]:
# Custom Predictions
tst_data = ["subaru", "wrx sti", 2016, 55000, "blue", 2022, 2.5, 4]
columns = ['make', 'model', "year", 'miles', 'color', 'auction_year', 'engine_size', 'cylinders']
tst = pd.DataFrame()
for idx, col in enumerate(columns):
    tst[col] = pd.Series(tst_data[idx])
preds = model_pipeline.predict(tst)
preds

array([38067.54], dtype=float32)

In [31]:
from sklearn.metrics import mean_squared_error as MSE
# .MSE(X, y)
r2 = hist.score(X, y)
r2

# MSE = (1 - r2) * np.var(y)
hist['model']

TypeError: 'XGBRegressor' object is not subscriptable

In [10]:
import joblib

# Save the XGB model first:
model_pipeline.named_steps['model'].save_model('../api/models/xgb_model.h5')

# Save the preprocessor next
joblib.dump(model_pipeline.named_steps['preprocessor'], '../api/models/preprocessor.joblib')


# # This hack allows us to save the sklearn pipeline:
# model_pipeline.named_steps['model'] = None

# # Finally, save the pipeline:
# joblib.dump(model_pipeline, './models/sklearn_pipeline.joblib')

# del model_pipeline

['../api/models/preprocessor.joblib']